Changed "llama" to "jarvis"
This commit is contained in:
parent
4dfbcf9646
commit
52ab617954
372 changed files with 8788 additions and 8788 deletions
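The diff below is mechanical: every occurrence of the old name is rewritten to the new one across CI scripts, Dockerfiles, RPM specs, Nix expressions, shell tools, and issue templates. A rename of this scale is usually scripted rather than edited by hand; the commit does not include such a script, but a minimal sketch is shown below (the `git grep`/`sed` pipeline and the casing rules are assumptions for illustration, not taken from the commit, and file renames would still be handled separately).

#!/bin/bash
# Hypothetical rename sketch (not part of this commit): rewrite the three
# common casings of "llama" in every tracked text file, then review the
# result with `git diff` before committing.
git grep -lI -e 'llama' -e 'Llama' -e 'LLAMA' | while read -r f; do
    sed -i -e 's/llama/jarvis/g' -e 's/Llama/Jarvis/g' -e 's/LLAMA/JARVIS/g' "$f"
done
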
@@ -7,16 +7,16 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
 checkout scm // Clone the repo on Runner
 }
 }
-stage('Compiling llama.cpp'){
+stage('Compiling jarvis.cpp'){
 sh'''#!/bin/bash
-make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
+make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling jarvis for RISC-V
 '''
 }
-stage('Running llama.cpp'){
+stage('Running jarvis.cpp'){
 sh'''#!/bin/bash
 module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
-qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
+qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./jarvis-cli -m /home/alitariq/codejarvis-7b.Q4_K_M.gguf -p "Anything" -n 9 > jarvis_log.txt # Running jarvis.cpp on vector qemu-riscv64
-cat llama_log.txt # Printing results
+cat jarvis_log.txt # Printing results
 '''
 }
 }

@@ -26,7 +26,7 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
 export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
 fi && \
-cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+cmake -B build -DGGML_CUDA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
 cmake --build build --config Release -j$(nproc) && \
 cp build/bin/* .

@@ -19,7 +19,7 @@ WORKDIR /app

 COPY . .

-RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+RUN cmake -B build -DGGML_MUSA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
 cmake --build build --config Release -j$(nproc) && \
 cp build/bin/* .

@@ -9,7 +9,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# List from https://github.com/ggerganov/jarvis.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 ARG ROCM_DOCKER_ARCH="\
 gfx803 \

@@ -41,7 +41,7 @@ ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

 # Enable cURL
-ENV LLAMA_CURL=1
+ENV JARVIS_CURL=1
 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev

@@ -15,7 +15,7 @@ WORKDIR /app

 COPY . .

-ENV LLAMA_CURL=1
+ENV JARVIS_CURL=1


 RUN make -j$(nproc)

@@ -23,11 +23,11 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 RUN echo "Building with static libs" && \
 source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
 cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
-cmake --build build --config Release --target llama-cli
+cmake --build build --config Release --target jarvis-cli

 # TODO: use image with NNRT
 FROM cosdt/cann:$ASCEND_VERSION AS runtime
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli

 ENV LC_ALL=C.utf8

@@ -41,4 +41,4 @@ ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
 ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
 ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}

-ENTRYPOINT ["/llama-cli" ]
+ENTRYPOINT ["/jarvis-cli" ]

@@ -23,7 +23,7 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
 export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
 fi && \
 cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-cmake --build build --config Release --target llama-cli -j$(nproc)
+cmake --build build --config Release --target jarvis-cli -j$(nproc)

 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

@@ -31,7 +31,7 @@ RUN apt-get update && \
 apt-get install -y libgomp1

 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/src/libjarvis.so /libjarvis.so
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli

-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

@@ -17,12 +17,12 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
 echo "Building with static libs" && \
 cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
 ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
-cmake --build build --config Release --target llama-cli
+cmake --build build --config Release --target jarvis-cli

 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

@@ -16,7 +16,7 @@ WORKDIR /app
 COPY . .

 RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-cmake --build build --config Release --target llama-cli -j$(nproc)
+cmake --build build --config Release --target jarvis-cli -j$(nproc)

 FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime

@@ -24,7 +24,7 @@ RUN apt-get update && \
 apt-get install -y libgomp1

 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/src/libjarvis.so /libjarvis.so
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli

-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

@@ -9,7 +9,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# List from https://github.com/ggerganov/jarvis.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 ARG ROCM_DOCKER_ARCH="\
 gfx803 \

@@ -40,6 +40,6 @@ ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

-RUN make -j$(nproc) llama-cli
+RUN make -j$(nproc) jarvis-cli

-ENTRYPOINT [ "/app/llama-cli" ]
+ENTRYPOINT [ "/app/jarvis-cli" ]

@@ -15,13 +15,13 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 WORKDIR /app
 COPY . .
 RUN cmake -B build -DGGML_VULKAN=1 && \
-cmake --build build --config Release --target llama-cli
+cmake --build build --config Release --target jarvis-cli

 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/llama-cli /llama-cli && \
+RUN cp /app/build/bin/jarvis-cli /jarvis-cli && \
 rm -rf /app

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

@@ -9,15 +9,15 @@ WORKDIR /app

 COPY . .

-RUN make -j$(nproc) llama-cli
+RUN make -j$(nproc) jarvis-cli

 FROM ubuntu:$UBUNTU_VERSION AS runtime

 RUN apt-get update && \
 apt-get install -y libgomp1

-COPY --from=build /app/llama-cli /llama-cli
+COPY --from=build /app/jarvis-cli /jarvis-cli

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

@@ -3,7 +3,7 @@
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

-# Notes for llama.cpp:
+# Notes for jarvis.cpp:
 # 1. Tags are currently based on hash - which will not sort asciibetically.
 # We need to declare standard versioning if people want to sort latest releases.
 # 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.

@@ -12,44 +12,44 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 # It is up to the user to install the correct vendor-specific support.

-Name: llama.cpp-cuda
+Name: jarvis.cpp-cuda
 Version: %( date "+%%Y%%m%%d" )
 Release: 1%{?dist}
 Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
 License: MIT
-Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+Source0: https://github.com/ggerganov/jarvis.cpp/archive/refs/heads/master.tar.gz
 BuildRequires: coreutils make gcc-c++ git cuda-toolkit
 Requires: cuda-toolkit
-URL: https://github.com/ggerganov/llama.cpp
+URL: https://github.com/ggerganov/jarvis.cpp

 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0

 %description
-CPU inference for Meta's Lllama2 models using default options.
+CPU inference for Meta's Ljarvis2 models using default options.

 %prep
-%setup -n llama.cpp-master
+%setup -n jarvis.cpp-master

 %build
 make -j GGML_CUDA=1

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p jarvis-cli %{buildroot}%{_bindir}/jarvis-cuda-cli
-cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
+cp -p jarvis-server %{buildroot}%{_bindir}/jarvis-cuda-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
+cp -p jarvis-simple %{buildroot}%{_bindir}/jarvis-cuda-simple

 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/jarviscuda.service
 [Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
+Description=Jarvis.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target

 [Service]
 Type=simple
-EnvironmentFile=/etc/sysconfig/llama
+EnvironmentFile=/etc/sysconfig/jarvis
-ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
+ExecStart=/usr/bin/jarvis-cuda-server $JARVIS_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -58,8 +58,8 @@ WantedBy=default.target
 EOF

 mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/jarvis
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+JARVIS_ARGS="-m /opt/jarvis2/ggml-model-f32.bin"
 EOF

 %clean

@@ -67,11 +67,11 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llama-cuda-cli
+%{_bindir}/jarvis-cuda-cli
-%{_bindir}/llama-cuda-server
+%{_bindir}/jarvis-cuda-server
-%{_bindir}/llama-cuda-simple
+%{_bindir}/jarvis-cuda-simple
-/usr/lib/systemd/system/llamacuda.service
+/usr/lib/systemd/system/jarviscuda.service
-%config /etc/sysconfig/llama
+%config /etc/sysconfig/jarvis

 %pre

@@ -3,7 +3,7 @@
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

-# Notes for llama.cpp:
+# Notes for jarvis.cpp:
 # 1. Tags are currently based on hash - which will not sort asciibetically.
 # We need to declare standard versioning if people want to sort latest releases.
 # In the meantime, YYYYMMDD format will be used.

@@ -13,45 +13,45 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 # It is up to the user to install the correct vendor-specific support.

-Name: llama.cpp
+Name: jarvis.cpp
 Version: %( date "+%%Y%%m%%d" )
 Release: 1%{?dist}
 Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
 License: MIT
-Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+Source0: https://github.com/ggerganov/jarvis.cpp/archive/refs/heads/master.tar.gz
 BuildRequires: coreutils make gcc-c++ git libstdc++-devel
 Requires: libstdc++
-URL: https://github.com/ggerganov/llama.cpp
+URL: https://github.com/ggerganov/jarvis.cpp

 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0

 %description
-CPU inference for Meta's Lllama2 models using default options.
+CPU inference for Meta's Ljarvis2 models using default options.
 Models are not included in this package and must be downloaded separately.

 %prep
-%setup -n llama.cpp-master
+%setup -n jarvis.cpp-master

 %build
 make -j

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p jarvis-cli %{buildroot}%{_bindir}/jarvis-cli
-cp -p llama-server %{buildroot}%{_bindir}/llama-server
+cp -p jarvis-server %{buildroot}%{_bindir}/jarvis-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
+cp -p jarvis-simple %{buildroot}%{_bindir}/jarvis-simple

 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/jarvis.service
 [Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
+Description=Jarvis.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target

 [Service]
 Type=simple
-EnvironmentFile=/etc/sysconfig/llama
+EnvironmentFile=/etc/sysconfig/jarvis
-ExecStart=/usr/bin/llama-server $LLAMA_ARGS
+ExecStart=/usr/bin/jarvis-server $JARVIS_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -60,8 +60,8 @@ WantedBy=default.target
 EOF

 mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/jarvis
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+JARVIS_ARGS="-m /opt/jarvis2/ggml-model-f32.bin"
 EOF

 %clean

@@ -69,11 +69,11 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llama-cli
+%{_bindir}/jarvis-cli
-%{_bindir}/llama-server
+%{_bindir}/jarvis-server
-%{_bindir}/llama-simple
+%{_bindir}/jarvis-simple
-/usr/lib/systemd/system/llama.service
+/usr/lib/systemd/system/jarvis.service
-%config /etc/sysconfig/llama
+%config /etc/sysconfig/jarvis

 %pre

@@ -22,8 +22,8 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
 export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
 fi && \
-cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+cmake -B build -DGGML_CUDA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-cmake --build build --config Release --target llama-server -j$(nproc)
+cmake --build build --config Release --target jarvis-server -j$(nproc)

 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

@@ -31,12 +31,12 @@ RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev libgomp1 curl

 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/src/libjarvis.so /libjarvis.so
-COPY --from=build /app/build/bin/llama-server /llama-server
+COPY --from=build /app/build/bin/jarvis-server /jarvis-server

 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

@@ -15,20 +15,20 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
 export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
 fi && \
 echo "Building with dynamic libs" && \
-cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DJARVIS_CURL=ON ${OPT_SYCL_F16} && \
-cmake --build build --config Release --target llama-server
+cmake --build build --config Release --target jarvis-server

 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev curl

-COPY --from=build /app/build/bin/llama-server /llama-server
+COPY --from=build /app/build/bin/jarvis-server /jarvis-server

 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

@@ -15,8 +15,8 @@ WORKDIR /app

 COPY . .

-RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+RUN cmake -B build -DGGML_MUSA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-cmake --build build --config Release --target llama-server -j$(nproc)
+cmake --build build --config Release --target jarvis-server -j$(nproc)

 FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime

@@ -24,12 +24,12 @@ RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev libgomp1 curl

 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/src/libjarvis.so /libjarvis.so
-COPY --from=build /app/build/bin/llama-server /llama-server
+COPY --from=build /app/build/bin/jarvis-server /jarvis-server

 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

@@ -9,7 +9,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# List from https://github.com/ggerganov/jarvis.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 ARG ROCM_DOCKER_ARCH="\
 gfx803 \

@@ -40,15 +40,15 @@ ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0

 # Enable cURL
-ENV LLAMA_CURL=1
+ENV JARVIS_CURL=1
 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev curl

-RUN make -j$(nproc) llama-server
+RUN make -j$(nproc) jarvis-server

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

-ENTRYPOINT [ "/app/llama-server" ]
+ENTRYPOINT [ "/app/jarvis-server" ]

@@ -14,18 +14,18 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
+RUN cmake -B build -DGGML_VULKAN=1 -DJARVIS_CURL=1 && \
-cmake --build build --config Release --target llama-server
+cmake --build build --config Release --target jarvis-server

 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/llama-server /llama-server && \
+RUN cp /app/build/bin/jarvis-server /jarvis-server && \
 rm -rf /app

 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

@@ -9,21 +9,21 @@ WORKDIR /app

 COPY . .

-ENV LLAMA_CURL=1
+ENV JARVIS_CURL=1

-RUN make -j$(nproc) llama-server
+RUN make -j$(nproc) jarvis-server

 FROM ubuntu:$UBUNTU_VERSION AS runtime

 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev libgomp1 curl

-COPY --from=build /app/llama-server /llama-server
+COPY --from=build /app/jarvis-server /jarvis-server

 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

@@ -6,10 +6,10 @@
 let
 inherit (config.packages) default;
 binaries = [
-"llama-cli"
+"jarvis-cli"
-"llama-embedding"
+"jarvis-embedding"
-"llama-server"
+"jarvis-server"
-"llama-quantize"
+"jarvis-quantize"
 ];
 mkApp = name: {
 type = "app";

@@ -2,14 +2,14 @@
 lib,
 dockerTools,
 buildEnv,
-llama-cpp,
+jarvis-cpp,
 interactive ? true,
 coreutils,
 }:

 # A tar that can be fed into `docker load`:
 #
-# $ nix build .#llamaPackages.docker
+# $ nix build .#jarvisPackages.docker
 # $ docker load < result

 # For details and variations cf.

@@ -19,16 +19,16 @@

 # Approximate (compressed) sizes, at the time of writing, are:
 #
-# .#llamaPackages.docker: 125M;
+# .#jarvisPackages.docker: 125M;
-# .#llamaPackagesCuda.docker: 537M;
+# .#jarvisPackagesCuda.docker: 537M;
-# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.
+# .#legacyPackages.aarch64-linux.jarvisPackagesXavier.docker: 415M.

 dockerTools.buildLayeredImage {
-name = llama-cpp.pname;
+name = jarvis-cpp.pname;
 tag = "latest";

 contents =
-[ llama-cpp ]
+[ jarvis-cpp ]
 ++ lib.optionals interactive [
 coreutils
 dockerTools.binSh

@@ -11,10 +11,10 @@
 {
 legacyPackages =
 let
-caps.llamaPackagesXavier = "7.2";
+caps.jarvisPackagesXavier = "7.2";
-caps.llamaPackagesOrin = "8.7";
+caps.jarvisPackagesOrin = "8.7";
-caps.llamaPackagesTX2 = "6.2";
+caps.jarvisPackagesTX2 = "6.2";
-caps.llamaPackagesNano = "5.3";
+caps.jarvisPackagesNano = "5.3";

 pkgsFor =
 cap:

@@ -31,9 +31,9 @@
 builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;

 packages = lib.optionalAttrs (system == "aarch64-linux") {
-jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
+jetson-xavier = config.legacyPackages.jarvisPackagesXavier.jarvis-cpp;
-jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
+jetson-orin = config.legacyPackages.jarvisPackagesOrin.jarvis-cpp;
-jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
+jetson-nano = config.legacyPackages.jarvisPackagesNano.jarvis-cpp;
 };
 };
 }

@@ -1,6 +1,6 @@
 {
 lib,
-llamaVersion,
+jarvisVersion,
 numpy,
 tqdm,
 sentencepiece,

@@ -12,7 +12,7 @@

 buildPythonPackage {
 pname = "gguf";
-version = llamaVersion;
+version = jarvisVersion;
 pyproject = true;
 nativeBuildInputs = [ poetry-core ];
 propagatedBuildInputs = [

@@ -33,7 +33,7 @@
 useRocm ? config.rocmSupport,
 enableCurl ? true,
 useVulkan ? false,
-llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
+jarvisVersion ? "0.0.0", # Arbitrary version, substituted by the flake

 # It's necessary to consistently use backendStdenv when building with CUDA support,
 # otherwise we get libstdc++ errors downstream.

@@ -103,8 +103,8 @@ let
 in

 effectiveStdenv.mkDerivation (finalAttrs: {
-pname = "llama-cpp${pnameSuffix}";
+pname = "jarvis-cpp${pnameSuffix}";
-version = llamaVersion;
+version = jarvisVersion;

 # Note: none of the files discarded here are visible in the sandbox or
 # affect the output hash. This also means they can be modified without

@@ -132,12 +132,12 @@ effectiveStdenv.mkDerivation (finalAttrs: {
 --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
 '';

-# With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
+# With PR#6015 https://github.com/ggerganov/jarvis.cpp/pull/6015,
 # `default.metallib` may be compiled with Metal compiler from XCode
 # and we need to escape sandbox on MacOS to access Metal compiler.
 # `xcrun` is used find the path of the Metal compiler, which is varible
 # and not on $PATH
-# see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
+# see https://github.com/ggerganov/jarvis.cpp/pull/6118 for discussion
 __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;

 nativeBuildInputs =

@@ -166,10 +166,10 @@ effectiveStdenv.mkDerivation (finalAttrs: {

 cmakeFlags =
 [
-(cmakeBool "LLAMA_BUILD_SERVER" true)
+(cmakeBool "JARVIS_BUILD_SERVER" true)
 (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
 (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-(cmakeBool "LLAMA_CURL" enableCurl)
+(cmakeBool "JARVIS_CURL" enableCurl)
 (cmakeBool "GGML_NATIVE" false)
 (cmakeBool "GGML_BLAS" useBlas)
 (cmakeBool "GGML_CUDA" useCuda)

@@ -205,7 +205,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
 # if they haven't been added yet.
 postInstall = ''
 mkdir -p $out/include
-cp $src/include/llama.h $out/include/
+cp $src/include/jarvis.h $out/include/
 '';

 meta = {

@@ -219,11 +219,11 @@ effectiveStdenv.mkDerivation (finalAttrs: {
 broken = (useMetalKit && !effectiveStdenv.isDarwin);

 description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-homepage = "https://github.com/ggerganov/llama.cpp/";
+homepage = "https://github.com/ggerganov/jarvis.cpp/";
 license = lib.licenses.mit;

 # Accommodates `nix run` and `lib.getExe`
-mainProgram = "llama-cli";
+mainProgram = "jarvis-cli";

 # These people might respond, on the best effort basis, if you ping them
 # in case of Nix-specific regressions or for reviewing Nix-specific PRs.

@@ -9,7 +9,7 @@
 }@inputs:

 let
-llama-python-deps = with python3Packages; [
+jarvis-python-deps = with python3Packages; [
 numpy
 sentencepiece
 transformers

@@ -18,7 +18,7 @@ let
 gguf-py
 tqdm

-# for scripts/compare-llama-bench.py
+# for scripts/compare-jarvis-bench.py
 gitpython
 tabulate

@@ -28,7 +28,7 @@ let

 ];

-llama-python-test-deps = with python3Packages; [
+jarvis-python-test-deps = with python3Packages; [
 # Server bench
 matplotlib

@@ -40,7 +40,7 @@ let
 in

 buildPythonPackage ({
-pname = "llama-scripts";
+pname = "jarvis-scripts";
 version = "0.0.0";
 pyproject = true;

@@ -61,6 +61,6 @@ buildPythonPackage ({
 src = lib.cleanSource ../../.;
 };
 nativeBuildInputs = [ poetry-core ];
-nativeCheckInputs = llama-python-test-deps;
+nativeCheckInputs = jarvis-python-test-deps;
-dependencies = llama-python-deps;
+dependencies = jarvis-python-deps;
 })

@@ -2,7 +2,7 @@
 lib,
 newScope,
 python3,
-llamaVersion ? "0.0.0",
+jarvisVersion ? "0.0.0",
 }:

 let

@@ -21,7 +21,7 @@ in
 # Cf. https://noogle.dev/f/lib/makeScope

 lib.makeScope newScope (self: {
-inherit llamaVersion;
+inherit jarvisVersion;
 gguf-py = self.callPackage ./package-gguf-py.nix {
 inherit
 buildPythonPackage

@@ -34,7 +34,7 @@ lib.makeScope newScope (self: {
 ;
 };
 python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
-llama-cpp = self.callPackage ./package.nix { };
+jarvis-cpp = self.callPackage ./package.nix { };
 docker = self.callPackage ./docker.nix { };
 docker-min = self.callPackage ./docker.nix { interactive = false; };
 sif = self.callPackage ./sif.nix { };

@@ -1,7 +1,7 @@
 {
 lib,
 singularity-tools,
-llama-cpp,
+jarvis-cpp,
 bashInteractive,
 interactive ? false,
 }:

@@ -10,8 +10,8 @@ let
 optionalInt = cond: x: if cond then x else 0;
 in
 singularity-tools.buildImage rec {
-inherit (llama-cpp) name;
+inherit (jarvis-cpp) name;
-contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];
+contents = [ jarvis-cpp ] ++ lib.optionals interactive [ bashInteractive ];

 # These are excessive (but safe) for most variants. Building singularity
 # images requires superuser privileges, so we build them inside a VM in a

@@ -22,6 +22,6 @@ singularity-tools.buildImage rec {
 # Expected image sizes:
 # - cpu/blas: 150M,
 # - cuda, all gencodes: 560M,
-diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
+diskSize = 4096 + optionalInt jarvis-cpp.useRocm 16384;
 memSize = diskSize;
 }

@@ -10,9 +10,9 @@ shift
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
 python3 ./convert_hf_to_gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-./llama-quantize "$@"
+./jarvis-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-./llama-cli "$@"
+./jarvis-cli "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
 echo "Converting PTH to GGML..."
 for i in `ls $1/$2/ggml-model-f16.bin*`; do

@@ -20,17 +20,17 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
 echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
 else
 echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+./jarvis-quantize "$i" "${i/f16/q4_0}" q4_0
 fi
 done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-./llama-server "$@"
+./jarvis-server "$@"
 else
 echo "Unknown command: $arg1"
 echo "Available commands: "
 echo " --run (-r): Run a model previously converted into ggml"
 echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
-echo " --convert (-c): Convert a llama model into ggml"
+echo " --convert (-c): Convert a jarvis model into ggml"
 echo " ex: --outtype f16 \"/models/7B/\" "
 echo " --quantize (-q): Optimize with quantization process ggml"
 echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"

@@ -12,8 +12,8 @@ build*/

 models/*

-/llama-cli
+/jarvis-cli
-/llama-quantize
+/jarvis-quantize

 arm_neon.h
 compile_commands.json

@@ -24,7 +24,7 @@ insert_final_newline = unset
 [examples/server/public/*]
 indent_size = 2

-[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
+[examples/jarvis.swiftui/jarvis.swiftui.xcodeproj/*]
 indent_style = tab

 [examples/cvector-generator/*.txt]

.github/ISSUE_TEMPLATE/01-bug-low.yml (vendored, 6 changes)

@@ -1,5 +1,5 @@
 name: Low Severity Bugs
-description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
+description: Used to report low severity bugs in jarvis.cpp (e.g. cosmetic issues, non critical UI glitches)
 title: "Bug: "
 labels: ["bug-unconfirmed", "low severity"]
 body:

@@ -8,7 +8,7 @@ body:
 value: |
 Thanks for taking the time to fill out this bug report!
 Please include information about your system, the steps to reproduce the bug,
-and the version of llama.cpp that you are using.
+and the version of jarvis.cpp that you are using.
 If possible, please provide a minimal code example that reproduces the bug.
 - type: textarea
 id: what-happened

@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-$./llama-cli --version
+$./jarvis-cli --version
 version: 2999 (42b4109e)
 built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/ISSUE_TEMPLATE/02-bug-medium.yml (vendored, 6 changes)

@@ -1,5 +1,5 @@
 name: Medium Severity Bug
-description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
+description: Used to report medium severity bugs in jarvis.cpp (e.g. Malfunctioning Features but generally still useable)
 title: "Bug: "
 labels: ["bug-unconfirmed", "medium severity"]
 body:

@@ -8,7 +8,7 @@ body:
 value: |
 Thanks for taking the time to fill out this bug report!
 Please include information about your system, the steps to reproduce the bug,
-and the version of llama.cpp that you are using.
+and the version of jarvis.cpp that you are using.
 If possible, please provide a minimal code example that reproduces the bug.
 - type: textarea
 id: what-happened

@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-$./llama-cli --version
+$./jarvis-cli --version
 version: 2999 (42b4109e)
 built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/ISSUE_TEMPLATE/03-bug-high.yml (vendored, 6 changes)

@@ -1,5 +1,5 @@
 name: High Severity Bug
-description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
+description: Used to report high severity bugs in jarvis.cpp (e.g. Malfunctioning features hindering important common workflow)
 title: "Bug: "
 labels: ["bug-unconfirmed", "high severity"]
 body:

@@ -8,7 +8,7 @@ body:
 value: |
 Thanks for taking the time to fill out this bug report!
 Please include information about your system, the steps to reproduce the bug,
-and the version of llama.cpp that you are using.
+and the version of jarvis.cpp that you are using.
 If possible, please provide a minimal code example that reproduces the bug.
 - type: textarea
 id: what-happened

@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-$./llama-cli --version
+$./jarvis-cli --version
 version: 2999 (42b4109e)
 built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/ISSUE_TEMPLATE/04-bug-critical.yml (vendored, 6 changes)

@@ -1,5 +1,5 @@
 name: Critical Severity Bug
-description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
+description: Used to report critical severity bugs in jarvis.cpp (e.g. Crashing, Corrupted, Dataloss)
 title: "Bug: "
 labels: ["bug-unconfirmed", "critical severity"]
 body:

@@ -8,7 +8,7 @@ body:
 value: |
 Thanks for taking the time to fill out this bug report!
 Please include information about your system, the steps to reproduce the bug,
-and the version of llama.cpp that you are using.
+and the version of jarvis.cpp that you are using.
 If possible, please provide a minimal code example that reproduces the bug.
 - type: textarea
 id: what-happened

@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-$./llama-cli --version
+$./jarvis-cli --version
 version: 2999 (42b4109e)
 built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/ISSUE_TEMPLATE/05-enhancement.yml (vendored, 12 changes)

@@ -1,12 +1,12 @@
 name: Enhancement
-description: Used to request enhancements for llama.cpp
+description: Used to request enhancements for jarvis.cpp
 title: "Feature Request: "
 labels: ["enhancement"]
 body:
 - type: markdown
 attributes:
 value: |
-[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
+[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/jarvis.cpp/discussions/categories/ideas)

 - type: checkboxes
 id: prerequisites

@@ -16,18 +16,18 @@ body:
 options:
 - label: I am running the latest code. Mention the version if possible as well.
 required: true
-- label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+- label: I carefully followed the [README.md](https://github.com/ggerganov/jarvis.cpp/blob/master/README.md).
 required: true
 - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
 required: true
-- label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
+- label: I reviewed the [Discussions](https://github.com/ggerganov/jarvis.cpp/discussions), and have a new and useful enhancement to share.
 required: true

 - type: textarea
 id: feature-description
 attributes:
 label: Feature Description
-description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
+description: Please provide a detailed written description of what you were trying to do, and what you expected `jarvis.cpp` to do as an enhancement.
 placeholder: Detailed description of the enhancement
 validations:
 required: true

@@ -36,7 +36,7 @@ body:
 id: motivation
 attributes:
 label: Motivation
-description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
+description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `jarvis.cpp` users.
 placeholder: Explanation of why this feature is needed and its benefits
 validations:
 required: true

.github/ISSUE_TEMPLATE/06-research.yml  (vendored, 2 changes)
@@ -6,7 +6,7 @@ body:
 - type: markdown
 attributes:
 value: |
-Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
+Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/jarvis.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)

 - type: checkboxes
 id: research-stage
.github/ISSUE_TEMPLATE/07-refactor.yml  (vendored, 4 changes)
@@ -6,8 +6,8 @@ body:
 - type: markdown
 attributes:
 value: |
-Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
-Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
+Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/jarvis.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
+Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/jarvis.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.

 - type: textarea
 id: background-description
.github/ISSUE_TEMPLATE/config.yml  (vendored, 6 changes)
@@ -1,11 +1,11 @@
 blank_issues_enabled: true
 contact_links:
 - name: Got an idea?
-url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
+url: https://github.com/ggerganov/jarvis.cpp/discussions/categories/ideas
 about: Pop it there. It may then become an enhancement ticket.
 - name: Got a question?
-url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
+url: https://github.com/ggerganov/jarvis.cpp/discussions/categories/q-a
 about: Ask a question there!
 - name: Want to contribute?
-url: https://github.com/ggerganov/llama.cpp/wiki/contribute
+url: https://github.com/ggerganov/jarvis.cpp/wiki/contribute
 about: Head to the contribution guide page of the wiki for areas you can help with
.github/labeler.yml  (vendored, 2 changes)
@@ -67,7 +67,7 @@ script:
 android:
 - changed-files:
 - any-glob-to-any-file:
-- examples/llama.android/**
+- examples/jarvis.android/**
 server:
 - changed-files:
 - any-glob-to-any-file:
.github/pull_request_template.md  (vendored, 2 changes)
@@ -1,6 +1,6 @@


-- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
+- [x] I have read the [contributing guidelines](https://github.com/ggerganov/jarvis.cpp/blob/master/CONTRIBUTING.md)
 - Self-reported review complexity:
 - [ ] Low
 - [ ] Medium
.github/workflows/bench.yml.disabled  (vendored, 26 changes)
@@ -1,5 +1,5 @@
 # TODO: there have been some issues with the workflow, so disabling for now
-# https://github.com/ggerganov/llama.cpp/issues/7893
+# https://github.com/ggerganov/jarvis.cpp/issues/7893
 #
 # Benchmark
 name: Benchmark
@@ -27,10 +27,10 @@ on:
 push:
 branches:
 - master
-paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+paths: ['jarvis.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
 pull_request_target:
 types: [opened, synchronize, reopened]
-paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+paths: ['jarvis.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
 schedule:
 - cron: '04 2 * * *'

@@ -113,16 +113,16 @@ jobs:
 set -eux
 cmake -B build \
 -DGGML_NATIVE=OFF \
--DLLAMA_BUILD_SERVER=ON \
--DLLAMA_CURL=ON \
--DLLAMA_CUBLAS=ON \
+-DJARVIS_BUILD_SERVER=ON \
+-DJARVIS_CURL=ON \
+-DJARVIS_CUBLAS=ON \
 -DCUDAToolkit_ROOT=/usr/local/cuda \
 -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
 -DCMAKE_CUDA_ARCHITECTURES=75 \
--DLLAMA_FATAL_WARNINGS=OFF \
--DLLAMA_ALL_WARNINGS=OFF \
+-DJARVIS_FATAL_WARNINGS=OFF \
+-DJARVIS_ALL_WARNINGS=OFF \
 -DCMAKE_BUILD_TYPE=Release;
-cmake --build build --config Release -j $(nproc) --target llama-server
+cmake --build build --config Release -j $(nproc) --target jarvis-server

 - name: Download the dataset
 id: download_dataset
@@ -240,7 +240,7 @@ jobs:
 message: |
 <p align="center">

-📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+📈 **jarvis.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀

 </p>

@@ -249,9 +249,9 @@ jobs:
 <summary>Expand details for performance related PR only</summary>

 - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
-- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
-- Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
-- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
+- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.JARVISCPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.JARVISCPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
+- Prompt processing (pp): avg=${{ env.JARVISCPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.JARVISCPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
+- Token generation (tg): avg=${{ env.JARVISCPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.JARVISCPP_TOKENS_SECOND_P_95_ }}tk/s
 - ${{ env.BENCH_GRAPH_XLABEL }}
.github/workflows/build.yml  (vendored, 148 changes)
@@ -28,9 +28,9 @@ env:
 BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
 GGML_NLOOP: 3
 GGML_N_THREADS: 1
-LLAMA_LOG_COLORS: 1
-LLAMA_LOG_PREFIX: 1
-LLAMA_LOG_TIMESTAMPS: 1
+JARVIS_LOG_COLORS: 1
+JARVIS_LOG_PREFIX: 1
+JARVIS_LOG_TIMESTAMPS: 1

 jobs:
 macOS-latest-cmake-arm64:
@@ -55,7 +55,7 @@ jobs:
 sysctl -a
 mkdir build
 cd build
-cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
+cmake -DJARVIS_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DJARVIS_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
 cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

 - name: Test
@@ -82,14 +82,14 @@ jobs:
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
 run: |
 cp LICENSE ./build/bin/
-zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
+zip -r jarvis-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*

 - name: Upload artifacts
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
 uses: actions/upload-artifact@v4
 with:
-path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
-name: llama-bin-macos-arm64.zip
+path: jarvis-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
+name: jarvis-bin-macos-arm64.zip

 macOS-latest-cmake-x64:
 runs-on: macos-12
@@ -112,8 +112,8 @@ jobs:
 run: |
 sysctl -a
 # Metal is disabled due to intermittent failures with Github runners not having a GPU:
-# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
+# https://github.com/ggerganov/jarvis.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+cmake -B build -DJARVIS_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DJARVIS_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
 cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

 - name: Test
@@ -140,20 +140,20 @@ jobs:
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
 run: |
 cp LICENSE ./build/bin/
-zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
+zip -r jarvis-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*

 - name: Upload artifacts
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
 uses: actions/upload-artifact@v4
 with:
-path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
-name: llama-bin-macos-x64.zip
+path: jarvis-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
+name: jarvis-bin-macos-x64.zip

 ubuntu-focal-make:
 runs-on: ubuntu-20.04
 env:
-LLAMA_NODE_AVAILABLE: true
-LLAMA_PYTHON_AVAILABLE: true
+JARVIS_NODE_AVAILABLE: true
+JARVIS_PYTHON_AVAILABLE: true

 steps:
 - name: Clone
@@ -177,7 +177,7 @@ jobs:
 - name: Build
 id: make_build
 env:
-LLAMA_FATAL_WARNINGS: 1
+JARVIS_FATAL_WARNINGS: 1
 run: |
 CC=gcc-8 make -j $(nproc)

@@ -204,8 +204,8 @@ jobs:
 - name: Build
 id: make_build
 env:
-LLAMA_FATAL_WARNINGS: 1
-LLAMA_CURL: 1
+JARVIS_FATAL_WARNINGS: 1
+JARVIS_CURL: 1
 run: |
 CC=gcc-8 make -j $(nproc)

@@ -230,7 +230,7 @@ jobs:
 run: |
 mkdir build
 cd build
-cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
+cmake .. -DJARVIS_FATAL_WARNINGS=ON -DJARVIS_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
 cmake --build . --config Release -j $(nproc)

 - name: Test
@@ -239,16 +239,16 @@ jobs:
 cd build
 ctest -L 'main|curl' --verbose --timeout 900

-- name: Test llama2c conversion
-id: llama2c_test
+- name: Test jarvis2c conversion
+id: jarvis2c_test
 run: |
 cd build
 echo "Fetch tokenizer"
-wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
-echo "Fetch llama2c model"
-wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
-./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+wget https://huggingface.co/karpathy/tinyjarviss/resolve/main/stories260K/tok512.bin
+echo "Fetch jarvis2c model"
+wget https://huggingface.co/karpathy/tinyjarviss/resolve/main/stories260K/stories260K.bin
+./bin/jarvis-convert-jarvis2c-to-ggml --copy-vocab-from-model ./tok512.bin --jarvis2c-model stories260K.bin --jarvis2c-output-model stories260K.gguf
+./bin/jarvis-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

 - name: Determine tag name
 id: tag
@@ -268,14 +268,14 @@ jobs:
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
 run: |
 cp LICENSE ./build/bin/
-zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
+zip -r jarvis-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*

 - name: Upload artifacts
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
 uses: actions/upload-artifact@v4
 with:
-path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
-name: llama-bin-ubuntu-x64.zip
+path: jarvis-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
+name: jarvis-bin-ubuntu-x64.zip

 ubuntu-latest-cmake-sanitizer:
 runs-on: ubuntu-latest
@@ -304,7 +304,7 @@ jobs:
 run: |
 mkdir build
 cd build
-cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+cmake .. -DJARVIS_FATAL_WARNINGS=ON -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
 cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

 - name: Build (no OpenMP)
@@ -313,7 +313,7 @@ jobs:
 run: |
 mkdir build
 cd build
-cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
+cmake .. -DJARVIS_FATAL_WARNINGS=ON -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
 cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

 - name: Test
@@ -487,7 +487,7 @@ jobs:

 # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
 # how to debug it.
-# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
+# ref: https://github.com/ggerganov/jarvis.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
 macOS-latest-make:
 runs-on: macos-latest

@@ -505,7 +505,7 @@ jobs:
 - name: Build
 id: make_build
 env:
-LLAMA_FATAL_WARNINGS: 1
+JARVIS_FATAL_WARNINGS: 1
 run: |
 GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)

@@ -517,7 +517,7 @@ jobs:

 # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
 # how to debug it.
-# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
+# ref: https://github.com/ggerganov/jarvis.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
 # would be great if we fix these
 macOS-latest-cmake:
 runs-on: macos-latest
@@ -539,7 +539,7 @@ jobs:
 sysctl -a
 mkdir build
 cd build
-cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
+cmake -DJARVIS_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
 cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

 - name: Test
@@ -570,9 +570,9 @@ jobs:
 cd build
 cmake -G Xcode .. \
 -DGGML_METAL_EMBED_LIBRARY=ON \
--DLLAMA_BUILD_EXAMPLES=OFF \
--DLLAMA_BUILD_TESTS=OFF \
--DLLAMA_BUILD_SERVER=OFF \
+-DJARVIS_BUILD_EXAMPLES=OFF \
+-DJARVIS_BUILD_TESTS=OFF \
+-DJARVIS_BUILD_SERVER=OFF \
 -DCMAKE_SYSTEM_NAME=iOS \
 -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
 -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
@@ -600,9 +600,9 @@ jobs:
 cd build
 cmake -G Xcode .. \
 -DGGML_METAL_EMBED_LIBRARY=ON \
--DLLAMA_BUILD_EXAMPLES=OFF \
--DLLAMA_BUILD_TESTS=OFF \
--DLLAMA_BUILD_SERVER=OFF \
+-DJARVIS_BUILD_EXAMPLES=OFF \
+-DJARVIS_BUILD_TESTS=OFF \
+-DJARVIS_BUILD_SERVER=OFF \
 -DCMAKE_SYSTEM_NAME=tvOS \
 -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
 -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
@@ -629,7 +629,7 @@ jobs:
 - name: xcodebuild for swift package
 id: xcodebuild
 run: |
-xcodebuild -scheme llama -destination "${{ matrix.destination }}"
+xcodebuild -scheme jarvis -destination "${{ matrix.destination }}"

 - name: Build Swift Example
 id: make_build_swift_example
@@ -705,23 +705,23 @@ jobs:
 matrix:
 include:
 - build: 'noavx-x64'
-defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
 - build: 'avx2-x64'
-defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
+defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
 - build: 'avx-x64'
-defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
 - build: 'avx512-x64'
-defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
 - build: 'openblas-x64'
-defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
 - build: 'kompute-x64'
-defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
+defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
 - build: 'vulkan-x64'
-defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
 - build: 'llvm-arm64'
-defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
 - build: 'msvc-arm64'
-defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'

 steps:
 - name: Clone
@@ -807,7 +807,7 @@ jobs:
 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
 $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
 cd build
-$env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
+$env:JARVIS_SKIP_TESTS_SLOW_ON_EMULATOR = 1
 & $sde -future -- ctest -L main -C Release --verbose --timeout 900

 - name: Determine tag name
@@ -827,15 +827,15 @@ jobs:
 id: pack_artifacts
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
 run: |
-Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
-7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
+Copy-Item LICENSE .\build\bin\Release\jarvis.cpp.txt
+7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*

 - name: Upload artifacts
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
 uses: actions/upload-artifact@v4
 with:
-path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
-name: llama-bin-win-${{ matrix.build }}.zip
+path: jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
+name: jarvis-bin-win-${{ matrix.build }}.zip

 windows-latest-cmake-cuda:
 runs-on: windows-2019
@@ -865,7 +865,7 @@ jobs:
 run: |
 mkdir build
 cd build
-cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
+cmake .. -DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
 cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
 cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

@@ -886,28 +886,28 @@ jobs:
 id: pack_artifacts
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
 run: |
-7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
+7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*

 - name: Upload artifacts
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
 uses: actions/upload-artifact@v4
 with:
-path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
-name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+path: jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
+name: jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip

 - name: Copy and pack Cuda runtime
 run: |
 echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
 $dst='.\build\bin\cudart\'
 robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
-7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
+7z a cudart-jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*

 - name: Upload Cuda runtime
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
 uses: actions/upload-artifact@v4
 with:
-path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
-name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+path: cudart-jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip
+name: cudart-jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip

 windows-latest-cmake-sycl:
 runs-on: windows-latest
@@ -963,14 +963,14 @@ jobs:
 cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
 cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
 echo "cp oneAPI running time dll files to ./build/bin done"
-7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
+7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*

 - name: Upload artifacts
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
 uses: actions/upload-artifact@v4
 with:
-path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
-name: llama-bin-win-sycl-x64.zip
+path: jarvis-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
+name: jarvis-bin-win-sycl-x64.zip

 windows-latest-cmake-hip:
 if: ${{ github.event.inputs.create_release != 'true' }}
@@ -1060,13 +1060,13 @@ jobs:
 - name: Pack artifacts
 id: pack_artifacts
 run: |
-7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
+7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*

 - name: Upload artifacts
 uses: actions/upload-artifact@v4
 with:
-path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
-name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
+path: jarvis-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
+name: jarvis-bin-win-hip-x64-${{ matrix.gpu_target }}.zip

 ios-xcode-build:
 runs-on: macos-latest
@@ -1076,7 +1076,7 @@ jobs:
 uses: actions/checkout@v4

 - name: Build Xcode project
-run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
+run: xcodebuild -project examples/jarvis.swiftui/jarvis.swiftui.xcodeproj -scheme jarvis.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build

 android-build:
 runs-on: ubuntu-latest
@@ -1098,7 +1098,7 @@ jobs:

 - name: Build
 run: |
-cd examples/llama.android
+cd examples/jarvis.android

 ./gradlew build --no-daemon

@@ -1261,7 +1261,7 @@ jobs:
 # sudo apt-get install cmake
 #
 # - name: Configure
-# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
+# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON
 #
 # - name: Build
 # run: |
@@ -1300,7 +1300,7 @@ jobs:
 # - name: Upload binaries
 # uses: actions/upload-artifact@v4
 # with:
-# name: llama-bin-${{ matrix.arch }}
+# name: jarvis-bin-${{ matrix.arch }}
 # path: build/bin/${{ matrix.build }}
 #
 # windows-blas:
@@ -1339,7 +1339,7 @@ jobs:
 # run: >
 # cmake -S . -B ./build -A ${{ matrix.arch }}
 # -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-# -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }}
+# -DJARVIS_SUPPORT_OPENBLAS=${{ matrix.blas }}
 # -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
 #
 # - name: Build
@@ -1355,7 +1355,7 @@ jobs:
 # if: matrix.blas == 'ON'
 # uses: actions/upload-artifact@v4
 # with:
-# name: llama-blas-bin-${{ matrix.arch }}
+# name: jarvis-blas-bin-${{ matrix.arch }}
 # path: build/bin/${{ matrix.build }}
 #
 # emscripten:
.github/workflows/docker.yml  (vendored, 20 changes)
@@ -37,21 +37,21 @@ jobs:
 strategy:
 matrix:
 config:
-- { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-- { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+- { tag: "light", dockerfile: ".devops/jarvis-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+- { tag: "server", dockerfile: ".devops/jarvis-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
 - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-- { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
-- { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
+- { tag: "light-cuda", dockerfile: ".devops/jarvis-cli-cuda.Dockerfile", platforms: "linux/amd64" }
+- { tag: "server-cuda", dockerfile: ".devops/jarvis-server-cuda.Dockerfile", platforms: "linux/amd64" }
 - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-- { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
-- { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
+- { tag: "light-musa", dockerfile: ".devops/jarvis-cli-musa.Dockerfile", platforms: "linux/amd64" }
+- { tag: "server-musa", dockerfile: ".devops/jarvis-server-musa.Dockerfile", platforms: "linux/amd64" }
 - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
 # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
-#- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-#- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+#- { tag: "light-rocm", dockerfile: ".devops/jarvis-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+#- { tag: "server-rocm", dockerfile: ".devops/jarvis-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
 #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-- { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
-- { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
+- { tag: "light-intel", dockerfile: ".devops/jarvis-cli-intel.Dockerfile", platforms: "linux/amd64" }
+- { tag: "server-intel", dockerfile: ".devops/jarvis-server-intel.Dockerfile", platforms: "linux/amd64" }
 steps:
 - name: Check out the repo
 uses: actions/checkout@v4
.github/workflows/labeler.yml  (vendored, 2 changes)
@@ -11,7 +11,7 @@ jobs:
 steps:
 - uses: actions/checkout@v4
 with:
-repository: "ggerganov/llama.cpp"
+repository: "ggerganov/jarvis.cpp"
 - uses: actions/labeler@v5
 with:
 configuration-path: '.github/labeler.yml'
.github/workflows/nix-ci-aarch64.yml  (vendored, 6 changes)
@@ -47,8 +47,8 @@ jobs:
 extra-conf: |
 extra-platforms = aarch64-linux
 extra-system-features = nixos-test kvm
-extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+extra-substituters = https://jarvis-cpp.cachix.org https://cuda-maintainers.cachix.org
+extra-trusted-public-keys = jarvis-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
 - uses: DeterminateSystems/magic-nix-cache-action@v2
 with:
 upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -56,7 +56,7 @@ jobs:
 uses: cachix/cachix-action@v13
 with:
 authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-name: llama-cpp
+name: jarvis-cpp
 - name: Show all output paths
 run: >
 nix run github:nix-community/nix-eval-jobs
.github/workflows/nix-ci.yml  (vendored, 10 changes)
@@ -34,8 +34,8 @@ jobs:
 with:
 github-token: ${{ secrets.GITHUB_TOKEN }}
 extra-conf: |
-extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+extra-substituters = https://jarvis-cpp.cachix.org https://cuda-maintainers.cachix.org
+extra-trusted-public-keys = jarvis-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
 - uses: DeterminateSystems/magic-nix-cache-action@v2
 with:
 upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -61,8 +61,8 @@ jobs:
 with:
 github-token: ${{ secrets.GITHUB_TOKEN }}
 extra-conf: |
-extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+extra-substituters = https://jarvis-cpp.cachix.org https://cuda-maintainers.cachix.org
+extra-trusted-public-keys = jarvis-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
 - uses: DeterminateSystems/magic-nix-cache-action@v2
 with:
 upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -70,7 +70,7 @@ jobs:
 uses: cachix/cachix-action@v13
 with:
 authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-name: llama-cpp
+name: jarvis-cpp
 - name: Build
 run: >
 nix run github:Mic92/nix-fast-build
.github/workflows/server.yml  (vendored, 32 changes)
@@ -21,10 +21,10 @@ on:
 paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']

 env:
-LLAMA_LOG_COLORS: 1
-LLAMA_LOG_PREFIX: 1
-LLAMA_LOG_TIMESTAMPS: 1
-LLAMA_LOG_VERBOSITY: 10
+JARVIS_LOG_COLORS: 1
+JARVIS_LOG_PREFIX: 1
+JARVIS_LOG_TIMESTAMPS: 1
+JARVIS_LOG_VERBOSITY: 10

 concurrency:
 group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
@@ -41,7 +41,7 @@ jobs:
 include:
 - build_type: Release
 sanitizer: ""
-fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
+fail-fast: false # While -DJARVIS_SANITIZE_THREAD=ON is broken

 steps:
 - name: Dependencies
@@ -99,12 +99,12 @@ jobs:
 run: |
 cmake -B build \
 -DGGML_NATIVE=OFF \
--DLLAMA_BUILD_SERVER=ON \
--DLLAMA_CURL=ON \
+-DJARVIS_BUILD_SERVER=ON \
+-DJARVIS_CURL=ON \
 -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
--DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+-DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON \
 -DGGML_OPENMP=OFF ;
-cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target jarvis-server

 - name: Build
 id: cmake_build
@@ -112,11 +112,11 @@ jobs:
 run: |
 cmake -B build \
 -DGGML_NATIVE=OFF \
--DLLAMA_BUILD_SERVER=ON \
--DLLAMA_CURL=ON \
+-DJARVIS_BUILD_SERVER=ON \
+-DJARVIS_CURL=ON \
 -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
--DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+-DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON ;
+cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target jarvis-server

 - name: Tests
 id: server_integration_tests
@@ -155,8 +155,8 @@ jobs:
 - name: Build
 id: cmake_build
 run: |
-cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
-cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
+cmake -B build -DJARVIS_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
+cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target jarvis-server

 - name: Python setup
 id: setup_python
@@ -180,7 +180,7 @@ jobs:
 run: |
 cd examples/server/tests
 $env:PYTHONIOENCODING = ":replace"
-behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
+behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags jarvis.cpp

 - name: Slow tests
 id: server_integration_tests_slow
.gitignore  (vendored, 8 changes)
@@ -48,8 +48,8 @@ build*
 !build-info.sh
 !build.zig
 !docs/build.md
-/libllama.so
-/llama-*
+/libjarvis.so
+/jarvis-*
 /vulkan-shaders-gen
 android-ndk-*
 arm_neon.h
@@ -57,7 +57,7 @@ cmake-build-*
 CMakeSettings.json
 compile_commands.json
 ggml-metal-embed.metal
-llama-batched-swift
+jarvis-batched-swift
 /rpc-server
 out/
 tmp/
@@ -118,7 +118,7 @@ poetry.toml
 /tests/test-double-float
 /tests/test-grad0
 /tests/test-grammar-parser
-/tests/test-llama-grammar
+/tests/test-jarvis-grammar
 /tests/test-opt
 /tests/test-quantize-fns
 /tests/test-quantize-perf
118
CMakeLists.txt
118
CMakeLists.txt
|
@ -1,5 +1,5 @@
|
||||||
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
|
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
|
||||||
project("llama.cpp" C CXX)
|
project("jarvis.cpp" C CXX)
|
||||||
include(CheckIncludeFileCXX)
|
include(CheckIncludeFileCXX)
|
||||||
|
|
||||||
#set(CMAKE_WARN_DEPRECATED YES)
|
#set(CMAKE_WARN_DEPRECATED YES)
|
||||||
|
@ -18,20 +18,20 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
||||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||||
|
|
||||||
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
||||||
set(LLAMA_STANDALONE ON)
|
set(JARVIS_STANDALONE ON)
|
||||||
|
|
||||||
include(git-vars)
|
include(git-vars)
|
||||||
|
|
||||||
# configure project version
|
# configure project version
|
||||||
# TODO
|
# TODO
|
||||||
else()
|
else()
|
||||||
set(LLAMA_STANDALONE OFF)
|
set(JARVIS_STANDALONE OFF)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (EMSCRIPTEN)
|
if (EMSCRIPTEN)
|
||||||
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
||||||
|
|
||||||
option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
|
option(JARVIS_WASM_SINGLE_FILE "jarvis: embed WASM inside the generated jarvis.js" ON)
|
||||||
else()
|
else()
|
||||||
if (MINGW)
|
if (MINGW)
|
||||||
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
||||||
|
@ -51,41 +51,41 @@ endif()
|
||||||
#
|
#
|
||||||
|
|
||||||
# debug
|
# debug
|
||||||
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
|
option(JARVIS_ALL_WARNINGS "jarvis: enable all compiler warnings" ON)
|
||||||
option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
|
option(JARVIS_ALL_WARNINGS_3RD_PARTY "jarvis: enable all compiler warnings in 3rd party libs" OFF)
|
||||||
|
|
||||||
# build
|
# build
|
||||||
option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF)
|
option(JARVIS_FATAL_WARNINGS "jarvis: enable -Werror flag" OFF)
|
||||||
|
|
||||||
# sanitizers
|
# sanitizers
|
||||||
option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
|
option(JARVIS_SANITIZE_THREAD "jarvis: enable thread sanitizer" OFF)
|
||||||
option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
|
option(JARVIS_SANITIZE_ADDRESS "jarvis: enable address sanitizer" OFF)
|
||||||
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
|
option(JARVIS_SANITIZE_UNDEFINED "jarvis: enable undefined sanitizer" OFF)
|
||||||
|
|
||||||
# utils
|
# utils
|
||||||
option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
|
option(JARVIS_BUILD_COMMON "jarvis: build common utils library" ${JARVIS_STANDALONE})
|
||||||
|
|
||||||
# extra artifacts
|
# extra artifacts
|
||||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
option(JARVIS_BUILD_TESTS "jarvis: build tests" ${JARVIS_STANDALONE})
|
||||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
option(JARVIS_BUILD_EXAMPLES "jarvis: build examples" ${JARVIS_STANDALONE})
|
||||||
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
option(JARVIS_BUILD_SERVER "jarvis: build server example" ${JARVIS_STANDALONE})
|
||||||
|
|
||||||
# 3rd party libs
|
# 3rd party libs
|
||||||
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
|
option(JARVIS_CURL "jarvis: use libcurl to download model from a URL" OFF)
|
||||||
|
|
||||||
# Required for relocatable CMake package
|
# Required for relocatable CMake package
|
||||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||||
|
|
||||||
# override ggml options
|
# override ggml options
|
||||||
set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
|
set(GGML_SANITIZE_THREAD ${JARVIS_SANITIZE_THREAD})
|
||||||
set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
|
set(GGML_SANITIZE_ADDRESS ${JARVIS_SANITIZE_ADDRESS})
|
||||||
set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
|
set(GGML_SANITIZE_UNDEFINED ${JARVIS_SANITIZE_UNDEFINED})
|
||||||
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
|
set(GGML_ALL_WARNINGS ${JARVIS_ALL_WARNINGS})
|
||||||
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
|
set(GGML_FATAL_WARNINGS ${JARVIS_FATAL_WARNINGS})
|
||||||
|
|
||||||
# change the default for these ggml options
|
# change the default for these ggml options
|
||||||
if (NOT DEFINED GGML_LLAMAFILE)
|
if (NOT DEFINED GGML_JARVISFILE)
|
||||||
set(GGML_LLAMAFILE_DEFAULT ON)
|
set(GGML_JARVISFILE_DEFAULT ON)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (NOT DEFINED GGML_AMX)
|
if (NOT DEFINED GGML_AMX)
|
||||||
|
@ -97,23 +97,23 @@ if (NOT DEFINED GGML_CUDA_GRAPHS)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# transition helpers
|
# transition helpers
|
||||||
function (llama_option_depr TYPE OLD NEW)
|
function (jarvis_option_depr TYPE OLD NEW)
|
||||||
if (${OLD})
|
if (${OLD})
|
||||||
message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
|
message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
|
||||||
set(${NEW} ON PARENT_SCOPE)
|
set(${NEW} ON PARENT_SCOPE)
|
||||||
endif()
|
endif()
|
||||||
endfunction()
|
endfunction()
|
||||||
|
|
||||||
llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
|
jarvis_option_depr(FATAL_ERROR JARVIS_CUBLAS GGML_CUDA)
|
||||||
llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
|
jarvis_option_depr(WARNING JARVIS_CUDA GGML_CUDA)
|
||||||
llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
|
jarvis_option_depr(WARNING JARVIS_KOMPUTE GGML_KOMPUTE)
|
||||||
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
|
jarvis_option_depr(WARNING JARVIS_METAL GGML_METAL)
|
||||||
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
|
jarvis_option_depr(WARNING JARVIS_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
|
||||||
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
|
jarvis_option_depr(WARNING JARVIS_NATIVE GGML_NATIVE)
|
||||||
llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
|
jarvis_option_depr(WARNING JARVIS_RPC GGML_RPC)
|
||||||
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
|
jarvis_option_depr(WARNING JARVIS_SYCL GGML_SYCL)
|
||||||
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
|
jarvis_option_depr(WARNING JARVIS_SYCL_F16 GGML_SYCL_F16)
|
||||||
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
|
jarvis_option_depr(WARNING JARVIS_CANN GGML_CANN)
|
||||||
|
|
||||||
#
|
#
|
||||||
# build the library
|
# build the library
|
||||||
|
@ -132,18 +132,18 @@ add_subdirectory(src)
|
||||||
include(GNUInstallDirs)
|
include(GNUInstallDirs)
|
||||||
include(CMakePackageConfigHelpers)
|
include(CMakePackageConfigHelpers)
|
||||||
|
|
||||||
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
|
set(JARVIS_BUILD_NUMBER ${BUILD_NUMBER})
|
||||||
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
|
set(JARVIS_BUILD_COMMIT ${BUILD_COMMIT})
|
||||||
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
|
set(JARVIS_INSTALL_VERSION 0.0.${BUILD_NUMBER})
|
||||||
|
|
||||||
set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
|
set(JARVIS_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
|
||||||
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
|
set(JARVIS_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
|
||||||
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
|
set(JARVIS_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
|
||||||
|
|
||||||
|
|
||||||
# At the moment some compile definitions are placed within the ggml/src
|
# At the moment some compile definitions are placed within the ggml/src
|
||||||
# directory but not exported on the `ggml` target. This could be improved by
|
# directory but not exported on the `ggml` target. This could be improved by
|
||||||
# determining _precisely_ which defines are necessary for the llama-config
|
# determining _precisely_ which defines are necessary for the jarvis-config
|
||||||
# package.
|
# package.
|
||||||
#
|
#
|
||||||
set(GGML_TRANSIENT_DEFINES)
|
set(GGML_TRANSIENT_DEFINES)
|
||||||
|
@ -158,25 +158,25 @@ if (GGML_TARGET_DEFINES)
|
||||||
endif()
|
endif()
|
||||||
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
|
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
|
||||||
|
|
||||||
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
|
set_target_properties(jarvis PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/jarvis.h)
|
||||||
install(TARGETS llama LIBRARY PUBLIC_HEADER)
|
install(TARGETS jarvis LIBRARY PUBLIC_HEADER)
|
||||||
|
|
||||||
configure_package_config_file(
|
configure_package_config_file(
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in
|
${CMAKE_CURRENT_SOURCE_DIR}/cmake/jarvis-config.cmake.in
|
||||||
${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
|
${CMAKE_CURRENT_BINARY_DIR}/jarvis-config.cmake
|
||||||
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama
|
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/jarvis
|
||||||
PATH_VARS LLAMA_INCLUDE_INSTALL_DIR
|
PATH_VARS JARVIS_INCLUDE_INSTALL_DIR
|
||||||
LLAMA_LIB_INSTALL_DIR
|
JARVIS_LIB_INSTALL_DIR
|
||||||
LLAMA_BIN_INSTALL_DIR )
|
JARVIS_BIN_INSTALL_DIR )
|
||||||
|
|
||||||
write_basic_package_version_file(
|
write_basic_package_version_file(
|
||||||
${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
|
${CMAKE_CURRENT_BINARY_DIR}/jarvis-version.cmake
|
||||||
VERSION ${LLAMA_INSTALL_VERSION}
|
VERSION ${JARVIS_INSTALL_VERSION}
|
||||||
COMPATIBILITY SameMajorVersion)
|
COMPATIBILITY SameMajorVersion)
|
||||||
|
|
||||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
|
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/jarvis-config.cmake
|
||||||
${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
|
${CMAKE_CURRENT_BINARY_DIR}/jarvis-version.cmake
|
||||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)
|
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/jarvis)
|
||||||
|
|
||||||
install(
|
install(
|
||||||
FILES convert_hf_to_gguf.py
|
FILES convert_hf_to_gguf.py
|
||||||
|
@ -190,27 +190,27 @@ install(
|
||||||
WORLD_EXECUTE
|
WORLD_EXECUTE
|
||||||
DESTINATION ${CMAKE_INSTALL_BINDIR})
|
DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||||
|
|
||||||
configure_file(cmake/llama.pc.in
|
configure_file(cmake/jarvis.pc.in
|
||||||
"${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
|
"${CMAKE_CURRENT_BINARY_DIR}/jarvis.pc"
|
||||||
@ONLY)
|
@ONLY)
|
||||||
|
|
||||||
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
|
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/jarvis.pc"
|
||||||
DESTINATION lib/pkgconfig)
|
DESTINATION lib/pkgconfig)
|
||||||
|
|
||||||
#
|
#
|
||||||
# utils, programs, examples and tests
|
# utils, programs, examples and tests
|
||||||
#
|
#
|
||||||
|
|
||||||
if (LLAMA_BUILD_COMMON)
|
if (JARVIS_BUILD_COMMON)
|
||||||
add_subdirectory(common)
|
add_subdirectory(common)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
|
if (JARVIS_BUILD_COMMON AND JARVIS_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
|
||||||
include(CTest)
|
include(CTest)
|
||||||
add_subdirectory(tests)
|
add_subdirectory(tests)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
|
if (JARVIS_BUILD_COMMON AND JARVIS_BUILD_EXAMPLES)
|
||||||
add_subdirectory(examples)
|
add_subdirectory(examples)
|
||||||
add_subdirectory(pocs)
|
add_subdirectory(pocs)
|
||||||
endif()
|
endif()
|
||||||
|
|
|
@ -11,7 +11,7 @@

- Squash-merge PRs
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
- Optionally pick a `<module>` from here: https://github.com/ggerganov/jarvis.cpp/wiki/Modules

# Coding guidelines

@ -22,7 +22,7 @@
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/jarvis.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$

![matmul](media/matmul.png)
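
To make the last two guidelines concrete, here is a small worked example (the shapes $n_A$, $n_B$ and $k$ are illustrative, not taken from the code). With row-major storage, both operands share their column count $k$, and

$$
A \in \mathbb{R}^{n_A \times k}, \quad B \in \mathbb{R}^{n_B \times k}, \quad C = \texttt{ggml\_mul\_mat}(ctx, A, B) = B A^T \in \mathbb{R}^{n_B \times n_A}, \qquad C_{ij} = \sum_{l=1}^{k} B_{il} A_{jl},
$$

which is the guideline's $C^T = A B^T$ written out element-wise.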

@ -30,4 +30,4 @@

The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:

https://github.com/ggerganov/llama.cpp/projects
https://github.com/ggerganov/jarvis.cpp/projects

@ -7,7 +7,7 @@ import java.util.Scanner;
public class LLMCLI {
public static void main(String[] args) {
// Path to the .exe file
String exePath = "bin/llama-cli.exe";
String exePath = "bin/jarvis-cli.exe";

System.out.println("Enter -h for help");
// Scanner to take user input for various commands
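
The hunk above shows only the top of the Java wrapper. As a minimal sketch of how such a wrapper typically drives the renamed binary (the class name, model path, prompt and token count below are assumptions for illustration and are not part of this commit, assuming jarvis-cli's usual `-m`, `-p` and `-n` options):

import java.io.BufferedReader;
import java.io.InputStreamReader;

// Illustrative sketch only: not the LLMCLI implementation from the diff above.
public class JarvisCliSketch {
    public static void main(String[] args) throws Exception {
        ProcessBuilder pb = new ProcessBuilder(
                "bin/jarvis-cli.exe",       // same binary path as in the hunk above
                "-m", "models/model.gguf",  // model path (assumed)
                "-p", "Hello",              // prompt (assumed)
                "-n", "32");                // number of tokens to generate (assumed)
        pb.redirectErrorStream(true);       // fold stderr into stdout
        Process p = pb.start();
        try (BufferedReader r = new BufferedReader(new InputStreamReader(p.getInputStream()))) {
            String line;
            while ((line = r.readLine()) != null) {
                System.out.println(line);   // echo the model's output line by line
            }
        }
        p.waitFor();
    }
}

Reading the merged output stream line by line keeps the sketch self-contained; a real wrapper would also validate the user's input before passing it through.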
388 Makefile
@ -1,44 +1,44 @@
|
||||||
# Define the default target now so that it is always the first target
|
# Define the default target now so that it is always the first target
|
||||||
BUILD_TARGETS = \
|
BUILD_TARGETS = \
|
||||||
libllava.a \
|
libllava.a \
|
||||||
llama-baby-llama \
|
jarvis-baby-jarvis \
|
||||||
llama-batched \
|
jarvis-batched \
|
||||||
llama-batched-bench \
|
jarvis-batched-bench \
|
||||||
llama-bench \
|
jarvis-bench \
|
||||||
llama-cli \
|
jarvis-cli \
|
||||||
llama-convert-llama2c-to-ggml \
|
jarvis-convert-jarvis2c-to-ggml \
|
||||||
llama-embedding \
|
jarvis-embedding \
|
||||||
llama-eval-callback \
|
jarvis-eval-callback \
|
||||||
llama-export-lora \
|
jarvis-export-lora \
|
||||||
llama-gbnf-validator \
|
jarvis-gbnf-validator \
|
||||||
llama-gguf \
|
jarvis-gguf \
|
||||||
llama-gguf-hash \
|
jarvis-gguf-hash \
|
||||||
llama-gguf-split \
|
jarvis-gguf-split \
|
||||||
llama-gritlm \
|
jarvis-gritlm \
|
||||||
llama-imatrix \
|
jarvis-imatrix \
|
||||||
llama-infill \
|
jarvis-infill \
|
||||||
llama-llava-cli \
|
jarvis-llava-cli \
|
||||||
llama-minicpmv-cli\
|
jarvis-minicpmv-cli\
|
||||||
llama-lookahead \
|
jarvis-lookahead \
|
||||||
llama-lookup \
|
jarvis-lookup \
|
||||||
llama-lookup-create \
|
jarvis-lookup-create \
|
||||||
llama-lookup-merge \
|
jarvis-lookup-merge \
|
||||||
llama-lookup-stats \
|
jarvis-lookup-stats \
|
||||||
llama-parallel \
|
jarvis-parallel \
|
||||||
llama-passkey \
|
jarvis-passkey \
|
||||||
llama-perplexity \
|
jarvis-perplexity \
|
||||||
llama-q8dot \
|
jarvis-q8dot \
|
||||||
llama-quantize \
|
jarvis-quantize \
|
||||||
llama-quantize-stats \
|
jarvis-quantize-stats \
|
||||||
llama-retrieval \
|
jarvis-retrieval \
|
||||||
llama-save-load-state \
|
jarvis-save-load-state \
|
||||||
llama-server \
|
jarvis-server \
|
||||||
llama-simple \
|
jarvis-simple \
|
||||||
llama-speculative \
|
jarvis-speculative \
|
||||||
llama-tokenize \
|
jarvis-tokenize \
|
||||||
llama-vdot \
|
jarvis-vdot \
|
||||||
llama-cvector-generator \
|
jarvis-cvector-generator \
|
||||||
llama-gen-docs \
|
jarvis-gen-docs \
|
||||||
tests/test-c.o
|
tests/test-c.o
|
||||||
|
|
||||||
# Binaries only useful for tests
|
# Binaries only useful for tests
|
||||||
|
@ -52,7 +52,7 @@ TEST_TARGETS = \
|
||||||
tests/test-grammar-integration \
|
tests/test-grammar-integration \
|
||||||
tests/test-grammar-parser \
|
tests/test-grammar-parser \
|
||||||
tests/test-json-schema-to-grammar \
|
tests/test-json-schema-to-grammar \
|
||||||
tests/test-llama-grammar \
|
tests/test-jarvis-grammar \
|
||||||
tests/test-log \
|
tests/test-log \
|
||||||
tests/test-model-load-cancel \
|
tests/test-model-load-cancel \
|
||||||
tests/test-opt \
|
tests/test-opt \
|
||||||
|
@ -65,8 +65,8 @@ TEST_TARGETS = \
|
||||||
tests/test-tokenizer-1-spm
|
tests/test-tokenizer-1-spm
|
||||||
|
|
||||||
# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
|
# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
|
||||||
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
|
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-jarvis2c-to-ggml \
|
||||||
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
|
simple batched batched-bench save-load-state server gguf gguf-split eval-callback jarvis-bench libllava.a llava-cli baby-jarvis \
|
||||||
retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm
|
retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm
|
||||||
|
|
||||||
# Legacy build targets that were renamed in #7809, but we want to build binaries for them that output a deprecation warning if people try to use them.
|
# Legacy build targets that were renamed in #7809, but we want to build binaries for them that output a deprecation warning if people try to use them.
|
||||||
|
@ -74,80 +74,80 @@ LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding
|
||||||
LEGACY_TARGETS_BUILD = main quantize perplexity embedding server
|
LEGACY_TARGETS_BUILD = main quantize perplexity embedding server
|
||||||
|
|
||||||
# Deprecation aliases
|
# Deprecation aliases
|
||||||
ifdef LLAMA_CUBLAS
|
ifdef JARVIS_CUBLAS
|
||||||
$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
|
$(error JARVIS_CUBLAS is removed. Use GGML_CUDA instead.)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_CUDA
|
ifdef JARVIS_CUDA
|
||||||
GGML_CUDA := 1
|
GGML_CUDA := 1
|
||||||
DEPRECATE_WARNING := 1
|
DEPRECATE_WARNING := 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_KOMPUTE
|
ifdef JARVIS_KOMPUTE
|
||||||
GGML_KOMPUTE := 1
|
GGML_KOMPUTE := 1
|
||||||
DEPRECATE_WARNING := 1
|
DEPRECATE_WARNING := 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_METAL
|
ifdef JARVIS_METAL
|
||||||
GGML_METAL := 1
|
GGML_METAL := 1
|
||||||
DEPRECATE_WARNING := 1
|
DEPRECATE_WARNING := 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_RPC
|
ifdef JARVIS_RPC
|
||||||
GGML_RPC := 1
|
GGML_RPC := 1
|
||||||
DEPRECATE_WARNING := 1
|
DEPRECATE_WARNING := 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_SYCL
|
ifdef JARVIS_SYCL
|
||||||
GGML_SYCL := 1
|
GGML_SYCL := 1
|
||||||
DEPRECATE_WARNING := 1
|
DEPRECATE_WARNING := 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_SYCL_F16
|
ifdef JARVIS_SYCL_F16
|
||||||
GGML_SYCL_F16 := 1
|
GGML_SYCL_F16 := 1
|
||||||
DEPRECATE_WARNING := 1
|
DEPRECATE_WARNING := 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_OPENBLAS
|
ifdef JARVIS_OPENBLAS
|
||||||
GGML_OPENBLAS := 1
|
GGML_OPENBLAS := 1
|
||||||
DEPRECATE_WARNING := 1
|
DEPRECATE_WARNING := 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_OPENBLAS64
|
ifdef JARVIS_OPENBLAS64
|
||||||
GGML_OPENBLAS64 := 1
|
GGML_OPENBLAS64 := 1
|
||||||
DEPRECATE_WARNING := 1
|
DEPRECATE_WARNING := 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_BLIS
|
ifdef JARVIS_BLIS
|
||||||
GGML_BLIS := 1
|
GGML_BLIS := 1
|
||||||
DEPRECATE_WARNING := 1
|
DEPRECATE_WARNING := 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_NO_LLAMAFILE
|
ifdef JARVIS_NO_JARVISFILE
|
||||||
GGML_NO_LLAMAFILE := 1
|
GGML_NO_JARVISFILE := 1
|
||||||
DEPRECATE_WARNING := 1
|
DEPRECATE_WARNING := 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_NO_ACCELERATE
|
ifdef JARVIS_NO_ACCELERATE
|
||||||
GGML_NO_ACCELERATE := 1
|
GGML_NO_ACCELERATE := 1
|
||||||
DEPRECATE_WARNING := 1
|
DEPRECATE_WARNING := 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_NO_OPENMP
|
ifdef JARVIS_NO_OPENMP
|
||||||
GGML_NO_OPENMP := 1
|
GGML_NO_OPENMP := 1
|
||||||
DEPRECATE_WARNING := 1
|
DEPRECATE_WARNING := 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_NO_METAL
|
ifdef JARVIS_NO_METAL
|
||||||
GGML_NO_METAL := 1
|
GGML_NO_METAL := 1
|
||||||
DEPRECATE_WARNING := 1
|
DEPRECATE_WARNING := 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_DISABLE_LOGS
|
ifdef JARVIS_DISABLE_LOGS
|
||||||
REMOVE_WARNING := 1
|
REMOVE_WARNING := 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_SERVER_VERBOSE
|
ifdef JARVIS_SERVER_VERBOSE
|
||||||
REMOVE_WARNING := 1
|
REMOVE_WARNING := 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -211,8 +211,8 @@ test: $(TEST_TARGETS)
|
||||||
@failures=0; \
|
@failures=0; \
|
||||||
for test_target in $(TEST_TARGETS); do \
|
for test_target in $(TEST_TARGETS); do \
|
||||||
if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
|
if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
|
||||||
./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
|
./$$test_target $(CURDIR)/models/ggml-vocab-jarvis-spm.gguf; \
|
||||||
./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
|
./$$test_target $(CURDIR)/models/ggml-vocab-jarvis-bpe.gguf; \
|
||||||
./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
|
./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
|
||||||
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
|
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
|
||||||
./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
|
./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
|
||||||
|
@ -257,7 +257,7 @@ MK_CFLAGS = -std=c11 -fPIC
|
||||||
MK_CXXFLAGS = -std=c++11 -fPIC
|
MK_CXXFLAGS = -std=c++11 -fPIC
|
||||||
MK_NVCCFLAGS = -std=c++11
|
MK_NVCCFLAGS = -std=c++11
|
||||||
|
|
||||||
ifdef LLAMA_NO_CCACHE
|
ifdef JARVIS_NO_CCACHE
|
||||||
GGML_NO_CCACHE := 1
|
GGML_NO_CCACHE := 1
|
||||||
DEPRECATE_WARNING := 1
|
DEPRECATE_WARNING := 1
|
||||||
endif
|
endif
|
||||||
|
@ -320,7 +320,7 @@ ifdef GGML_SCHED_MAX_COPIES
|
||||||
MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(GGML_SCHED_MAX_COPIES)
|
MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(GGML_SCHED_MAX_COPIES)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_DEBUG
|
ifdef JARVIS_DEBUG
|
||||||
MK_CFLAGS += -O0 -g
|
MK_CFLAGS += -O0 -g
|
||||||
MK_CXXFLAGS += -O0 -g
|
MK_CXXFLAGS += -O0 -g
|
||||||
MK_LDFLAGS += -g
|
MK_LDFLAGS += -g
|
||||||
|
@ -336,25 +336,25 @@ else
|
||||||
MK_NVCCFLAGS += -O3 -g
|
MK_NVCCFLAGS += -O3 -g
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_SANITIZE_THREAD
|
ifdef JARVIS_SANITIZE_THREAD
|
||||||
MK_CFLAGS += -fsanitize=thread -g
|
MK_CFLAGS += -fsanitize=thread -g
|
||||||
MK_CXXFLAGS += -fsanitize=thread -g
|
MK_CXXFLAGS += -fsanitize=thread -g
|
||||||
MK_LDFLAGS += -fsanitize=thread -g
|
MK_LDFLAGS += -fsanitize=thread -g
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_SANITIZE_ADDRESS
|
ifdef JARVIS_SANITIZE_ADDRESS
|
||||||
MK_CFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
|
MK_CFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
|
||||||
MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
|
MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
|
||||||
MK_LDFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
|
MK_LDFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_SANITIZE_UNDEFINED
|
ifdef JARVIS_SANITIZE_UNDEFINED
|
||||||
MK_CFLAGS += -fsanitize=undefined -g
|
MK_CFLAGS += -fsanitize=undefined -g
|
||||||
MK_CXXFLAGS += -fsanitize=undefined -g
|
MK_CXXFLAGS += -fsanitize=undefined -g
|
||||||
MK_LDFLAGS += -fsanitize=undefined -g
|
MK_LDFLAGS += -fsanitize=undefined -g
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_SERVER_SSL
|
ifdef JARVIS_SERVER_SSL
|
||||||
MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
|
MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
|
||||||
MK_LDFLAGS += -lssl -lcrypto
|
MK_LDFLAGS += -lssl -lcrypto
|
||||||
endif
|
endif
|
||||||
|
@ -381,7 +381,7 @@ MK_CXXFLAGS += \
|
||||||
-Wmissing-declarations \
|
-Wmissing-declarations \
|
||||||
-Wmissing-noreturn
|
-Wmissing-noreturn
|
||||||
|
|
||||||
ifeq ($(LLAMA_FATAL_WARNINGS),1)
|
ifeq ($(JARVIS_FATAL_WARNINGS),1)
|
||||||
MK_CFLAGS += -Werror
|
MK_CFLAGS += -Werror
|
||||||
MK_CXXFLAGS += -Werror
|
MK_CXXFLAGS += -Werror
|
||||||
endif
|
endif
|
||||||
|
@ -420,7 +420,7 @@ ifeq ($(_WIN32),1)
|
||||||
LWINSOCK2 := -lws2_32
|
LWINSOCK2 := -lws2_32
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_GPROF
|
ifdef JARVIS_GPROF
|
||||||
MK_CFLAGS += -pg
|
MK_CFLAGS += -pg
|
||||||
MK_CXXFLAGS += -pg
|
MK_CXXFLAGS += -pg
|
||||||
endif
|
endif
|
||||||
|
@ -448,7 +448,7 @@ endif
|
||||||
ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
|
ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
|
||||||
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
|
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
|
||||||
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
|
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
|
||||||
# https://github.com/ggerganov/llama.cpp/issues/2922
|
# https://github.com/ggerganov/jarvis.cpp/issues/2922
|
||||||
MK_CFLAGS += -Xassembler -muse-unaligned-vector-move
|
MK_CFLAGS += -Xassembler -muse-unaligned-vector-move
|
||||||
MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
|
MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
|
||||||
|
|
||||||
|
@ -574,9 +574,9 @@ ifdef GGML_NVPL
|
||||||
OBJ_GGML += ggml/src/ggml-blas.o
|
OBJ_GGML += ggml/src/ggml-blas.o
|
||||||
endif # GGML_NVPL
|
endif # GGML_NVPL
|
||||||
|
|
||||||
ifndef GGML_NO_LLAMAFILE
|
ifndef GGML_NO_JARVISFILE
|
||||||
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
|
MK_CPPFLAGS += -DGGML_USE_JARVISFILE
|
||||||
OBJ_GGML += ggml/src/llamafile/sgemm.o
|
OBJ_GGML += ggml/src/jarvisfile/sgemm.o
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifndef GGML_NO_AMX
|
ifndef GGML_NO_AMX
|
||||||
|
@ -627,9 +627,9 @@ ifdef GGML_CUDA
|
||||||
OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
|
OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
|
||||||
OBJ_GGML += $(OBJ_CUDA_TMPL)
|
OBJ_GGML += $(OBJ_CUDA_TMPL)
|
||||||
|
|
||||||
ifdef LLAMA_FATAL_WARNINGS
|
ifdef JARVIS_FATAL_WARNINGS
|
||||||
MK_NVCCFLAGS += -Werror all-warnings
|
MK_NVCCFLAGS += -Werror all-warnings
|
||||||
endif # LLAMA_FATAL_WARNINGS
|
endif # JARVIS_FATAL_WARNINGS
|
||||||
|
|
||||||
ifndef GGML_MUSA
|
ifndef GGML_MUSA
|
||||||
ifndef JETSON_EOL_MODULE_DETECT
|
ifndef JETSON_EOL_MODULE_DETECT
|
||||||
|
@ -637,9 +637,9 @@ ifndef JETSON_EOL_MODULE_DETECT
|
||||||
endif # JETSON_EOL_MODULE_DETECT
|
endif # JETSON_EOL_MODULE_DETECT
|
||||||
endif # GGML_MUSA
|
endif # GGML_MUSA
|
||||||
|
|
||||||
ifdef LLAMA_DEBUG
|
ifdef JARVIS_DEBUG
|
||||||
MK_NVCCFLAGS += -lineinfo
|
MK_NVCCFLAGS += -lineinfo
|
||||||
endif # LLAMA_DEBUG
|
endif # JARVIS_DEBUG
|
||||||
|
|
||||||
ifdef GGML_CUDA_DEBUG
|
ifdef GGML_CUDA_DEBUG
|
||||||
MK_NVCCFLAGS += --device-debug
|
MK_NVCCFLAGS += --device-debug
|
||||||
|
@ -920,11 +920,11 @@ OBJ_GGML += \
|
||||||
ggml/src/ggml-quants.o \
|
ggml/src/ggml-quants.o \
|
||||||
ggml/src/ggml-aarch64.o
|
ggml/src/ggml-aarch64.o
|
||||||
|
|
||||||
OBJ_LLAMA = \
|
OBJ_JARVIS = \
|
||||||
src/llama.o \
|
src/jarvis.o \
|
||||||
src/llama-vocab.o \
|
src/jarvis-vocab.o \
|
||||||
src/llama-grammar.o \
|
src/jarvis-grammar.o \
|
||||||
src/llama-sampling.o \
|
src/jarvis-sampling.o \
|
||||||
src/unicode.o \
|
src/unicode.o \
|
||||||
src/unicode-data.o
|
src/unicode-data.o
|
||||||
|
|
||||||
|
@ -939,19 +939,19 @@ OBJ_COMMON = \
|
||||||
common/build-info.o \
|
common/build-info.o \
|
||||||
common/json-schema-to-grammar.o
|
common/json-schema-to-grammar.o
|
||||||
|
|
||||||
OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)
|
OBJ_ALL = $(OBJ_GGML) $(OBJ_JARVIS) $(OBJ_COMMON)
|
||||||
|
|
||||||
LIB_GGML = $(LIB_PRE)ggml$(DSO_EXT)
|
LIB_GGML = $(LIB_PRE)ggml$(DSO_EXT)
|
||||||
LIB_GGML_S = $(LIB_PRE)ggml.a
|
LIB_GGML_S = $(LIB_PRE)ggml.a
|
||||||
|
|
||||||
LIB_LLAMA = $(LIB_PRE)llama$(DSO_EXT)
|
LIB_JARVIS = $(LIB_PRE)jarvis$(DSO_EXT)
|
||||||
LIB_LLAMA_S = $(LIB_PRE)llama.a
|
LIB_JARVIS_S = $(LIB_PRE)jarvis.a
|
||||||
|
|
||||||
LIB_COMMON = $(LIB_PRE)common$(DSO_EXT)
|
LIB_COMMON = $(LIB_PRE)common$(DSO_EXT)
|
||||||
LIB_COMMON_S = $(LIB_PRE)common.a
|
LIB_COMMON_S = $(LIB_PRE)common.a
|
||||||
|
|
||||||
LIB_ALL = $(LIB_GGML) $(LIB_LLAMA) $(LIB_COMMON)
|
LIB_ALL = $(LIB_GGML) $(LIB_JARVIS) $(LIB_COMMON)
|
||||||
LIB_ALL_S = $(LIB_GGML_S) $(LIB_LLAMA_S) $(LIB_COMMON_S)
|
LIB_ALL_S = $(LIB_GGML_S) $(LIB_JARVIS_S) $(LIB_COMMON_S)
|
||||||
|
|
||||||
GF_CC := $(CC)
|
GF_CC := $(CC)
|
||||||
include scripts/get-flags.mk
|
include scripts/get-flags.mk
|
||||||
|
@ -971,8 +971,8 @@ include scripts/get-flags.mk
|
||||||
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
|
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_CURL
|
ifdef JARVIS_CURL
|
||||||
override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
|
override CXXFLAGS := $(CXXFLAGS) -DJARVIS_USE_CURL
|
||||||
override LDFLAGS := $(LDFLAGS) -lcurl
|
override LDFLAGS := $(LDFLAGS) -lcurl
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -980,7 +980,7 @@ endif
|
||||||
# Print build information
|
# Print build information
|
||||||
#
|
#
|
||||||
|
|
||||||
$(info I llama.cpp build info: )
|
$(info I jarvis.cpp build info: )
|
||||||
$(info I UNAME_S: $(UNAME_S))
|
$(info I UNAME_S: $(UNAME_S))
|
||||||
$(info I UNAME_P: $(UNAME_P))
|
$(info I UNAME_P: $(UNAME_P))
|
||||||
$(info I UNAME_M: $(UNAME_M))
|
$(info I UNAME_M: $(UNAME_M))
|
||||||
|
@ -1009,30 +1009,30 @@ $(info )
|
||||||
|
|
||||||
ifdef DEPRECATE_WARNING
|
ifdef DEPRECATE_WARNING
|
||||||
$(info !!! DEPRECATION WARNING !!!)
|
$(info !!! DEPRECATION WARNING !!!)
|
||||||
$(info The following LLAMA_ options are deprecated and will be removed in the future. Use the GGML_ prefix instead)
|
$(info The following JARVIS_ options are deprecated and will be removed in the future. Use the GGML_ prefix instead)
|
||||||
$(info - LLAMA_CUDA)
|
$(info - JARVIS_CUDA)
|
||||||
$(info - LLAMA_METAL)
|
$(info - JARVIS_METAL)
|
||||||
$(info - LLAMA_METAL_EMBED_LIBRARY)
|
$(info - JARVIS_METAL_EMBED_LIBRARY)
|
||||||
$(info - LLAMA_OPENMP)
|
$(info - JARVIS_OPENMP)
|
||||||
$(info - LLAMA_RPC)
|
$(info - JARVIS_RPC)
|
||||||
$(info - LLAMA_SYCL)
|
$(info - JARVIS_SYCL)
|
||||||
$(info - LLAMA_SYCL_F16)
|
$(info - JARVIS_SYCL_F16)
|
||||||
$(info - LLAMA_OPENBLAS)
|
$(info - JARVIS_OPENBLAS)
|
||||||
$(info - LLAMA_OPENBLAS64)
|
$(info - JARVIS_OPENBLAS64)
|
||||||
$(info - LLAMA_BLIS)
|
$(info - JARVIS_BLIS)
|
||||||
$(info - LLAMA_NO_LLAMAFILE)
|
$(info - JARVIS_NO_JARVISFILE)
|
||||||
$(info - LLAMA_NO_ACCELERATE)
|
$(info - JARVIS_NO_ACCELERATE)
|
||||||
$(info - LLAMA_NO_OPENMP)
|
$(info - JARVIS_NO_OPENMP)
|
||||||
$(info - LLAMA_NO_METAL)
|
$(info - JARVIS_NO_METAL)
|
||||||
$(info - LLAMA_NO_CCACHE)
|
$(info - JARVIS_NO_CCACHE)
|
||||||
$(info )
|
$(info )
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef REMOVE_WARNING
|
ifdef REMOVE_WARNING
|
||||||
$(info !!! REMOVAL WARNING !!!)
|
$(info !!! REMOVAL WARNING !!!)
|
||||||
$(info The following LLAMA_ options have been removed and are no longer supported)
|
$(info The following JARVIS_ options have been removed and are no longer supported)
|
||||||
$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418))
|
$(info - JARVIS_DISABLE_LOGS (https://github.com/ggerganov/jarvis.cpp/pull/9418))
|
||||||
$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
|
$(info - JARVIS_SERVER_VERBOSE (https://github.com/ggerganov/jarvis.cpp/pull/9418))
|
||||||
$(info )
|
$(info )
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -1079,13 +1079,13 @@ ggml/src/ggml-blas.o: \
|
||||||
ggml/include/ggml-blas.h
|
ggml/include/ggml-blas.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
ifndef GGML_NO_LLAMAFILE
|
ifndef GGML_NO_JARVISFILE
|
||||||
ggml/src/llamafile/sgemm.o: \
|
ggml/src/jarvisfile/sgemm.o: \
|
||||||
ggml/src/llamafile/sgemm.cpp \
|
ggml/src/jarvisfile/sgemm.cpp \
|
||||||
ggml/src/llamafile/sgemm.h \
|
ggml/src/jarvisfile/sgemm.h \
|
||||||
ggml/include/ggml.h
|
ggml/include/ggml.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
endif # GGML_NO_LLAMAFILE
|
endif # GGML_NO_JARVISFILE
|
||||||
|
|
||||||
ifndef GGML_NO_AMX
|
ifndef GGML_NO_AMX
|
||||||
ggml/src/ggml-amx.o: \
|
ggml/src/ggml-amx.o: \
|
||||||
|
@ -1115,7 +1115,7 @@ $(LIB_GGML_S): \
|
||||||
$(OBJ_GGML)
|
$(OBJ_GGML)
|
||||||
ar rcs $(LIB_GGML_S) $^
|
ar rcs $(LIB_GGML_S) $^
|
||||||
|
|
||||||
# llama
|
# jarvis
|
||||||
|
|
||||||
src/unicode.o: \
|
src/unicode.o: \
|
||||||
src/unicode.cpp \
|
src/unicode.cpp \
|
||||||
|
@ -1127,14 +1127,14 @@ src/unicode-data.o: \
|
||||||
src/unicode-data.h
|
src/unicode-data.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
src/llama.o: \
|
src/jarvis.o: \
|
||||||
src/llama.cpp \
|
src/jarvis.cpp \
|
||||||
src/llama-impl.h \
|
src/jarvis-impl.h \
|
||||||
src/llama-vocab.h \
|
src/jarvis-vocab.h \
|
||||||
src/llama-grammar.h \
|
src/jarvis-grammar.h \
|
||||||
src/llama-sampling.h \
|
src/jarvis-sampling.h \
|
||||||
src/unicode.h \
|
src/unicode.h \
|
||||||
include/llama.h \
|
include/jarvis.h \
|
||||||
ggml/include/ggml-cuda.h \
|
ggml/include/ggml-cuda.h \
|
||||||
ggml/include/ggml-metal.h \
|
ggml/include/ggml-metal.h \
|
||||||
ggml/include/ggml.h \
|
ggml/include/ggml.h \
|
||||||
|
@ -1142,37 +1142,37 @@ src/llama.o: \
|
||||||
ggml/include/ggml-backend.h
|
ggml/include/ggml-backend.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
src/llama-vocab.o: \
|
src/jarvis-vocab.o: \
|
||||||
src/llama-vocab.cpp \
|
src/jarvis-vocab.cpp \
|
||||||
src/llama-vocab.h \
|
src/jarvis-vocab.h \
|
||||||
src/llama-impl.h \
|
src/jarvis-impl.h \
|
||||||
include/llama.h
|
include/jarvis.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
src/llama-grammar.o: \
|
src/jarvis-grammar.o: \
|
||||||
src/llama-grammar.cpp \
|
src/jarvis-grammar.cpp \
|
||||||
src/llama-grammar.h \
|
src/jarvis-grammar.h \
|
||||||
src/llama-impl.h \
|
src/jarvis-impl.h \
|
||||||
src/llama-vocab.h \
|
src/jarvis-vocab.h \
|
||||||
src/llama-sampling.h \
|
src/jarvis-sampling.h \
|
||||||
include/llama.h
|
include/jarvis.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
src/llama-sampling.o: \
|
src/jarvis-sampling.o: \
|
||||||
src/llama-sampling.cpp \
|
src/jarvis-sampling.cpp \
|
||||||
src/llama-sampling.h \
|
src/jarvis-sampling.h \
|
||||||
src/llama-impl.h \
|
src/jarvis-impl.h \
|
||||||
include/llama.h
|
include/jarvis.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
$(LIB_LLAMA): \
|
$(LIB_JARVIS): \
|
||||||
$(OBJ_LLAMA) \
|
$(OBJ_JARVIS) \
|
||||||
$(LIB_GGML)
|
$(LIB_GGML)
|
||||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||||
|
|
||||||
$(LIB_LLAMA_S): \
|
$(LIB_JARVIS_S): \
|
||||||
$(OBJ_LLAMA)
|
$(OBJ_JARVIS)
|
||||||
ar rcs $(LIB_LLAMA_S) $^
|
ar rcs $(LIB_JARVIS_S) $^
|
||||||
|
|
||||||
# common
|
# common
|
||||||
|
|
||||||
|
@ -1183,7 +1183,7 @@ common/common.o: \
|
||||||
common/sampling.h \
|
common/sampling.h \
|
||||||
common/json.hpp \
|
common/json.hpp \
|
||||||
common/json-schema-to-grammar.h \
|
common/json-schema-to-grammar.h \
|
||||||
include/llama.h
|
include/jarvis.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
common/arg.o: \
|
common/arg.o: \
|
||||||
|
@ -1199,7 +1199,7 @@ common/log.o: \
|
||||||
common/sampling.o: \
|
common/sampling.o: \
|
||||||
common/sampling.cpp \
|
common/sampling.cpp \
|
||||||
common/sampling.h \
|
common/sampling.h \
|
||||||
include/llama.h
|
include/jarvis.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
common/console.o: \
|
common/console.o: \
|
||||||
|
@ -1224,7 +1224,7 @@ common/ngram-cache.o: \
|
||||||
|
|
||||||
$(LIB_COMMON): \
|
$(LIB_COMMON): \
|
||||||
$(OBJ_COMMON) \
|
$(OBJ_COMMON) \
|
||||||
$(LIB_LLAMA) \
|
$(LIB_JARVIS) \
|
||||||
$(LIB_GGML)
|
$(LIB_GGML)
|
||||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||||
|
|
||||||
|
@ -1246,7 +1246,7 @@ clean:
|
||||||
rm -rvf ggml/*.dll
|
rm -rvf ggml/*.dll
|
||||||
rm -rvf ggml/*.so
|
rm -rvf ggml/*.so
|
||||||
rm -vrf ggml/src/*.o
|
rm -vrf ggml/src/*.o
|
||||||
rm -rvf ggml/src/llamafile/*.o
|
rm -rvf ggml/src/jarvisfile/*.o
|
||||||
rm -rvf common/build-info.cpp
|
rm -rvf common/build-info.cpp
|
||||||
rm -vrf ggml/src/ggml-metal-embed.metal
|
rm -vrf ggml/src/ggml-metal-embed.metal
|
||||||
rm -vrf ggml/src/ggml-cuda/*.o
|
rm -vrf ggml/src/ggml-cuda/*.o
|
||||||
|
@ -1269,75 +1269,75 @@ clean:
|
||||||
# Helper function that replaces .c, .cpp, and .cu file endings with .o:
|
# Helper function that replaces .c, .cpp, and .cu file endings with .o:
|
||||||
GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
|
GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
|
||||||
|
|
||||||
llama-cli: examples/main/main.cpp \
|
jarvis-cli: examples/main/main.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
@echo
|
@echo
|
||||||
@echo '==== Run ./llama-cli -h for help. ===='
|
@echo '==== Run ./jarvis-cli -h for help. ===='
|
||||||
@echo
|
@echo
|
||||||
|
|
||||||
llama-infill: examples/infill/infill.cpp \
|
jarvis-infill: examples/infill/infill.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-simple: examples/simple/simple.cpp \
|
jarvis-simple: examples/simple/simple.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-tokenize: examples/tokenize/tokenize.cpp \
|
jarvis-tokenize: examples/tokenize/tokenize.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-batched: examples/batched/batched.cpp \
|
jarvis-batched: examples/batched/batched.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-batched-bench: examples/batched-bench/batched-bench.cpp \
|
jarvis-batched-bench: examples/batched-bench/batched-bench.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-quantize: examples/quantize/quantize.cpp \
|
jarvis-quantize: examples/quantize/quantize.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
|
jarvis-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-perplexity: examples/perplexity/perplexity.cpp \
|
jarvis-perplexity: examples/perplexity/perplexity.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-imatrix: examples/imatrix/imatrix.cpp \
|
jarvis-imatrix: examples/imatrix/imatrix.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-embedding: examples/embedding/embedding.cpp \
|
jarvis-embedding: examples/embedding/embedding.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-gritlm: examples/gritlm/gritlm.cpp \
|
jarvis-gritlm: examples/gritlm/gritlm.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-save-load-state: examples/save-load-state/save-load-state.cpp \
|
jarvis-save-load-state: examples/save-load-state/save-load-state.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-gguf: examples/gguf/gguf.cpp \
|
jarvis-gguf: examples/gguf/gguf.cpp \
|
||||||
$(OBJ_GGML)
|
$(OBJ_GGML)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
@ -1354,92 +1354,92 @@ examples/gguf-hash/deps/sha256/sha256.o: \
|
||||||
examples/gguf-hash/deps/sha256/sha256.c
|
examples/gguf-hash/deps/sha256/sha256.c
|
||||||
$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
|
$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
|
||||||
|
|
||||||
llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\
|
jarvis-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-gguf-split: examples/gguf-split/gguf-split.cpp \
|
jarvis-gguf-split: examples/gguf-split/gguf-split.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-eval-callback: examples/eval-callback/eval-callback.cpp \
|
jarvis-eval-callback: examples/eval-callback/eval-callback.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
|
jarvis-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
|
jarvis-convert-jarvis2c-to-ggml: examples/convert-jarvis2c-to-ggml/convert-jarvis2c-to-ggml.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-bench: examples/llama-bench/llama-bench.cpp \
|
jarvis-bench: examples/jarvis-bench/jarvis-bench.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-baby-llama: examples/baby-llama/baby-llama.cpp \
|
jarvis-baby-jarvis: examples/baby-jarvis/baby-jarvis.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-export-lora: examples/export-lora/export-lora.cpp \
|
jarvis-export-lora: examples/export-lora/export-lora.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-retrieval: examples/retrieval/retrieval.cpp \
|
jarvis-retrieval: examples/retrieval/retrieval.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-speculative: examples/speculative/speculative.cpp \
|
jarvis-speculative: examples/speculative/speculative.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-parallel: examples/parallel/parallel.cpp \
|
jarvis-parallel: examples/parallel/parallel.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-lookahead: examples/lookahead/lookahead.cpp \
|
jarvis-lookahead: examples/lookahead/lookahead.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-lookup: examples/lookup/lookup.cpp \
|
jarvis-lookup: examples/lookup/lookup.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-lookup-create: examples/lookup/lookup-create.cpp \
|
jarvis-lookup-create: examples/lookup/lookup-create.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-lookup-merge: examples/lookup/lookup-merge.cpp \
|
jarvis-lookup-merge: examples/lookup/lookup-merge.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-lookup-stats: examples/lookup/lookup-stats.cpp \
|
jarvis-lookup-stats: examples/lookup/lookup-stats.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-passkey: examples/passkey/passkey.cpp \
|
jarvis-passkey: examples/passkey/passkey.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
|
jarvis-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
@ -1450,7 +1450,7 @@ rpc-server: examples/rpc/rpc-server.cpp \
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
endif # GGML_RPC
|
endif # GGML_RPC
|
||||||
|
|
||||||
llama-server: \
|
jarvis-server: \
|
||||||
examples/server/server.cpp \
|
examples/server/server.cpp \
|
||||||
examples/server/utils.hpp \
|
examples/server/utils.hpp \
|
||||||
examples/server/httplib.h \
|
examples/server/httplib.h \
|
||||||
|
@ -1485,7 +1485,7 @@ examples/server/%.hpp: examples/server/public/% Makefile
|
||||||
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
|
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
|
||||||
) > $@
|
) > $@
|
||||||
|
|
||||||
llama-gen-docs: examples/gen-docs/gen-docs.cpp \
|
jarvis-gen-docs: examples/gen-docs/gen-docs.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
@ -1499,7 +1499,7 @@ libllava.a: examples/llava/llava.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
|
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
|
||||||
|
|
||||||
llama-llava-cli: examples/llava/llava-cli.cpp \
|
jarvis-llava-cli: examples/llava/llava-cli.cpp \
|
||||||
examples/llava/llava.cpp \
|
examples/llava/llava.cpp \
|
||||||
examples/llava/llava.h \
|
examples/llava/llava.h \
|
||||||
examples/llava/clip.cpp \
|
examples/llava/clip.cpp \
|
||||||
|
@ -1507,7 +1507,7 @@ llama-llava-cli: examples/llava/llava-cli.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
||||||
|
|
||||||
llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
|
jarvis-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
|
||||||
examples/llava/llava.cpp \
|
examples/llava/llava.cpp \
|
||||||
examples/llava/llava.h \
|
examples/llava/llava.h \
|
||||||
examples/llava/clip.cpp \
|
examples/llava/clip.cpp \
|
||||||
|
@ -1542,7 +1542,7 @@ tests/test-arg-parser: tests/test-arg-parser.cpp \
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-llama-grammar: tests/test-llama-grammar.cpp \
|
tests/test-jarvis-grammar: tests/test-jarvis-grammar.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
@ -1616,7 +1616,7 @@ tests/test-rope: tests/test-rope.cpp ggml/src/ggml.o \
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-c.o: tests/test-c.c include/llama.h
|
tests/test-c.o: tests/test-c.c include/jarvis.h
|
||||||
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
|
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
|
||||||
|
|
||||||
tests/test-backend-ops: tests/test-backend-ops.cpp \
|
tests/test-backend-ops: tests/test-backend-ops.cpp \
|
||||||
|
@ -1643,12 +1643,12 @@ tests/test-chat-template: tests/test-chat-template.cpp \
|
||||||
# PoCs
|
# PoCs
|
||||||
#
|
#
|
||||||
|
|
||||||
llama-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \
|
jarvis-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \
|
||||||
$(OBJ_GGML)
|
$(OBJ_GGML)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
|
jarvis-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
|
||||||
$(OBJ_GGML)
|
$(OBJ_GGML)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
@ -1667,17 +1667,17 @@ examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning
|
||||||
# Eventually we will want to remove these targets from building all the time.
|
# Eventually we will want to remove these targets from building all the time.
|
||||||
main: examples/deprecation-warning/deprecation-warning.o
|
main: examples/deprecation-warning/deprecation-warning.o
|
||||||
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
||||||
@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
|
@echo "NOTICE: The 'main' binary is deprecated. Please use 'jarvis-cli' instead."
|
||||||
|
|
||||||
server: examples/deprecation-warning/deprecation-warning.o
|
server: examples/deprecation-warning/deprecation-warning.o
|
||||||
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
||||||
@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
|
@echo "NOTICE: The 'server' binary is deprecated. Please use 'jarvis-server' instead."
|
||||||
|
|
||||||
quantize: examples/deprecation-warning/deprecation-warning.o
|
quantize: examples/deprecation-warning/deprecation-warning.o
|
||||||
ifneq (,$(wildcard quantize))
|
ifneq (,$(wildcard quantize))
|
||||||
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
||||||
@echo "#########"
|
@echo "#########"
|
||||||
@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
|
@echo "WARNING: The 'quantize' binary is deprecated. Please use 'jarvis-quantize' instead."
|
||||||
@echo " Remove the 'quantize' binary to remove this warning."
|
@echo " Remove the 'quantize' binary to remove this warning."
|
||||||
@echo "#########"
|
@echo "#########"
|
||||||
endif
|
endif
|
||||||
|
@ -1686,7 +1686,7 @@ perplexity: examples/deprecation-warning/deprecation-warning.o
|
||||||
ifneq (,$(wildcard perplexity))
|
ifneq (,$(wildcard perplexity))
|
||||||
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
||||||
@echo "#########"
|
@echo "#########"
|
||||||
@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
|
@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'jarvis-perplexity' instead."
|
||||||
@echo " Remove the 'perplexity' binary to remove this warning."
|
@echo " Remove the 'perplexity' binary to remove this warning."
|
||||||
@echo "#########"
|
@echo "#########"
|
||||||
endif
|
endif
|
||||||
|
@ -1695,7 +1695,7 @@ embedding: examples/deprecation-warning/deprecation-warning.o
|
||||||
ifneq (,$(wildcard embedding))
|
ifneq (,$(wildcard embedding))
|
||||||
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
||||||
@echo "#########"
|
@echo "#########"
|
||||||
@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
|
@echo "WARNING: The 'embedding' binary is deprecated. Please use 'jarvis-embedding' instead."
|
||||||
@echo " Remove the 'embedding' binary to remove this warning."
|
@echo " Remove the 'embedding' binary to remove this warning."
|
||||||
@echo "#########"
|
@echo "#########"
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -3,10 +3,10 @@
|
||||||
import PackageDescription
|
import PackageDescription
|
||||||
|
|
||||||
var sources = [
|
var sources = [
|
||||||
"src/llama.cpp",
|
"src/jarvis.cpp",
|
||||||
"src/llama-vocab.cpp",
|
"src/jarvis-vocab.cpp",
|
||||||
"src/llama-grammar.cpp",
|
"src/jarvis-grammar.cpp",
|
||||||
"src/llama-sampling.cpp",
|
"src/jarvis-sampling.cpp",
|
||||||
"src/unicode.cpp",
|
"src/unicode.cpp",
|
||||||
"src/unicode-data.cpp",
|
"src/unicode-data.cpp",
|
||||||
"ggml/src/ggml.c",
|
"ggml/src/ggml.c",
|
||||||
|
@ -45,7 +45,7 @@ cSettings.append(
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
let package = Package(
|
let package = Package(
|
||||||
name: "llama",
|
name: "jarvis",
|
||||||
platforms: [
|
platforms: [
|
||||||
.macOS(.v12),
|
.macOS(.v12),
|
||||||
.iOS(.v14),
|
.iOS(.v14),
|
||||||
|
@ -53,11 +53,11 @@ let package = Package(
|
||||||
.tvOS(.v14)
|
.tvOS(.v14)
|
||||||
],
|
],
|
||||||
products: [
|
products: [
|
||||||
.library(name: "llama", targets: ["llama"]),
|
.library(name: "jarvis", targets: ["jarvis"]),
|
||||||
],
|
],
|
||||||
targets: [
|
targets: [
|
||||||
.target(
|
.target(
|
||||||
name: "llama",
|
name: "jarvis",
|
||||||
path: ".",
|
path: ".",
|
||||||
exclude: [
|
exclude: [
|
||||||
"cmake",
|
"cmake",
|
||||||
|
|
170
README.md
|
@ -1,30 +1,30 @@
|
||||||
# llama.cpp
|
# jarvis.cpp
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
[](https://opensource.org/licenses/MIT)
|
[](https://opensource.org/licenses/MIT)
|
||||||
[](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
|
[](https://github.com/ggerganov/jarvis.cpp/actions/workflows/server.yml)
|
||||||
[](https://conan.io/center/llama-cpp)
|
[](https://conan.io/center/jarvis-cpp)
|
||||||
|
|
||||||
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
|
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/jarvis.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/jarvis.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
|
||||||
|
|
||||||
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
|
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
|
||||||
|
|
||||||
## Recent API changes
|
## Recent API changes
|
||||||
|
|
||||||
- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
|
- [Changelog for `libjarvis` API](https://github.com/ggerganov/jarvis.cpp/issues/9289)
|
||||||
- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)
|
- [Changelog for `jarvis-server` REST API](https://github.com/ggerganov/jarvis.cpp/issues/9291)
|
||||||
|
|
||||||
## Hot topics
|
## Hot topics
|
||||||
|
|
||||||
- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
|
- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/jarvis.cpp/discussions/9669**
|
||||||
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
|
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/jarvis.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
|
||||||
|
|
||||||
----
|
----
|
||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
||||||
The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
|
The main goal of `jarvis.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
|
||||||
variety of hardware - locally and in the cloud.
|
variety of hardware - locally and in the cloud.
|
||||||
|
|
||||||
- Plain C/C++ implementation without any dependencies
|
- Plain C/C++ implementation without any dependencies
|
||||||
|
@ -35,7 +35,7 @@ variety of hardware - locally and in the cloud.
|
||||||
- Vulkan and SYCL backend support
|
- Vulkan and SYCL backend support
|
||||||
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
|
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
|
||||||
|
|
||||||
Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
|
Since its [inception](https://github.com/ggerganov/jarvis.cpp/issues/33#issuecomment-1465108022), the project has
|
||||||
improved significantly thanks to many contributions. It is the main playground for developing new features for the
|
improved significantly thanks to many contributions. It is the main playground for developing new features for the
|
||||||
[ggml](https://github.com/ggerganov/ggml) library.
|
[ggml](https://github.com/ggerganov/ggml) library.
|
||||||
|
|
||||||
|
@ -52,22 +52,22 @@ Typically finetunes of the base models below are supported as well.
|
||||||
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
|
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
|
||||||
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
|
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
|
||||||
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
|
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
|
||||||
- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
|
- [X] [BERT](https://github.com/ggerganov/jarvis.cpp/pull/5423)
|
||||||
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
|
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
|
||||||
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
|
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
|
||||||
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
|
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
|
||||||
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
|
- [X] [Starcoder models](https://github.com/ggerganov/jarvis.cpp/pull/3187)
|
||||||
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
|
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
|
||||||
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
|
- [X] [MPT](https://github.com/ggerganov/jarvis.cpp/pull/3417)
|
||||||
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
|
- [X] [Bloom](https://github.com/ggerganov/jarvis.cpp/pull/3553)
|
||||||
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
|
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
|
||||||
- [X] [StableLM models](https://huggingface.co/stabilityai)
|
- [X] [StableLM models](https://huggingface.co/stabilityai)
|
||||||
- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
|
- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
|
||||||
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
|
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
|
||||||
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
|
- [x] [PLaMo-13B](https://github.com/ggerganov/jarvis.cpp/pull/3557)
|
||||||
- [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
|
- [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
|
||||||
- [x] [GPT-2](https://huggingface.co/gpt2)
|
- [x] [GPT-2](https://huggingface.co/gpt2)
|
||||||
- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
|
- [x] [Orion 14B](https://github.com/ggerganov/jarvis.cpp/pull/5118)
|
||||||
- [x] [InternLM2](https://huggingface.co/models?search=internlm2)
|
- [x] [InternLM2](https://huggingface.co/models?search=internlm2)
|
||||||
- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
|
- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
|
||||||
- [x] [Gemma](https://ai.google.dev/gemma)
|
- [x] [Gemma](https://ai.google.dev/gemma)
|
||||||
|
@ -111,36 +111,36 @@ Typically finetunes of the base models below are supported as well.
|
||||||
|
|
||||||
**Bindings:**
|
**Bindings:**
|
||||||
|
|
||||||
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
|
- Python: [abetlen/jarvis-cpp-python](https://github.com/abetlen/jarvis-cpp-python)
|
||||||
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
|
- Go: [go-skynet/go-jarvis.cpp](https://github.com/go-skynet/go-jarvis.cpp)
|
||||||
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
|
- Node.js: [withcatai/node-jarvis-cpp](https://github.com/withcatai/node-jarvis-cpp)
|
||||||
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
|
- JS/TS (jarvis.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/jarviscpp)
|
||||||
- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli)
|
- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli)
|
||||||
- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
|
- JavaScript/Wasm (works in browser): [tangledgroup/jarvis-cpp-wasm](https://github.com/tangledgroup/jarvis-cpp-wasm)
|
||||||
- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
|
- Typescript/Wasm (nicer API, available on npm): [ngxson/wjarvis](https://github.com/ngxson/wjarvis)
|
||||||
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
|
- Ruby: [yoshoku/jarvis_cpp.rb](https://github.com/yoshoku/jarvis_cpp.rb)
|
||||||
- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
|
- Rust (more features): [edgenai/jarvis_cpp-rs](https://github.com/edgenai/jarvis_cpp-rs)
|
||||||
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
|
- Rust (nicer API): [mdrokz/rust-jarvis.cpp](https://github.com/mdrokz/rust-jarvis.cpp)
|
||||||
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
|
- Rust (more direct bindings): [utilityai/jarvis-cpp-rs](https://github.com/utilityai/jarvis-cpp-rs)
|
||||||
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
|
- C#/.NET: [SciSharp/JarvisSharp](https://github.com/SciSharp/JarvisSharp)
|
||||||
- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
|
- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
|
||||||
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
|
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
|
||||||
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
|
- Clojure: [phronmophobic/jarvis.clj](https://github.com/phronmophobic/jarvis.clj)
|
||||||
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
|
- React Native: [mybigday/jarvis.rn](https://github.com/mybigday/jarvis.rn)
|
||||||
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
|
- Java: [kherud/java-jarvis.cpp](https://github.com/kherud/java-jarvis.cpp)
|
||||||
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
|
- Zig: [deins/jarvis.cpp.zig](https://github.com/Deins/jarvis.cpp.zig)
|
||||||
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
|
- Flutter/Dart: [netdur/jarvis_cpp_dart](https://github.com/netdur/jarvis_cpp_dart)
|
||||||
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
|
- PHP (API bindings and features built on top of jarvis.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/jarvis.cpp/pull/6326)
|
||||||
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
|
- Guile Scheme: [guile_jarvis_cpp](https://savannah.nongnu.org/projects/guile-jarvis-cpp)
|
||||||
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
|
- Swift [srgtuszy/jarvis-cpp-swift](https://github.com/srgtuszy/jarvis-cpp-swift)
|
||||||
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
|
- Swift [ShenghaiWang/SwiftJarvis](https://github.com/ShenghaiWang/SwiftJarvis)
|
||||||
|
|
||||||
**UI:**
|
**UI:**
|
||||||
|
|
||||||
Unless otherwise noted these projects are open-source with permissive licensing:
|
Unless otherwise noted these projects are open-source with permissive licensing:
|
||||||
|
|
||||||
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
|
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
|
||||||
- [iohub/collama](https://github.com/iohub/coLLaMA)
|
- [iohub/cojarvis](https://github.com/iohub/coLLaMA)
|
||||||
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
|
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
|
||||||
- [nat/openplayground](https://github.com/nat/openplayground)
|
- [nat/openplayground](https://github.com/nat/openplayground)
|
||||||
- [Faraday](https://faraday.dev/) (proprietary)
|
- [Faraday](https://faraday.dev/) (proprietary)
|
||||||
|
@ -149,9 +149,9 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
||||||
- [ramalama](https://github.com/containers/ramalama) (MIT)
|
- [ramalama](https://github.com/containers/ramalama) (MIT)
|
||||||
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
|
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
|
||||||
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
|
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
|
||||||
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
|
- [Mozilla-Ocho/jarvisfile](https://github.com/Mozilla-Ocho/jarvisfile)
|
||||||
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
|
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
|
||||||
- [ollama/ollama](https://github.com/ollama/ollama)
|
- [ojarvis/ojarvis](https://github.com/ojarvis/ojarvis)
|
||||||
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
|
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
|
||||||
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
|
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
|
||||||
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
|
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
|
||||||
|
@ -173,24 +173,24 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
||||||
- [AIKit](https://github.com/sozercan/aikit) (MIT)
|
- [AIKit](https://github.com/sozercan/aikit) (MIT)
|
||||||
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
|
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
|
||||||
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
|
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
|
||||||
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
|
- [Jarvis Assistant](https://github.com/vietanhdev/jarvis-assistant) (GPL)
|
||||||
- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
|
- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
|
||||||
|
|
||||||
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
*(to have a project listed here, it should clearly state that it depends on `jarvis.cpp`)*
|
||||||
|
|
||||||
**Tools:**
|
**Tools:**
|
||||||
|
|
||||||
- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
|
- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
|
||||||
- [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp
|
- [akx/ojarvis-dl](https://github.com/akx/ojarvis-dl) – download models from the Ojarvis library to be used directly with jarvis.cpp
|
||||||
- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
|
- [crashr/gppm](https://github.com/crashr/gppm) – launch jarvis.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
|
||||||
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
|
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
|
||||||
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)
|
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-jarvis-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)
|
||||||
|
|
||||||
**Infrastructure:**
|
**Infrastructure:**
|
||||||
|
|
||||||
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
|
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for jarvis.cpp
|
||||||
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
|
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
|
||||||
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
|
- [jarvis_cpp_canister](https://github.com/onicai/jarvis_cpp_canister) - jarvis.cpp as a smart contract on the Internet Computer, using WebAssembly
|
||||||
|
|
||||||
**Games:**
|
**Games:**
|
||||||
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
|
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
|
||||||
|
@ -201,8 +201,8 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
||||||
<summary>Typical run using LLaMA v2 13B on M2 Ultra</summary>
|
<summary>Typical run using LLaMA v2 13B on M2 Ultra</summary>
|
||||||
|
|
||||||
```
|
```
|
||||||
$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
|
$ make -j && ./jarvis-cli -m models/jarvis-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
|
||||||
I llama.cpp build info:
|
I jarvis.cpp build info:
|
||||||
I UNAME_S: Darwin
|
I UNAME_S: Darwin
|
||||||
I UNAME_P: arm
|
I UNAME_P: arm
|
||||||
I UNAME_M: arm64
|
I UNAME_M: arm64
|
||||||
|
@ -215,12 +215,12 @@ I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
|
||||||
make: Nothing to be done for `default'.
|
make: Nothing to be done for `default'.
|
||||||
main: build = 1041 (cf658ad)
|
main: build = 1041 (cf658ad)
|
||||||
main: seed = 1692823051
|
main: seed = 1692823051
|
||||||
llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
|
jarvis_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/jarvis-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
|
||||||
llama_model_loader: - type f32: 81 tensors
|
jarvis_model_loader: - type f32: 81 tensors
|
||||||
llama_model_loader: - type q4_0: 281 tensors
|
jarvis_model_loader: - type q4_0: 281 tensors
|
||||||
llama_model_loader: - type q6_K: 1 tensors
|
jarvis_model_loader: - type q6_K: 1 tensors
|
||||||
llm_load_print_meta: format = GGUF V1 (latest)
|
llm_load_print_meta: format = GGUF V1 (latest)
|
||||||
llm_load_print_meta: arch = llama
|
llm_load_print_meta: arch = jarvis
|
||||||
llm_load_print_meta: vocab type = SPM
|
llm_load_print_meta: vocab type = SPM
|
||||||
llm_load_print_meta: n_vocab = 32000
|
llm_load_print_meta: n_vocab = 32000
|
||||||
llm_load_print_meta: n_merges = 0
|
llm_load_print_meta: n_merges = 0
|
||||||
|
@ -248,8 +248,8 @@ llm_load_print_meta: LF token = 13 '<0x0A>'
|
||||||
llm_load_tensors: ggml ctx size = 0.11 MB
|
llm_load_tensors: ggml ctx size = 0.11 MB
|
||||||
llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state)
|
llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state)
|
||||||
...................................................................................................
|
...................................................................................................
|
||||||
llama_new_context_with_model: kv self size = 400.00 MB
|
jarvis_new_context_with_model: kv self size = 400.00 MB
|
||||||
llama_new_context_with_model: compute buffer total size = 75.41 MB
|
jarvis_new_context_with_model: compute buffer total size = 75.41 MB
|
||||||
|
|
||||||
system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
|
system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
|
||||||
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
|
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
|
||||||
|
@ -271,11 +271,11 @@ How does a Website Work?
|
||||||
A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
|
A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
|
||||||
The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
|
The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
|
||||||
How to
|
How to
|
||||||
llama_print_timings: load time = 576.45 ms
|
jarvis_print_timings: load time = 576.45 ms
|
||||||
llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second)
|
jarvis_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second)
|
||||||
llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second)
|
jarvis_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second)
|
||||||
llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second)
|
jarvis_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second)
|
||||||
llama_print_timings: total time = 25431.49 ms
|
jarvis_print_timings: total time = 25431.49 ms
|
||||||
```
|
```
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
@ -297,14 +297,14 @@ Here are the end-to-end binary build and model conversion steps for most support
|
||||||
|
|
||||||
Firstly, you need to get the binary. There are different methods that you can follow:
|
Firstly, you need to get the binary. There are different methods that you can follow:
|
||||||
- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md)
|
- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md)
|
||||||
- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md)
|
- Method 2: If you are using MacOS or Linux, you can install jarvis.cpp via [brew, flox or nix](./docs/install.md)
|
||||||
- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md)
|
- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md)
|
||||||
- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases)
|
- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/jarvis.cpp/releases)
|
||||||
|
|
||||||
You can run a basic completion using this command:
|
You can run a basic completion using this command:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
|
jarvis-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
|
||||||
|
|
||||||
# Output:
|
# Output:
|
||||||
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
|
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
|
||||||
|
@ -317,7 +317,7 @@ See [this page](./examples/main/README.md) for a full list of parameters.
|
||||||
If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter:
|
If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
|
jarvis-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
|
||||||
|
|
||||||
# Output:
|
# Output:
|
||||||
# > hi, who are you?
|
# > hi, who are you?
|
||||||
|
@ -327,26 +327,26 @@ llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
|
||||||
# Easy peasy! The answer to 1+1 is... 2!
|
# Easy peasy! The answer to 1+1 is... 2!
|
||||||
```
|
```
|
||||||
|
|
||||||
By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
|
By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/jarvis.cpp/wiki/Templates-supported-by-jarvis_chat_apply_template)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
|
./jarvis-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
|
||||||
```
|
```
|
||||||
|
|
||||||
You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
|
You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
|
./jarvis-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
|
||||||
```
|
```
|
||||||
|
|
||||||
### Web server
|
### Web server
|
||||||
|
|
||||||
[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
|
[jarvis.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
|
||||||
|
|
||||||
Example usage:
|
Example usage:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./llama-server -m your_model.gguf --port 8080
|
./jarvis-server -m your_model.gguf --port 8080
|
||||||
|
|
||||||
# Basic web UI can be accessed via browser: http://localhost:8080
|
# Basic web UI can be accessed via browser: http://localhost:8080
|
||||||
# Chat completion endpoint: http://localhost:8080/v1/chat/completions
|
# Chat completion endpoint: http://localhost:8080/v1/chat/completions
|
||||||
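# As a hedged sketch (not part of the original README), the OpenAI-compatible
# chat endpoint above can be exercised with curl; the prompt content is arbitrary:
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{"messages": [{"role": "user", "content": "Hello, who are you?"}]}'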
|
@ -369,16 +369,16 @@ Here is an example of a few-shot interaction, invoked with the command
|
||||||
./examples/chat-13B.sh
|
./examples/chat-13B.sh
|
||||||
|
|
||||||
# custom arguments using a 13B model
|
# custom arguments using a 13B model
|
||||||
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
|
./jarvis-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
|
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `jarvis-cli` example program.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
### Persistent Interaction
|
### Persistent Interaction
|
||||||
|
|
||||||
The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
|
The prompt, user inputs, and model generations can be saved and resumed across calls to `./jarvis-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
|
||||||
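Before the `chat-persistent.sh` flow shown below, here is a minimal sketch of the two flags on their own; the cache file name and prompt file are illustrative and not taken from the script:

```bash
# Evaluate the prompt once and store the KV state (plus the generated tokens,
# via --prompt-cache-all) so a later run with the same cache file can skip re-evaluating it.
./jarvis-cli -m your_model.gguf --prompt-cache chat.cache.bin --prompt-cache-all \
    -f prompts/chat-with-bob.txt -n 64
```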
|
|
||||||
```bash
|
```bash
|
||||||
# Start a new chat
|
# Start a new chat
|
||||||
|
@ -397,10 +397,10 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
|
||||||
|
|
||||||
### Constrained output with grammars
|
### Constrained output with grammars
|
||||||
|
|
||||||
`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
|
`jarvis.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
|
./jarvis-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
|
||||||
```
|
```
|
||||||
|
|
||||||
The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
|
The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
|
||||||
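As a quick illustration of writing your own rule, a small grammar can also be passed inline with `--grammar`; the rule below is a made-up example rather than one of the shipped grammars:

```bash
# Constrain the model to answer only "yes" or "no" with an inline GBNF rule
./jarvis-cli -m ./models/13B/ggml-model-q4_0.gguf -n 8 \
    --grammar 'root ::= "yes" | "no"' -p 'Is the sky blue? Answer:'
```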
|
@ -409,7 +409,7 @@ For authoring more complex JSON grammars, you can also check out https://grammar
|
||||||
|
|
||||||
## Build
|
## Build
|
||||||
|
|
||||||
Please refer to [Build llama.cpp locally](./docs/build.md)
|
Please refer to [Build jarvis.cpp locally](./docs/build.md)
|
||||||
|
|
||||||
## Supported backends
|
## Supported backends
|
||||||
|
|
||||||
|
@ -430,11 +430,11 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
|
||||||
### Prepare and Quantize
|
### Prepare and Quantize
|
||||||
|
|
||||||
> [!NOTE]
|
> [!NOTE]
|
||||||
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
|
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `jarvis.cpp` main every 6 hours.
|
||||||
|
|
||||||
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
|
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-jarvis-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
|
||||||
|
|
||||||
Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
|
Note: `convert.py` has been moved to `examples/convert_legacy_jarvis.py` and shouldn't be used for anything other than `Jarvis/Jarvis2/Mistral` models and their derivatives.
|
||||||
It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.
|
It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.
|
||||||
|
|
||||||
To learn more about quantizing model, [read this documentation](./examples/quantize/README.md)
|
To learn more about quantizing model, [read this documentation](./examples/quantize/README.md)
|
||||||
|
@ -444,17 +444,17 @@ To learn more about quantizing model, [read this documentation](./examples/quant
|
||||||
You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
|
You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
|
||||||
For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
|
For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
|
||||||
|
|
||||||
To learn more about how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md)
|
To learn more about how to measure perplexity using jarvis.cpp, [read this documentation](./examples/perplexity/README.md)
|
||||||
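For example, a perplexity run over a raw text file looks roughly like this; the model and dataset paths are placeholders:

```bash
# Compute perplexity over a raw text file (lower is better)
./jarvis-perplexity -m models/model-q4_0.gguf -f wikitext-2-raw/wiki.test.raw
```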
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
- Contributors can open PRs
|
- Contributors can open PRs
|
||||||
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
|
- Collaborators can push to branches in the `jarvis.cpp` repo and merge PRs into the `master` branch
|
||||||
- Collaborators will be invited based on contributions
|
- Collaborators will be invited based on contributions
|
||||||
- Any help with managing issues, PRs and projects is very appreciated!
|
- Any help with managing issues, PRs and projects is very appreciated!
|
||||||
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
|
- See [good first issues](https://github.com/ggerganov/jarvis.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
|
||||||
- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
|
- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
|
||||||
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
|
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/jarvis.cpp/discussions/205)
|
||||||
- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
|
- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
|
||||||
|
|
||||||
## Other documentations
|
## Other documentations
|
||||||
|
@ -470,13 +470,13 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
|
||||||
- [Running on Docker](./docs/docker.md)
|
- [Running on Docker](./docs/docker.md)
|
||||||
- [Build on Android](./docs/android.md)
|
- [Build on Android](./docs/android.md)
|
||||||
- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
|
- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
|
||||||
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
|
- [GGML tips & tricks](https://github.com/ggerganov/jarvis.cpp/wiki/GGML-Tips-&-Tricks)
|
||||||
|
|
||||||
**Seminal papers and background on the models**
|
**Seminal papers and background on the models**
|
||||||
|
|
||||||
If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
|
If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
|
||||||
- LLaMA:
|
- LLaMA:
|
||||||
- [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/)
|
- [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-jarvis-meta-ai/)
|
||||||
- [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
|
- [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
|
||||||
- GPT-3
|
- GPT-3
|
||||||
- [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
|
- [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# Security Policy
|
# Security Policy
|
||||||
|
|
||||||
- [**Using llama.cpp securely**](#using-llamacpp-securely)
|
- [**Using jarvis.cpp securely**](#using-jarviscpp-securely)
|
||||||
- [Untrusted models](#untrusted-models)
|
- [Untrusted models](#untrusted-models)
|
||||||
- [Untrusted inputs](#untrusted-inputs)
|
- [Untrusted inputs](#untrusted-inputs)
|
||||||
- [Data privacy](#data-privacy)
|
- [Data privacy](#data-privacy)
|
||||||
|
@ -8,7 +8,7 @@
|
||||||
- [Multi-Tenant environments](#multi-tenant-environments)
|
- [Multi-Tenant environments](#multi-tenant-environments)
|
||||||
- [**Reporting a vulnerability**](#reporting-a-vulnerability)
|
- [**Reporting a vulnerability**](#reporting-a-vulnerability)
|
||||||
|
|
||||||
## Using llama.cpp securely
|
## Using jarvis.cpp securely
|
||||||
|
|
||||||
### Untrusted models
|
### Untrusted models
|
||||||
Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources.
|
Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources.
|
||||||
|
@ -57,11 +57,11 @@ If you intend to run multiple models in parallel with shared memory, it is your
|
||||||
|
|
||||||
## Reporting a vulnerability
|
## Reporting a vulnerability
|
||||||
|
|
||||||
Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
|
Beware that none of the topics under [Using jarvis.cpp securely](#using-jarviscpp-securely) are considered vulnerabilities of LLaMA C++.
|
||||||
|
|
||||||
<!-- normal version -->
|
<!-- normal version -->
|
||||||
However, if you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
|
However, if you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
|
||||||
|
|
||||||
Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).
|
Please disclose it as a private [security advisory](https://github.com/ggerganov/jarvis.cpp/security/advisories/new).
|
||||||
|
|
||||||
This project is maintained by a team of volunteers on a reasonable-effort basis. As such, please give us at least 90 days to work on a fix before public exposure.
|
This project is maintained by a team of volunteers on a reasonable-effort basis. As such, please give us at least 90 days to work on a fix before public exposure.
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
# CI
|
# CI
|
||||||
|
|
||||||
In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
|
In addition to [Github Actions](https://github.com/ggerganov/jarvis.cpp/actions) `jarvis.cpp` uses a custom CI framework:
|
||||||
|
|
||||||
https://github.com/ggml-org/ci
|
https://github.com/ggml-org/ci
|
||||||
|
|
||||||
It monitors the `master` branch for new commits and runs the
|
It monitors the `master` branch for new commits and runs the
|
||||||
[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
|
[ci/run.sh](https://github.com/ggerganov/jarvis.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
|
||||||
to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
|
to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
|
||||||
to cover various hardware architectures, including GPU and Apple Silicon instances.
|
to cover various hardware architectures, including GPU and Apple Silicon instances.
|
||||||
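For reference, running the script locally is roughly the sketch below; the two arguments are the results directory and the models mount directory, and the optional `GG_BUILD_METAL` toggle is the one checked inside the script (directory names are arbitrary):

```bash
# Run the CI suite locally; results go to ./tmp/results, models are cached under ./tmp/mnt
mkdir -p tmp/results tmp/mnt
bash ./ci/run.sh ./tmp/results ./tmp/mnt

# Same run with the Metal backend enabled
GG_BUILD_METAL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```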
|
|
||||||
|
|
268
ci/run.sh
|
@ -36,7 +36,7 @@ sd=`dirname $0`
|
||||||
cd $sd/../
|
cd $sd/../
|
||||||
SRC=`pwd`
|
SRC=`pwd`
|
||||||
|
|
||||||
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
|
CMAKE_EXTRA="-DJARVIS_FATAL_WARNINGS=ON"
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
||||||
|
@ -217,7 +217,7 @@ function gg_sum_test_scripts_release {
|
||||||
function gg_get_model {
|
function gg_get_model {
|
||||||
local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
|
local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
|
||||||
local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
|
local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
|
||||||
local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
local gguf_2="$MNT/models/open-jarvis/7B-v2/ggml-model-f16.gguf"
|
||||||
if [[ -s $gguf_0 ]]; then
|
if [[ -s $gguf_0 ]]; then
|
||||||
echo -n "$gguf_0"
|
echo -n "$gguf_0"
|
||||||
elif [[ -s $gguf_1 ]]; then
|
elif [[ -s $gguf_1 ]]; then
|
||||||
|
@ -236,7 +236,7 @@ function gg_run_ctest_with_model_debug {
|
||||||
local model; model=$(gg_get_model)
|
local model; model=$(gg_get_model)
|
||||||
cd build-ci-debug
|
cd build-ci-debug
|
||||||
set -e
|
set -e
|
||||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
(JARVISCPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||||
set +e
|
set +e
|
||||||
cd ..
|
cd ..
|
||||||
}
|
}
|
||||||
|
@ -247,7 +247,7 @@ function gg_run_ctest_with_model_release {
|
||||||
local model; model=$(gg_get_model)
|
local model; model=$(gg_get_model)
|
||||||
cd build-ci-release
|
cd build-ci-release
|
||||||
set -e
|
set -e
|
||||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
(JARVISCPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||||
set +e
|
set +e
|
||||||
cd ..
|
cd ..
|
||||||
}
|
}
|
||||||
|
@ -272,24 +272,24 @@ function gg_sum_ctest_with_model_release {
|
||||||
gg_printf '```\n'
|
gg_printf '```\n'
|
||||||
}
|
}
|
||||||
|
|
||||||
# open_llama_7b_v2
|
# open_jarvis_7b_v2
|
||||||
|
|
||||||
function gg_run_open_llama_7b_v2 {
|
function gg_run_open_jarvis_7b_v2 {
|
||||||
cd ${SRC}
|
cd ${SRC}
|
||||||
|
|
||||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
|
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/config.json
|
||||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
|
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/resolve/main/tokenizer.model
|
||||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
|
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/tokenizer_config.json
|
||||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
|
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/special_tokens_map.json
|
||||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
|
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/pytorch_model.bin.index.json
|
||||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
|
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
|
||||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
|
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
|
||||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
|
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/generation_config.json
|
||||||
|
|
||||||
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||||
|
|
||||||
path_models="../models-mnt/open-llama/7B-v2"
|
path_models="../models-mnt/open-jarvis/7B-v2"
|
||||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||||
|
|
||||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||||
|
@ -299,7 +299,7 @@ function gg_run_open_llama_7b_v2 {
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
python3 ../examples/convert_legacy_jarvis.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
|
@ -315,47 +315,47 @@ function gg_run_open_llama_7b_v2 {
|
||||||
|
|
||||||
wiki_test="${path_wiki}/wiki.test.raw"
|
wiki_test="${path_wiki}/wiki.test.raw"
|
||||||
|
|
||||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
./bin/jarvis-quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
./bin/jarvis-quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
./bin/jarvis-quantize ${model_f16} ${model_q5_0} q5_0
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
./bin/jarvis-quantize ${model_f16} ${model_q5_1} q5_1
|
||||||
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
./bin/jarvis-quantize ${model_f16} ${model_q2_k} q2_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
./bin/jarvis-quantize ${model_f16} ${model_q3_k} q3_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
./bin/jarvis-quantize ${model_f16} ${model_q4_k} q4_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
./bin/jarvis-quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
./bin/jarvis-quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/jarvis-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/jarvis-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/jarvis-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/jarvis-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/jarvis-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/jarvis-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/jarvis-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/jarvis-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/jarvis-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/jarvis-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/jarvis-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/jarvis-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/jarvis-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/jarvis-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/jarvis-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/jarvis-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/jarvis-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/jarvis-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/jarvis-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/jarvis-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/jarvis-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/jarvis-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
(time ./bin/jarvis-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/jarvis-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/jarvis-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/jarvis-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/jarvis-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
@@ -387,7 +387,7 @@ function gg_run_open_llama_7b_v2 {
set +e
}

function gg_sum_open_llama_7b_v2 {
function gg_sum_open_jarvis_7b_v2 {
gg_printf '### %s\n\n' "${ci}"

gg_printf 'OpenLLaMA 7B-v2:\n'
@@ -449,45 +449,45 @@ function gg_run_pythia_1_4b {
|
||||||
|
|
||||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||||
|
|
||||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
./bin/jarvis-quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
./bin/jarvis-quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
./bin/jarvis-quantize ${model_f16} ${model_q5_0} q5_0
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
./bin/jarvis-quantize ${model_f16} ${model_q5_1} q5_1
|
||||||
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
./bin/jarvis-quantize ${model_f16} ${model_q2_k} q2_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
./bin/jarvis-quantize ${model_f16} ${model_q3_k} q3_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
./bin/jarvis-quantize ${model_f16} ${model_q4_k} q4_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
./bin/jarvis-quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
./bin/jarvis-quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
(time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/jarvis-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/jarvis-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/jarvis-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/jarvis-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/jarvis-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/jarvis-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/jarvis-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/jarvis-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/jarvis-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/jarvis-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/jarvis-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/jarvis-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/jarvis-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/jarvis-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/jarvis-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/jarvis-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/jarvis-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/jarvis-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/jarvis-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/jarvis-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/jarvis-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/jarvis-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
(time ./bin/jarvis-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/jarvis-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/jarvis-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
@@ -580,47 +580,47 @@ function gg_run_pythia_2_8b {
|
||||||
|
|
||||||
wiki_test="${path_wiki}/wiki.test.raw"
|
wiki_test="${path_wiki}/wiki.test.raw"
|
||||||
|
|
||||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
./bin/jarvis-quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
./bin/jarvis-quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
./bin/jarvis-quantize ${model_f16} ${model_q5_0} q5_0
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
./bin/jarvis-quantize ${model_f16} ${model_q5_1} q5_1
|
||||||
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
./bin/jarvis-quantize ${model_f16} ${model_q2_k} q2_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
./bin/jarvis-quantize ${model_f16} ${model_q3_k} q3_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
./bin/jarvis-quantize ${model_f16} ${model_q4_k} q4_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
./bin/jarvis-quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
./bin/jarvis-quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/jarvis-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/jarvis-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/jarvis-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/jarvis-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/jarvis-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/jarvis-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/jarvis-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/jarvis-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/jarvis-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/jarvis-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/jarvis-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/jarvis-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/jarvis-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/jarvis-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/jarvis-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/jarvis-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/jarvis-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/jarvis-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/jarvis-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/jarvis-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/jarvis-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/jarvis-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
(time ./bin/jarvis-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/jarvis-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/jarvis-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/jarvis-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/jarvis-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
@@ -704,10 +704,10 @@ function gg_run_embd_bge_small {
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"

./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0

(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/jarvis-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/jarvis-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

set +e
}

@@ -752,7 +752,7 @@ function gg_run_rerank_tiny {
model_f16="${path_models}/ggml-model-f16.gguf"

# for this model, the SEP token is "</s>"
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
(time ./bin/jarvis-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

# sample output
# rerank score 0: 0.029

@@ -804,11 +804,11 @@ function gg_check_build_requirements {

## main

export LLAMA_LOG_PREFIX=1
export JARVIS_LOG_PREFIX=1
export LLAMA_LOG_TIMESTAMPS=1
export JARVIS_LOG_TIMESTAMPS=1

if [ -z ${GG_BUILD_LOW_PERF} ]; then
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
# Create symlink: ./jarvis.cpp/models-mnt -> $MNT/models/models-mnt
rm -rf ${SRC}/models-mnt
mnt_models=${MNT}/models
mkdir -p ${mnt_models}

@@ -841,7 +841,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run pythia_1_4b
else
test $ret -eq 0 && gg_run pythia_2_8b
#test $ret -eq 0 && gg_run open_llama_7b_v2
#test $ret -eq 0 && gg_run open_jarvis_7b_v2
fi
test $ret -eq 0 && gg_run ctest_with_model_debug
test $ret -eq 0 && gg_run ctest_with_model_release
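The renamed CI steps above all live in ci/run.sh. A minimal local invocation sketch, not shown in this commit, assuming the script keeps its usual two positional arguments (an output directory for the *.log files written via $OUT, and a mount directory that backs $MNT and models-mnt); GG_BUILD_LOW_PERF is the variable the script itself checks before the heavy model runs:

# hypothetical local run of the renamed CI script
mkdir -p tmp/results tmp/mnt
bash ./ci/run.sh ./tmp/results ./tmp/mnt
# set GG_BUILD_LOW_PERF=1 in the environment to skip the model download/run section guarded above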
@@ -1,7 +1,7 @@
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
set(JARVIS_VERSION @JARVIS_INSTALL_VERSION@)
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(JARVIS_BUILD_COMMIT @JARVIS_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(JARVIS_BUILD_NUMBER @JARVIS_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(JARVIS_SHARED_LIB @BUILD_SHARED_LIBS@)

set(GGML_BLAS @GGML_BLAS@)
set(GGML_CUDA @GGML_CUDA@)

@@ -18,9 +18,9 @@ set(GGML_OPENMP @GGML_OPENMP@)

@PACKAGE_INIT@

set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(JARVIS_INCLUDE_DIR "@PACKAGE_JARVIS_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(JARVIS_LIB_DIR "@PACKAGE_JARVIS_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
set_and_check(JARVIS_BIN_DIR "@PACKAGE_JARVIS_BIN_INSTALL_DIR@")

# Ensure transient dependencies satisfied

@@ -66,25 +66,25 @@ endif()

find_library(ggml_LIBRARY ggml
REQUIRED
HINTS ${LLAMA_LIB_DIR})
HINTS ${JARVIS_LIB_DIR})

find_library(llama_LIBRARY llama
find_library(jarvis_LIBRARY jarvis
REQUIRED
HINTS ${LLAMA_LIB_DIR})
HINTS ${JARVIS_LIB_DIR})

set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
set(_jarvis_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
set(_jarvis_transient_defines "@GGML_TRANSIENT_DEFINES@")

add_library(llama UNKNOWN IMPORTED)
add_library(jarvis UNKNOWN IMPORTED)

set_target_properties(llama
set_target_properties(jarvis
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
INTERFACE_INCLUDE_DIRECTORIES "${JARVIS_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
INTERFACE_LINK_LIBRARIES "${_jarvis_link_deps}"
INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
INTERFACE_COMPILE_DEFINITIONS "${_jarvis_transient_defines}"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${llama_LIBRARY}"
IMPORTED_LOCATION "${jarvis_LIBRARY}"
INTERFACE_COMPILE_FEATURES cxx_std_11
POSITION_INDEPENDENT_CODE ON )

check_required_components(Llama)
check_required_components(Jarvis)
@@ -3,8 +3,8 @@ exec_prefix=${prefix}
libdir=${exec_prefix}/lib
includedir=${prefix}/include

Name: llama
Name: jarvis
Description: Port of Facebook's LLaMA model in C/C++
Version: @PROJECT_VERSION@
Libs: -L${libdir} -lllama
Libs: -L${libdir} -ljarvis
Cflags: -I${includedir}
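With the renamed pkg-config entry above, downstream code links against -ljarvis instead of -lllama. A minimal consumer sketch, assuming the file is installed under a jarvis.pc name visible on PKG_CONFIG_PATH (the installed filename is not shown in this diff):

# hypothetical downstream build against the renamed library
cc -o demo demo.c $(pkg-config --cflags --libs jarvis)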
@@ -74,17 +74,17 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()

set(LLAMA_COMMON_EXTRA_LIBS build_info)
set(JARVIS_COMMON_EXTRA_LIBS build_info)

# Use curl to download model url
if (LLAMA_CURL)
if (JARVIS_CURL)
find_package(CURL REQUIRED)
add_definitions(-DLLAMA_USE_CURL)
add_definitions(-DJARVIS_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
find_library(CURL_LIBRARY curl REQUIRED)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
set(JARVIS_COMMON_EXTRA_LIBS ${JARVIS_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
endif ()

target_include_directories(${TARGET} PUBLIC .)
target_compile_features (${TARGET} PUBLIC cxx_std_11)
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
target_link_libraries (${TARGET} PRIVATE ${JARVIS_COMMON_EXTRA_LIBS} PUBLIC jarvis Threads::Threads)
common/arg.cpp (382 changes): file diff suppressed because it is too large
common/arg.h (12 changes)
@@ -11,7 +11,7 @@
//

struct common_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::set<enum jarvis_example> examples = {JARVIS_EXAMPLE_COMMON};
std::vector<const char *> args;
const char * value_hint = nullptr; // help text or example for arg value
const char * value_hint_2 = nullptr; // for second arg value

@@ -52,17 +52,17 @@ struct common_arg {
void (*handler)(common_params & params, const std::string &, const std::string &)
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}

common_arg & set_examples(std::initializer_list<enum llama_example> examples);
common_arg & set_examples(std::initializer_list<enum jarvis_example> examples);
common_arg & set_env(const char * env);
common_arg & set_sparam();
bool in_example(enum llama_example ex);
bool in_example(enum jarvis_example ex);
bool get_value_from_env(std::string & output);
bool has_value_from_env();
std::string to_string();
};

struct common_params_context {
enum llama_example ex = LLAMA_EXAMPLE_COMMON;
enum jarvis_example ex = JARVIS_EXAMPLE_COMMON;
common_params & params;
std::vector<common_arg> options;
void(*print_usage)(int, char **) = nullptr;

@@ -71,7 +71,7 @@ struct common_params_context {

// parse input arguments from CLI
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
bool common_params_parse(int argc, char ** argv, common_params & params, jarvis_example ex, void(*print_usage)(int, char **) = nullptr);

// function to be used by test-arg-parser
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
common_params_context common_params_parser_init(common_params & params, jarvis_example ex, void(*print_usage)(int, char **) = nullptr);
@@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
int JARVIS_BUILD_NUMBER = @BUILD_NUMBER@;
char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
char const *JARVIS_COMMIT = "@BUILD_COMMIT@";
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
char const *JARVIS_COMPILER = "@BUILD_COMPILER@";
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
char const *JARVIS_BUILD_TARGET = "@BUILD_TARGET@";
@@ -8,7 +8,7 @@
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
#include "json-schema-to-grammar.h"
#include "llama.h"
#include "jarvis.h"

#include <algorithm>
#include <cinttypes>

@@ -48,7 +48,7 @@
#include <sys/stat.h>
#include <unistd.h>
#endif
#if defined(LLAMA_USE_CURL)
#if defined(JARVIS_USE_CURL)
#include <curl/curl.h>
#include <curl/easy.h>
#include <future>

@@ -58,7 +58,7 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#if defined(LLAMA_USE_CURL)
#if defined(JARVIS_USE_CURL)
#ifdef __linux__
#include <linux/limits.h>
#elif defined(_WIN32)

@@ -66,8 +66,8 @@
#else
#include <sys/syslimits.h>
#endif
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
#define JARVIS_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
#endif // LLAMA_USE_CURL
#endif // JARVIS_USE_CURL

using json = nlohmann::ordered_json;
@@ -364,8 +364,8 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
|
||||||
}
|
}
|
||||||
|
|
||||||
void common_init() {
|
void common_init() {
|
||||||
llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
|
jarvis_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
|
||||||
if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
|
if (LOG_DEFAULT_JARVIS <= common_log_verbosity_thold) {
|
||||||
common_log_add(common_log_main(), level, "%s", text);
|
common_log_add(common_log_main(), level, "%s", text);
|
||||||
}
|
}
|
||||||
}, NULL);
|
}, NULL);
|
||||||
|
@@ -376,7 +376,7 @@ void common_init() {
|
||||||
const char * build_type = " (debug)";
|
const char * build_type = " (debug)";
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
|
LOG_INF("build: %d (%s) with %s for %s%s\n", JARVIS_BUILD_NUMBER, JARVIS_COMMIT, JARVIS_COMPILER, JARVIS_BUILD_TARGET, build_type);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string common_params_get_system_info(const common_params & params) {
|
std::string common_params_get_system_info(const common_params & params) {
|
||||||
|
@@ -389,9 +389,9 @@ std::string common_params_get_system_info(const common_params & params) {
|
||||||
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
|
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
|
||||||
// TODO: windows + arm64 + mingw64
|
// TODO: windows + arm64 + mingw64
|
||||||
DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
|
DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
|
||||||
os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
|
os << " / " << logicalProcessorCount << " | " << jarvis_print_system_info();
|
||||||
#else
|
#else
|
||||||
os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
|
os << " / " << std::thread::hardware_concurrency() << " | " << jarvis_print_system_info();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return os.str();
|
return os.str();
|
||||||
|
@@ -483,7 +483,7 @@ std::string string_from(const std::vector<int> & values) {
|
||||||
return buf.str();
|
return buf.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
|
std::string string_from(const struct jarvis_context * ctx, const std::vector<jarvis_token> & tokens) {
|
||||||
std::stringstream buf;
|
std::stringstream buf;
|
||||||
|
|
||||||
buf << "[ ";
|
buf << "[ ";
|
||||||
|
@@ -514,7 +514,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
|
||||||
return buf.str();
|
return buf.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
|
std::string string_from(const struct jarvis_context * ctx, const struct jarvis_batch & batch) {
|
||||||
std::stringstream buf;
|
std::stringstream buf;
|
||||||
|
|
||||||
buf << "[ ";
|
buf << "[ ";
|
||||||
|
@@ -586,27 +586,27 @@ void string_process_escapes(std::string & input) {
|
||||||
input.resize(output_idx);
|
input.resize(output_idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
|
bool string_parse_kv_override(const char * data, std::vector<jarvis_model_kv_override> & overrides) {
|
||||||
const char * sep = strchr(data, '=');
|
const char * sep = strchr(data, '=');
|
||||||
if (sep == nullptr || sep - data >= 128) {
|
if (sep == nullptr || sep - data >= 128) {
|
||||||
LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
|
LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
llama_model_kv_override kvo;
|
jarvis_model_kv_override kvo;
|
||||||
std::strncpy(kvo.key, data, sep - data);
|
std::strncpy(kvo.key, data, sep - data);
|
||||||
kvo.key[sep - data] = 0;
|
kvo.key[sep - data] = 0;
|
||||||
sep++;
|
sep++;
|
||||||
if (strncmp(sep, "int:", 4) == 0) {
|
if (strncmp(sep, "int:", 4) == 0) {
|
||||||
sep += 4;
|
sep += 4;
|
||||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
kvo.tag = JARVIS_KV_OVERRIDE_TYPE_INT;
|
||||||
kvo.val_i64 = std::atol(sep);
|
kvo.val_i64 = std::atol(sep);
|
||||||
} else if (strncmp(sep, "float:", 6) == 0) {
|
} else if (strncmp(sep, "float:", 6) == 0) {
|
||||||
sep += 6;
|
sep += 6;
|
||||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
|
kvo.tag = JARVIS_KV_OVERRIDE_TYPE_FLOAT;
|
||||||
kvo.val_f64 = std::atof(sep);
|
kvo.val_f64 = std::atof(sep);
|
||||||
} else if (strncmp(sep, "bool:", 5) == 0) {
|
} else if (strncmp(sep, "bool:", 5) == 0) {
|
||||||
sep += 5;
|
sep += 5;
|
||||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
|
kvo.tag = JARVIS_KV_OVERRIDE_TYPE_BOOL;
|
||||||
if (std::strcmp(sep, "true") == 0) {
|
if (std::strcmp(sep, "true") == 0) {
|
||||||
kvo.val_bool = true;
|
kvo.val_bool = true;
|
||||||
} else if (std::strcmp(sep, "false") == 0) {
|
} else if (std::strcmp(sep, "false") == 0) {
|
||||||
|
@@ -617,7 +617,7 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
|
||||||
}
|
}
|
||||||
} else if (strncmp(sep, "str:", 4) == 0) {
|
} else if (strncmp(sep, "str:", 4) == 0) {
|
||||||
sep += 4;
|
sep += 4;
|
||||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
kvo.tag = JARVIS_KV_OVERRIDE_TYPE_STR;
|
||||||
if (strlen(sep) > 127) {
|
if (strlen(sep) > 127) {
|
||||||
LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
|
LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
|
||||||
return false;
|
return false;
|
||||||
|
@@ -788,8 +788,8 @@ std::string fs_get_cache_directory() {
|
||||||
}
|
}
|
||||||
return p;
|
return p;
|
||||||
};
|
};
|
||||||
if (getenv("LLAMA_CACHE")) {
|
if (getenv("JARVIS_CACHE")) {
|
||||||
cache_directory = std::getenv("LLAMA_CACHE");
|
cache_directory = std::getenv("JARVIS_CACHE");
|
||||||
} else {
|
} else {
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
if (std::getenv("XDG_CACHE_HOME")) {
|
if (std::getenv("XDG_CACHE_HOME")) {
|
||||||
|
@@ -803,7 +803,7 @@ std::string fs_get_cache_directory() {
|
||||||
cache_directory = std::getenv("LOCALAPPDATA");
|
cache_directory = std::getenv("LOCALAPPDATA");
|
||||||
#endif // __linux__
|
#endif // __linux__
|
||||||
cache_directory = ensure_trailing_slash(cache_directory);
|
cache_directory = ensure_trailing_slash(cache_directory);
|
||||||
cache_directory += "llama.cpp";
|
cache_directory += "jarvis.cpp";
|
||||||
}
|
}
|
||||||
return ensure_trailing_slash(cache_directory);
|
return ensure_trailing_slash(cache_directory);
|
||||||
}
|
}
|
||||||
|
@@ -824,16 +824,16 @@ std::string fs_get_cache_file(const std::string & filename) {
|
||||||
//
|
//
|
||||||
struct common_init_result common_init_from_params(common_params & params) {
|
struct common_init_result common_init_from_params(common_params & params) {
|
||||||
common_init_result iparams;
|
common_init_result iparams;
|
||||||
auto mparams = common_model_params_to_llama(params);
|
auto mparams = common_model_params_to_jarvis(params);
|
||||||
|
|
||||||
llama_model * model = nullptr;
|
jarvis_model * model = nullptr;
|
||||||
|
|
||||||
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
|
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
|
||||||
model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
||||||
} else if (!params.model_url.empty()) {
|
} else if (!params.model_url.empty()) {
|
||||||
model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
||||||
} else {
|
} else {
|
||||||
model = llama_load_model_from_file(params.model.c_str(), mparams);
|
model = jarvis_load_model_from_file(params.model.c_str(), mparams);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
|
@@ -844,58 +844,58 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||||
if (params.reranking) {
|
if (params.reranking) {
|
||||||
bool ok = true;
|
bool ok = true;
|
||||||
|
|
||||||
if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
|
if (jarvis_token_bos(model) == JARVIS_TOKEN_NULL) {
|
||||||
LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
|
LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
|
||||||
ok = false;
|
ok = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
if (jarvis_token_eos(model) == JARVIS_TOKEN_NULL) {
|
||||||
LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
|
LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
|
||||||
ok = false;
|
ok = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
|
if (jarvis_token_sep(model) == JARVIS_TOKEN_NULL) {
|
||||||
LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
|
LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
|
||||||
ok = false;
|
ok = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
llama_free_model(model);
|
jarvis_free_model(model);
|
||||||
|
|
||||||
return iparams;
|
return iparams;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto cparams = common_context_params_to_llama(params);
|
auto cparams = common_context_params_to_jarvis(params);
|
||||||
|
|
||||||
llama_context * lctx = llama_new_context_with_model(model, cparams);
|
jarvis_context * lctx = jarvis_new_context_with_model(model, cparams);
|
||||||
if (lctx == NULL) {
|
if (lctx == NULL) {
|
||||||
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
||||||
llama_free_model(model);
|
jarvis_free_model(model);
|
||||||
return iparams;
|
return iparams;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.control_vectors.empty()) {
|
if (!params.control_vectors.empty()) {
|
||||||
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
|
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
|
||||||
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
|
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = jarvis_n_layer(model);
|
||||||
|
|
||||||
const auto cvec = common_control_vector_load(params.control_vectors);
|
const auto cvec = common_control_vector_load(params.control_vectors);
|
||||||
if (cvec.n_embd == -1) {
|
if (cvec.n_embd == -1) {
|
||||||
llama_free(lctx);
|
jarvis_free(lctx);
|
||||||
llama_free_model(model);
|
jarvis_free_model(model);
|
||||||
|
|
||||||
return iparams;
|
return iparams;
|
||||||
}
|
}
|
||||||
|
|
||||||
int err = llama_control_vector_apply(lctx,
|
int err = jarvis_control_vector_apply(lctx,
|
||||||
cvec.data.data(),
|
cvec.data.data(),
|
||||||
cvec.data.size(),
|
cvec.data.size(),
|
||||||
cvec.n_embd,
|
cvec.n_embd,
|
||||||
params.control_vector_layer_start,
|
params.control_vector_layer_start,
|
||||||
params.control_vector_layer_end);
|
params.control_vector_layer_end);
|
||||||
if (err) {
|
if (err) {
|
||||||
llama_free(lctx);
|
jarvis_free(lctx);
|
||||||
llama_free_model(model);
|
jarvis_free_model(model);
|
||||||
|
|
||||||
return iparams;
|
return iparams;
|
||||||
}
|
}
|
||||||
|
@@ -906,11 +906,11 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||||
common_lora_adapter_container loaded_la;
|
common_lora_adapter_container loaded_la;
|
||||||
loaded_la.path = la.path;
|
loaded_la.path = la.path;
|
||||||
loaded_la.scale = la.scale;
|
loaded_la.scale = la.scale;
|
||||||
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
|
loaded_la.adapter = jarvis_lora_adapter_init(model, la.path.c_str());
|
||||||
if (loaded_la.adapter == nullptr) {
|
if (loaded_la.adapter == nullptr) {
|
||||||
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
||||||
llama_free(lctx);
|
jarvis_free(lctx);
|
||||||
llama_free_model(model);
|
jarvis_free_model(model);
|
||||||
return iparams;
|
return iparams;
|
||||||
}
|
}
|
||||||
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
|
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
|
||||||
|
@@ -919,7 +919,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||||
common_lora_adapters_apply(lctx, iparams.lora_adapters);
|
common_lora_adapters_apply(lctx, iparams.lora_adapters);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
if (params.sparams.ignore_eos && jarvis_token_eos(model) == JARVIS_TOKEN_NULL) {
|
||||||
LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
||||||
params.sparams.ignore_eos = false;
|
params.sparams.ignore_eos = false;
|
||||||
}
|
}
|
||||||
|
@@ -927,35 +927,35 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||||
if (params.warmup) {
|
if (params.warmup) {
|
||||||
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
||||||
|
|
||||||
std::vector<llama_token> tmp;
|
std::vector<jarvis_token> tmp;
|
||||||
llama_token bos = llama_token_bos(model);
|
jarvis_token bos = jarvis_token_bos(model);
|
||||||
llama_token eos = llama_token_eos(model);
|
jarvis_token eos = jarvis_token_eos(model);
|
||||||
// some models (e.g. T5) don't have a BOS token
|
// some models (e.g. T5) don't have a BOS token
|
||||||
if (bos != LLAMA_TOKEN_NULL) {
|
if (bos != JARVIS_TOKEN_NULL) {
|
||||||
tmp.push_back(bos);
|
tmp.push_back(bos);
|
||||||
}
|
}
|
||||||
if (eos != LLAMA_TOKEN_NULL) {
|
if (eos != JARVIS_TOKEN_NULL) {
|
||||||
tmp.push_back(eos);
|
tmp.push_back(eos);
|
||||||
}
|
}
|
||||||
if (tmp.empty()) {
|
if (tmp.empty()) {
|
||||||
tmp.push_back(0);
|
tmp.push_back(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (llama_model_has_encoder(model)) {
|
if (jarvis_model_has_encoder(model)) {
|
||||||
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
|
jarvis_encode(lctx, jarvis_batch_get_one(tmp.data(), tmp.size()));
|
||||||
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
jarvis_token decoder_start_token_id = jarvis_model_decoder_start_token(model);
|
||||||
if (decoder_start_token_id == -1) {
|
if (decoder_start_token_id == -1) {
|
||||||
decoder_start_token_id = bos;
|
decoder_start_token_id = bos;
|
||||||
}
|
}
|
||||||
tmp.clear();
|
tmp.clear();
|
||||||
tmp.push_back(decoder_start_token_id);
|
tmp.push_back(decoder_start_token_id);
|
||||||
}
|
}
|
||||||
if (llama_model_has_decoder(model)) {
|
if (jarvis_model_has_decoder(model)) {
|
||||||
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
|
jarvis_decode(lctx, jarvis_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
|
||||||
}
|
}
|
||||||
llama_kv_cache_clear(lctx);
|
jarvis_kv_cache_clear(lctx);
|
||||||
llama_synchronize(lctx);
|
jarvis_synchronize(lctx);
|
||||||
llama_perf_context_reset(lctx);
|
jarvis_perf_context_reset(lctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
iparams.model = model;
|
iparams.model = model;
|
||||||
|
@ -964,17 +964,17 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||||
return iparams;
|
return iparams;
|
||||||
}
|
}
|
||||||
|
|
||||||
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
|
void common_lora_adapters_apply(struct jarvis_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
|
||||||
llama_lora_adapter_clear(ctx);
|
jarvis_lora_adapter_clear(ctx);
|
||||||
for (auto & la : lora_adapters) {
|
for (auto & la : lora_adapters) {
|
||||||
if (la.scale != 0.0f) {
|
if (la.scale != 0.0f) {
|
||||||
llama_lora_adapter_set(ctx, la.adapter, la.scale);
|
jarvis_lora_adapter_set(ctx, la.adapter, la.scale);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct llama_model_params common_model_params_to_llama(const common_params & params) {
|
struct jarvis_model_params common_model_params_to_jarvis(const common_params & params) {
|
||||||
auto mparams = llama_model_default_params();
|
auto mparams = jarvis_model_default_params();
|
||||||
|
|
||||||
if (params.n_gpu_layers != -1) {
|
if (params.n_gpu_layers != -1) {
|
||||||
mparams.n_gpu_layers = params.n_gpu_layers;
|
mparams.n_gpu_layers = params.n_gpu_layers;
|
||||||
|
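// Illustration only (not part of this commit): a minimal sketch of how the renamed LoRA helpers
// above fit together, assuming the signatures shown in this diff (and an #include of "common.h").
// The model, context and adapter path are placeholders.
static bool example_apply_single_lora(jarvis_model * model, jarvis_context * lctx, const char * lora_path) {
    common_lora_adapter_container la;
    la.path    = lora_path;
    la.scale   = 1.0f;
    la.adapter = jarvis_lora_adapter_init(model, lora_path);
    if (la.adapter == nullptr) {
        return false; // init failed, nothing to apply
    }
    std::vector<common_lora_adapter_container> adapters = { la };
    common_lora_adapters_apply(lctx, adapters); // clears existing adapters, then applies `la` at its scale
    return true;
}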
@@ -1025,8 +1025,8 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
throw std::runtime_error("Unsupported cache type: " + s);
}

struct llama_context_params common_context_params_to_llama(const common_params & params) {
struct jarvis_context_params common_context_params_to_jarvis(const common_params & params) {
auto cparams = llama_context_default_params();
auto cparams = jarvis_context_default_params();

cparams.n_ctx = params.n_ctx;
cparams.n_seq_max = params.n_parallel;
@@ -1056,7 +1056,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &

if (params.reranking) {
cparams.embeddings = true;
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
cparams.pooling_type = JARVIS_POOLING_TYPE_RANK;
}

cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
@@ -1081,7 +1081,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
return tpp;
}

#ifdef LLAMA_USE_CURL
#ifdef JARVIS_USE_CURL

#define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2
@@ -1279,7 +1279,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);

// helper function to hide password in URL
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
auto jarvis_download_hide_password_in_url = [](const std::string & url) -> std::string {
std::size_t protocol_pos = url.find("://");
if (protocol_pos == std::string::npos) {
return url; // Malformed URL
@@ -1295,7 +1295,7 @@ static bool common_download_file(const std::string & url, const std::string & pa

// start the download
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
jarvis_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (!was_perform_successful) {
return false;
@@ -1329,11 +1329,11 @@ static bool common_download_file(const std::string & url, const std::string & pa
return true;
}

struct llama_model * common_load_model_from_url(
struct jarvis_model * common_load_model_from_url(
const char * model_url,
const char * path_model,
const char * hf_token,
const struct llama_model_params & params) {
const struct jarvis_model_params & params) {
// Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) {
LOG_ERR("%s: invalid model_url\n", __func__);
@@ -1367,17 +1367,17 @@ struct llama_model * common_load_model_from_url(

if (n_split > 1) {
char split_prefix[PATH_MAX] = {0};
char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
char split_url_prefix[JARVIS_CURL_MAX_URL_LENGTH] = {0};

// Verify the first split file format
// and extract split URL and PATH prefixes
{
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
if (!jarvis_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
return NULL;
}

if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
if (!jarvis_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
return NULL;
}
@@ -1388,10 +1388,10 @@ struct llama_model * common_load_model_from_url(
for (int idx = 1; idx < n_split; idx++) {
futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
jarvis_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);

char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
char split_url[JARVIS_CURL_MAX_URL_LENGTH] = {0};
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
jarvis_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);

return common_download_file(split_url, split_path, hf_token);
}, idx));
@@ -1405,19 +1405,19 @@ struct llama_model * common_load_model_from_url(
}
}

return llama_load_model_from_file(path_model, params);
return jarvis_load_model_from_file(path_model, params);
}

struct llama_model * common_load_model_from_hf(
struct jarvis_model * common_load_model_from_hf(
const char * repo,
const char * model,
const char * path_model,
const char * hf_token,
const struct llama_model_params & params) {
const struct jarvis_model_params & params) {
// construct hugging face model url:
//
// --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
// --repo ggml-org/models --file tinyjarvis-1.1b/ggml-model-f16.gguf
// https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
// https://huggingface.co/ggml-org/models/resolve/main/tinyjarvis-1.1b/ggml-model-f16.gguf
//
// --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
// https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
@@ -1433,42 +1433,42 @@ struct llama_model * common_load_model_from_hf(

#else

struct llama_model * common_load_model_from_url(
struct jarvis_model * common_load_model_from_url(
const char * /*model_url*/,
const char * /*path_model*/,
const char * /*hf_token*/,
const struct llama_model_params & /*params*/) {
const struct jarvis_model_params & /*params*/) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
LOG_WRN("%s: jarvis.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr;
}

struct llama_model * common_load_model_from_hf(
struct jarvis_model * common_load_model_from_hf(
const char * /*repo*/,
const char * /*model*/,
const char * /*path_model*/,
const char * /*hf_token*/,
const struct llama_model_params & /*params*/) {
const struct jarvis_model_params & /*params*/) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
LOG_WRN("%s: jarvis.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return nullptr;
}

#endif // LLAMA_USE_CURL
#endif // JARVIS_USE_CURL

//
// Batch utils
//

void common_batch_clear(struct llama_batch & batch) {
void common_batch_clear(struct jarvis_batch & batch) {
batch.n_tokens = 0;
}

void common_batch_add(
struct llama_batch & batch,
struct jarvis_batch & batch,
llama_token id,
jarvis_token id,
llama_pos pos,
jarvis_pos pos,
const std::vector<llama_seq_id> & seq_ids,
const std::vector<jarvis_seq_id> & seq_ids,
bool logits) {
GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
GGML_ASSERT(batch.seq_id[batch.n_tokens] && "jarvis_batch size exceeded");

batch.token [batch.n_tokens] = id;
batch.pos [batch.n_tokens] = pos;
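// Illustration only (not part of this commit): a sketch of loading a model through the renamed
// download helpers above, assuming the signatures shown in this diff. The repo, file and local
// path are hypothetical placeholders, and the download path requires a build with JARVIS_USE_CURL.
static jarvis_model * example_load_from_hf(const common_params & params) {
    struct jarvis_model_params mparams = common_model_params_to_jarvis(params);
    return common_load_model_from_hf(
        "example-org/example-models",   // hypothetical Hugging Face repo
        "example-7b.Q4_K_M.gguf",       // hypothetical file inside that repo
        "/tmp/example-7b.Q4_K_M.gguf",  // local path where the download is stored
        /*hf_token=*/nullptr,
        mparams);                       // returns nullptr on failure or when built without libcurl
}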
@@ -1485,26 +1485,26 @@ void common_batch_add(
// Vocab utils
//

std::vector<llama_token> common_tokenize(
std::vector<jarvis_token> common_tokenize(
const struct llama_context * ctx,
const struct jarvis_context * ctx,
const std::string & text,
bool add_special,
bool parse_special) {
return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
return common_tokenize(jarvis_get_model(ctx), text, add_special, parse_special);
}

std::vector<llama_token> common_tokenize(
std::vector<jarvis_token> common_tokenize(
const struct llama_model * model,
const struct jarvis_model * model,
const std::string & text,
bool add_special,
bool parse_special) {
// upper limit for the number of tokens
int n_tokens = text.length() + 2 * add_special;
std::vector<llama_token> result(n_tokens);
std::vector<jarvis_token> result(n_tokens);
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
n_tokens = jarvis_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
int check = jarvis_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
@@ -1512,13 +1512,13 @@ std::vector<llama_token> common_tokenize(
return result;
}

std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
std::string common_token_to_piece(const struct jarvis_context * ctx, jarvis_token token, bool special) {
std::string piece;
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
const int n_chars = jarvis_token_to_piece(jarvis_get_model(ctx), token, &piece[0], piece.size(), 0, special);
if (n_chars < 0) {
piece.resize(-n_chars);
int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
int check = jarvis_token_to_piece(jarvis_get_model(ctx), token, &piece[0], piece.size(), 0, special);
GGML_ASSERT(check == -n_chars);
}
else {
@@ -1528,13 +1528,13 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
return piece;
}

std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
std::string common_detokenize(jarvis_context * ctx, const std::vector<jarvis_token> & tokens, bool special) {
std::string text;
text.resize(std::max(text.capacity(), tokens.size()));
int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
int32_t n_chars = jarvis_detokenize(jarvis_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
if (n_chars < 0) {
text.resize(-n_chars);
n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
n_chars = jarvis_detokenize(jarvis_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
}

@@ -1549,18 +1549,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
//

bool common_chat_verify_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
jarvis_chat_message chat[] = {{"user", "test"}};
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
int res = jarvis_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
return res >= 0;
}

std::string common_chat_apply_template(const struct llama_model * model,
std::string common_chat_apply_template(const struct jarvis_model * model,
const std::string & tmpl,
const std::vector<common_chat_msg> & msgs,
bool add_ass) {
int alloc_size = 0;
bool fallback = false; // indicate if we must fallback to default chatml
std::vector<llama_chat_message> chat;
std::vector<jarvis_chat_message> chat;
for (auto & msg : msgs) {
chat.push_back({msg.role.c_str(), msg.content.c_str()});
alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
@@ -1570,17 +1570,17 @@ std::string common_chat_apply_template(const struct llama_model * model,
std::vector<char> buf(alloc_size);

// run the first time to get the total output length
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
int32_t res = jarvis_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());

// error: chat template is not supported
if (res < 0) {
if (ptr_tmpl != nullptr) {
// if the custom "tmpl" is not supported, we throw an error
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
// this is a bit redundant (for good), since we're not sure if user validated the custom template with jarvis_chat_verify_template()
throw std::runtime_error("this custom template is not supported");
} else {
// If the built-in template is not supported, we default to chatml
res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
res = jarvis_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
fallback = true;
}
}
@@ -1588,7 +1588,7 @@ std::string common_chat_apply_template(const struct llama_model * model,
// if it turns out that our buffer is too small, we resize it
if ((size_t) res > buf.size()) {
buf.resize(res);
res = llama_chat_apply_template(
res = jarvis_chat_apply_template(
fallback ? nullptr : model,
fallback ? "chatml" : ptr_tmpl,
chat.data(), chat.size(), add_ass, buf.data(), buf.size());
@@ -1598,7 +1598,7 @@ std::string common_chat_apply_template(const struct llama_model * model,
return formatted_chat;
}

std::string common_chat_format_single(const struct llama_model * model,
std::string common_chat_format_single(const struct jarvis_model * model,
const std::string & tmpl,
const std::vector<common_chat_msg> & past_msg,
const common_chat_msg & new_msg,
@@ -1618,7 +1618,7 @@ std::string common_chat_format_single(const struct llama_model * model,
return ss.str();
}

std::string common_chat_format_example(const struct llama_model * model,
std::string common_chat_format_example(const struct jarvis_model * model,
const std::string & tmpl) {
std::vector<common_chat_msg> msgs = {
{"system", "You are a helpful assistant"},
@@ -1633,14 +1633,14 @@ std::string common_chat_format_example(const struct llama_model * model,
// KV cache utils
//

void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
void common_kv_cache_dump_view(const jarvis_kv_cache_view & view, int row_size) {
static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";

printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);

llama_kv_cache_view_cell * c_curr = view.cells;
jarvis_kv_cache_view_cell * c_curr = view.cells;
llama_seq_id * cs_curr = view.cells_sequences;
jarvis_seq_id * cs_curr = view.cells_sequences;

for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
if (i % row_size == 0) {
@@ -1656,15 +1656,15 @@ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
printf("\n=== Done dumping\n");
}

void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
void common_kv_cache_dump_view_seqs(const jarvis_kv_cache_view & view, int row_size) {
static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);

std::unordered_map<llama_seq_id, size_t> seqs;
std::unordered_map<jarvis_seq_id, size_t> seqs;
llama_kv_cache_view_cell * c_curr = view.cells;
jarvis_kv_cache_view_cell * c_curr = view.cells;
llama_seq_id * cs_curr = view.cells_sequences;
jarvis_seq_id * cs_curr = view.cells_sequences;

for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
for (int j = 0; j < view.n_seq_max; j++) {
@@ -1949,12 +1949,12 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
}
}

void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
void yaml_dump_non_result_info(FILE * stream, const common_params & params, const jarvis_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
const auto & sparams = params.sparams;

fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
fprintf(stream, "build_commit: %s\n", JARVIS_COMMIT);
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
fprintf(stream, "build_number: %d\n", JARVIS_BUILD_NUMBER);
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
@@ -1985,7 +1985,7 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
#endif // NDEBUG

fprintf(stream, "model_desc: %s\n", model_desc);
fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", jarvis_n_vocab(jarvis_get_model(lctx)));

#ifdef __OPTIMIZE__
fprintf(stream, "optimize: true\n");
@@ -2087,7 +2087,7 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);

const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + jarvis_max_devices());
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);

fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
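// Illustration only (not part of this commit): a round-trip sketch for the renamed vocab helpers
// above, assuming the signatures shown in this diff; `ctx` is a placeholder jarvis_context.
static std::string example_roundtrip(jarvis_context * ctx, const std::string & text) {
    // encode, letting the helper add BOS/EOS as the model requires, without parsing special tokens in `text`
    std::vector<jarvis_token> tokens = common_tokenize(ctx, text, /*add_special=*/true, /*parse_special=*/false);
    for (jarvis_token t : tokens) {
        printf("%d -> '%s'\n", t, common_token_to_piece(ctx, t, /*special=*/true).c_str());
    }
    // decode back to a string, rendering special/control tokens
    return common_detokenize(ctx, tokens, /*special=*/true);
}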
138  common/common.h
@@ -2,7 +2,7 @@

#pragma once

#include "llama.h"
#include "jarvis.h"

#include <string>
#include <vector>
@@ -18,8 +18,8 @@
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

#define print_build_info() do { \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, JARVIS_BUILD_NUMBER, JARVIS_COMMIT); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, JARVIS_COMPILER, JARVIS_BUILD_TARGET); \
} while(0)

#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
@@ -30,14 +30,14 @@ struct common_lora_adapter_info {
};

struct common_lora_adapter_container : common_lora_adapter_info {
struct llama_lora_adapter * adapter;
struct jarvis_lora_adapter * adapter;
};

// build info
extern int LLAMA_BUILD_NUMBER;
extern int JARVIS_BUILD_NUMBER;
extern char const * LLAMA_COMMIT;
extern char const * JARVIS_COMMIT;
extern char const * LLAMA_COMPILER;
extern char const * JARVIS_COMPILER;
extern char const * LLAMA_BUILD_TARGET;
extern char const * JARVIS_BUILD_TARGET;

struct common_control_vector_load_info;

@@ -61,25 +61,25 @@ int32_t cpu_get_num_math();
// Common params
//

enum llama_example {
enum jarvis_example {
LLAMA_EXAMPLE_COMMON,
JARVIS_EXAMPLE_COMMON,
LLAMA_EXAMPLE_SPECULATIVE,
JARVIS_EXAMPLE_SPECULATIVE,
LLAMA_EXAMPLE_MAIN,
JARVIS_EXAMPLE_MAIN,
LLAMA_EXAMPLE_INFILL,
JARVIS_EXAMPLE_INFILL,
LLAMA_EXAMPLE_EMBEDDING,
JARVIS_EXAMPLE_EMBEDDING,
LLAMA_EXAMPLE_PERPLEXITY,
JARVIS_EXAMPLE_PERPLEXITY,
LLAMA_EXAMPLE_RETRIEVAL,
JARVIS_EXAMPLE_RETRIEVAL,
LLAMA_EXAMPLE_PASSKEY,
JARVIS_EXAMPLE_PASSKEY,
LLAMA_EXAMPLE_IMATRIX,
JARVIS_EXAMPLE_IMATRIX,
LLAMA_EXAMPLE_BENCH,
JARVIS_EXAMPLE_BENCH,
LLAMA_EXAMPLE_SERVER,
JARVIS_EXAMPLE_SERVER,
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
JARVIS_EXAMPLE_CVECTOR_GENERATOR,
LLAMA_EXAMPLE_EXPORT_LORA,
JARVIS_EXAMPLE_EXPORT_LORA,
LLAMA_EXAMPLE_LLAVA,
JARVIS_EXAMPLE_LLAVA,
LLAMA_EXAMPLE_LOOKUP,
JARVIS_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
JARVIS_EXAMPLE_PARALLEL,

LLAMA_EXAMPLE_COUNT,
JARVIS_EXAMPLE_COUNT,
};

enum common_sampler_type {
@@ -103,7 +103,7 @@ enum dimre_method {

// sampler parameters
struct common_sampler_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
uint32_t seed = JARVIS_DEFAULT_SEED; // the seed used to initialize jarvis_sampler

int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
@@ -149,7 +149,7 @@ struct common_sampler_params {

std::string grammar; // optional BNF-like grammar to constrain sampling

std::vector<llama_logit_bias> logit_bias; // logit biases to apply
std::vector<jarvis_logit_bias> logit_bias; // logit biases to apply

// print the parameters into a string
std::string print() const;
@@ -192,10 +192,10 @@ struct common_params {

ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
enum jarvis_split_mode split_mode = JARVIS_SPLIT_MODE_LAYER; // how to split the model across GPUs
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum jarvis_rope_scaling_type rope_scaling_type = JARVIS_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum jarvis_pooling_type pooling_type = JARVIS_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
enum jarvis_attention_type attention_type = JARVIS_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

struct common_sampler_params sparams;

@@ -219,9 +219,9 @@ struct common_params {

std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
std::vector<jarvis_model_kv_override> kv_overrides;

bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using jarvis_lora_adapter_apply)
std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale

std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
@@ -377,15 +377,15 @@ bool set_process_priority(enum ggml_sched_priority prio);

#ifdef __GNUC__
#ifdef __MINGW32__
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#define JARVIS_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#define JARVIS_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#else
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
#define JARVIS_COMMON_ATTRIBUTE_FORMAT(...)
#endif

LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
JARVIS_COMMON_ATTRIBUTE_FORMAT(1, 2)
std::string string_format(const char * fmt, ...);

std::string string_strip(const std::string & str);
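// Illustration only (not part of this commit): a sketch of filling the renamed common_params
// struct declared above, assuming the members and constants shown in this diff; the concrete
// values are arbitrary examples.
static common_params example_params() {
    common_params params;
    params.n_gpu_layers = 32;                               // example value; -1 keeps the library default
    params.split_mode   = JARVIS_SPLIT_MODE_LAYER;          // same default the header declares
    params.pooling_type = JARVIS_POOLING_TYPE_UNSPECIFIED;
    params.sparams.seed = JARVIS_DEFAULT_SEED;
    return params;
}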
@@ -424,13 +424,13 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
return parts;
}

bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
bool string_parse_kv_override(const char * data, std::vector<jarvis_model_kv_override> & overrides);
void string_process_escapes(std::string & input);

std::string string_from(bool value);
std::string string_from(const std::vector<int> & values);
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
std::string string_from(const struct jarvis_context * ctx, const std::vector<jarvis_token> & tokens);
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
std::string string_from(const struct jarvis_context * ctx, const struct jarvis_batch & batch);

//
// Filesystem utils
@@ -447,32 +447,32 @@ std::string fs_get_cache_file(const std::string & filename);
//

struct common_init_result {
struct llama_model * model = nullptr;
struct jarvis_model * model = nullptr;
struct llama_context * context = nullptr;
struct jarvis_context * context = nullptr;
std::vector<common_lora_adapter_container> lora_adapters;
};

struct common_init_result common_init_from_params(common_params & params);

struct llama_model_params common_model_params_to_llama (const common_params & params);
struct jarvis_model_params common_model_params_to_jarvis (const common_params & params);
struct llama_context_params common_context_params_to_llama(const common_params & params);
struct jarvis_context_params common_context_params_to_jarvis(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct jarvis_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct jarvis_model_params & params);
struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct jarvis_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct jarvis_model_params & params);

// clear LoRA adapters from context, then apply new list of adapters
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
void common_lora_adapters_apply(struct jarvis_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);

// Batch utils

void common_batch_clear(struct llama_batch & batch);
void common_batch_clear(struct jarvis_batch & batch);

void common_batch_add(
struct llama_batch & batch,
struct jarvis_batch & batch,
llama_token id,
jarvis_token id,
llama_pos pos,
jarvis_pos pos,
const std::vector<llama_seq_id> & seq_ids,
const std::vector<jarvis_seq_id> & seq_ids,
bool logits);

//
@@ -481,14 +481,14 @@ void common_batch_add(

// tokenizes a string into a vector of tokens
// should work similar to Python's `tokenizer.encode`
std::vector<llama_token> common_tokenize(
std::vector<jarvis_token> common_tokenize(
const struct llama_context * ctx,
const struct jarvis_context * ctx,
const std::string & text,
bool add_special,
bool parse_special = false);

std::vector<llama_token> common_tokenize(
std::vector<jarvis_token> common_tokenize(
const struct llama_model * model,
const struct jarvis_model * model,
const std::string & text,
bool add_special,
bool parse_special = false);
@@ -496,23 +496,23 @@ std::vector<llama_token> common_tokenize(
// tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
std::string common_token_to_piece(
const struct llama_context * ctx,
const struct jarvis_context * ctx,
llama_token token,
jarvis_token token,
bool special = true);

// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
// optionally renders special/control tokens
std::string common_detokenize(
llama_context * ctx,
jarvis_context * ctx,
const std::vector<llama_token> & tokens,
const std::vector<jarvis_token> & tokens,
bool special = true);

//
// Chat template utils
//

// same with llama_chat_message, but uses std::string
// same with jarvis_chat_message, but uses std::string
struct common_chat_msg {
std::string role;
std::string content;
@@ -521,23 +521,23 @@ struct common_chat_msg {
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool common_chat_verify_template(const std::string & tmpl);

// CPP wrapper for llama_chat_apply_template
// CPP wrapper for jarvis_chat_apply_template
// If the built-in template is not supported, we default to chatml
// If the custom "tmpl" is not supported, we throw an error
std::string common_chat_apply_template(const struct llama_model * model,
std::string common_chat_apply_template(const struct jarvis_model * model,
const std::string & tmpl,
const std::vector<common_chat_msg> & chat,
bool add_ass);

// Format single message, while taking into account the position of that message in chat history
std::string common_chat_format_single(const struct llama_model * model,
std::string common_chat_format_single(const struct jarvis_model * model,
const std::string & tmpl,
const std::vector<common_chat_msg> & past_msg,
const common_chat_msg & new_msg,
bool add_ass);

// Returns an example of formatted chat
std::string common_chat_format_example(const struct llama_model * model,
std::string common_chat_format_example(const struct jarvis_model * model,
const std::string & tmpl);

//
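// Illustration only (not part of this commit): a sketch of the chat-template wrapper declared
// above, assuming the signatures shown in this diff; `model` is a placeholder, and passing an
// empty template string is assumed to select the model's built-in template (with chatml fallback).
static std::string example_chat_prompt(const struct jarvis_model * model) {
    std::vector<common_chat_msg> msgs = {
        {"system", "You are a helpful assistant"},
        {"user",   "Hello"},
    };
    return common_chat_apply_template(model, /*tmpl=*/"", msgs, /*add_ass=*/true);
}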
@@ -545,10 +545,10 @@ std::string common_chat_format_example(const struct llama_model * model,
//

// Dump the KV cache view with the number of sequences per cell.
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
void common_kv_cache_dump_view(const jarvis_kv_cache_view & view, int row_size = 80);

// Dump the KV cache view showing individual sequences in each cell (long output).
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
void common_kv_cache_dump_view_seqs(const jarvis_kv_cache_view & view, int row_size = 40);

//
// Embedding utils
@@ -596,5 +596,5 @@ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);

void yaml_dump_non_result_info(
FILE * stream, const common_params & params, const llama_context * lctx,
FILE * stream, const common_params & params, const jarvis_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
@@ -435,7 +435,7 @@ namespace console {
fputc('\n', out);
has_more = !has_more;
} else {
// llama will just eat the single space, it won't act as a space
// jarvis will just eat the single space, it won't act as a space
if (line.length() == 1 && line.back() == ' ') {
line.clear();
pop_cursor();
@@ -5336,7 +5336,7 @@ template<typename IteratorType> class iteration_proxy
};

// Structured Bindings Support
// For further reference see https://blog.tartanllama.xyz/structured-bindings/
// For further reference see https://blog.tartanjarvis.xyz/structured-bindings/
// And see https://github.com/nlohmann/json/pull/1391
template<std::size_t N, typename IteratorType, enable_if_t<N == 0, int> = 0>
auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decltype(i.key())
@@ -5344,7 +5344,7 @@ auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decl
return i.key();
}
// Structured Bindings Support
// For further reference see https://blog.tartanllama.xyz/structured-bindings/
// For further reference see https://blog.tartanjarvis.xyz/structured-bindings/
// And see https://github.com/nlohmann/json/pull/1391
template<std::size_t N, typename IteratorType, enable_if_t<N == 1, int> = 0>
auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decltype(i.value())
@@ -5357,7 +5357,7 @@ NLOHMANN_JSON_NAMESPACE_END

// The Addition to the STD Namespace is required to add
// Structured Bindings Support to the iteration_proxy_value class
// For further reference see https://blog.tartanllama.xyz/structured-bindings/
// For further reference see https://blog.tartanjarvis.xyz/structured-bindings/
// And see https://github.com/nlohmann/json/pull/1391
namespace std
{
@@ -8,7 +8,7 @@
#include <thread>
#include <vector>

int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
int common_log_verbosity_thold = LOG_DEFAULT_JARVIS;

void common_log_set_verbosity_thold(int verbosity) {
common_log_verbosity_thold = verbosity;
@@ -11,7 +11,7 @@
#endif

#define LOG_DEFAULT_DEBUG 1
#define LOG_DEFAULT_LLAMA 0
#define LOG_DEFAULT_JARVIS 0

// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
// set via common_log_set_verbosity()
@@ -9,7 +9,7 @@
#include <thread>

void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
std::vector<llama_token> & inp, int nnew, bool print_progress) {
std::vector<jarvis_token> & inp, int nnew, bool print_progress) {
const int64_t t_start_ms = ggml_time_ms();
const int64_t inp_size = inp.size();

@@ -21,7 +21,7 @@ void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min,
for (int64_t i = i_start; i < inp_size; ++i) {
const int64_t ngram_start = i - ngram_size;
common_ngram ngram(&inp[ngram_start], ngram_size);
const llama_token token = inp[i];
const jarvis_token token = inp[i];

common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
if (part_it == ngram_cache.end()) {
@@ -51,18 +51,18 @@ void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min,
}

// Helper function to get a token from the combined, speculative sequence of inp and draft.
static llama_token get_token(const std::vector<llama_token> & inp, const std::vector<llama_token> & draft, const size_t i) {
static jarvis_token get_token(const std::vector<jarvis_token> & inp, const std::vector<jarvis_token> & draft, const size_t i) {
return i < inp.size() ? inp[i] : draft[1 + i - inp.size()];
}

// If sample size or percentage are below these thresholds the draft is aborted early:
constexpr int draft_min_sample_size_lax[LLAMA_NGRAM_MAX] = { 2, 2, 1, 1};
constexpr int draft_min_sample_size_lax[JARVIS_NGRAM_MAX] = { 2, 2, 1, 1};
constexpr int draft_min_percent_lax[LLAMA_NGRAM_MAX] = {66, 50, 50, 50};
constexpr int draft_min_percent_lax[JARVIS_NGRAM_MAX] = {66, 50, 50, 50};
constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2};
constexpr int draft_min_sample_size_strict[JARVIS_NGRAM_MAX] = { 4, 3, 2, 2};
constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
constexpr int draft_min_percent_strict[JARVIS_NGRAM_MAX] = {75, 66, 66, 66};

// Helper function that tries to draft a token from only the static ngram cache:
static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
static jarvis_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
if (part_static_it == nc_static.end()) {
return -1;
@@ -71,10 +71,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram

int max_count_static = 0;
int sum_count_static = 0;
llama_token max_token = -1;
jarvis_token max_token = -1;

for (std::pair<llama_token, int> token_count_static : part_static) {
for (std::pair<jarvis_token, int> token_count_static : part_static) {
const llama_token token = token_count_static.first;
const jarvis_token token = token_count_static.first;
const int32_t count_static = token_count_static.second;

if (count_static > max_count_static) {
@@ -84,21 +84,21 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
sum_count_static += count_static;
}

if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
if (sum_count_static < draft_min_sample_size_lax[JARVIS_NGRAM_STATIC-1]) {
return -1;
}
if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
if (100*max_count_static < draft_min_percent_lax[JARVIS_NGRAM_STATIC-1]*sum_count_static) {
return -1;
}
return max_token;
}

// Try to draft a token from primary cache (context/dynamic), validate with static cache:
static llama_token try_draft(
static jarvis_token try_draft(
common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
const int * min_sample_size, const int * min_percent) {

llama_token drafted_token = -1;
jarvis_token drafted_token = -1;

for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
const common_ngram ngram_primary = ngrams_primary[i];
@ -112,10 +112,10 @@ static llama_token try_draft(
|
||||||
int max_count_primary = 0;
|
int max_count_primary = 0;
|
||||||
int max_count_static = 0;
|
int max_count_static = 0;
|
||||||
int sum_count_primary = 0;
|
int sum_count_primary = 0;
|
||||||
llama_token max_token = -1;
|
jarvis_token max_token = -1;
|
||||||
|
|
||||||
for (std::pair<llama_token, int> token_count_primary : part_primary) {
|
for (std::pair<jarvis_token, int> token_count_primary : part_primary) {
|
||||||
const llama_token token = token_count_primary.first;
|
const jarvis_token token = token_count_primary.first;
|
||||||
|
|
||||||
common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
|
common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
|
||||||
|
|
||||||
|
@ -143,22 +143,22 @@ static llama_token try_draft(
|
||||||
}
|
}
|
||||||
|
|
||||||
void common_ngram_cache_draft(
|
void common_ngram_cache_draft(
|
||||||
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
|
std::vector<jarvis_token> & inp, std::vector<jarvis_token> & draft, int n_draft, int ngram_min, int ngram_max,
|
||||||
common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
|
common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
|
||||||
) {
|
) {
|
||||||
GGML_ASSERT(draft.size() == 1);
|
GGML_ASSERT(draft.size() == 1);
|
||||||
const int inp_size = inp.size();
|
const int inp_size = inp.size();
|
||||||
|
|
||||||
if (inp_size < LLAMA_NGRAM_STATIC) {
|
if (inp_size < JARVIS_NGRAM_STATIC) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
while ((int) draft.size()-1 < n_draft) {
|
while ((int) draft.size()-1 < n_draft) {
|
||||||
llama_token drafted_token = -1;
|
jarvis_token drafted_token = -1;
|
||||||
|
|
||||||
const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
|
const int ngram_start_static = inp_size-JARVIS_NGRAM_STATIC + draft.size()-1;
|
||||||
common_ngram ngram_static;
|
common_ngram ngram_static;
|
||||||
for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
|
for (int j = ngram_start_static; j < ngram_start_static + JARVIS_NGRAM_STATIC; ++j) {
|
||||||
ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
|
ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
|
||||||
}
|
}
|
||||||
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
|
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
|
||||||
|
@ -207,12 +207,12 @@ void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & fil
|
||||||
|
|
||||||
file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(common_ngram));
|
file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(common_ngram));
|
||||||
file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
|
file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
|
||||||
for (std::pair<llama_token, int32_t> item2 : token_counts) {
|
for (std::pair<jarvis_token, int32_t> item2 : token_counts) {
|
||||||
const llama_token token = item2.first;
|
const jarvis_token token = item2.first;
|
||||||
const int32_t count = item2.second;
|
const int32_t count = item2.second;
|
||||||
GGML_ASSERT(count > 0);
|
GGML_ASSERT(count > 0);
|
||||||
|
|
||||||
file_out.write(reinterpret_cast<const char *>(&token), sizeof(llama_token));
|
file_out.write(reinterpret_cast<const char *>(&token), sizeof(jarvis_token));
|
||||||
file_out.write(reinterpret_cast<const char *>(&count), sizeof(int32_t));
|
file_out.write(reinterpret_cast<const char *>(&count), sizeof(int32_t));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -228,7 +228,7 @@ common_ngram_cache common_ngram_cache_load(std::string & filename) {
|
||||||
|
|
||||||
common_ngram ngram;
|
common_ngram ngram;
|
||||||
int32_t ntokens;
|
int32_t ntokens;
|
||||||
llama_token token;
|
jarvis_token token;
|
||||||
int32_t count;
|
int32_t count;
|
||||||
|
|
||||||
char * ngramc = reinterpret_cast<char*>(&ngram);
|
char * ngramc = reinterpret_cast<char*>(&ngram);
|
||||||
|
@ -243,7 +243,7 @@ common_ngram_cache common_ngram_cache_load(std::string & filename) {
|
||||||
|
|
||||||
for (int i = 0; i < ntokens; ++i) {
|
for (int i = 0; i < ntokens; ++i) {
|
||||||
GGML_ASSERT(!hashmap_file.eof());
|
GGML_ASSERT(!hashmap_file.eof());
|
||||||
GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token)));
|
GGML_ASSERT(hashmap_file.read(tokenc, sizeof(jarvis_token)));
|
||||||
GGML_ASSERT(!hashmap_file.eof());
|
GGML_ASSERT(!hashmap_file.eof());
|
||||||
GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t)));
|
GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t)));
|
||||||
GGML_ASSERT(count > 0);
|
GGML_ASSERT(count > 0);
|
||||||
|
@ -268,8 +268,8 @@ void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ng
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (std::pair<llama_token, int32_t> token_count : part) {
|
for (std::pair<jarvis_token, int32_t> token_count : part) {
|
||||||
const llama_token token = token_count.first;
|
const jarvis_token token = token_count.first;
|
||||||
const int32_t count = token_count.second;
|
const int32_t count = token_count.second;
|
||||||
GGML_ASSERT(count > 0);
|
GGML_ASSERT(count > 0);
|
||||||
|
|
||||||
|
|
|
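Note on the abort thresholds above: drafting from the static cache only proceeds when the 2-gram has been seen often enough and its most frequent continuation holds a large enough share of the counts. A minimal worked sketch (hypothetical counts, not taken from the diff), using the lax static thresholds draft_min_sample_size_lax[JARVIS_NGRAM_STATIC-1] == 2 and draft_min_percent_lax[JARVIS_NGRAM_STATIC-1] == 50:

#include <cstdio>

// Hypothetical counts for one static 2-gram entry; illustrative only.
int main() {
    const int min_sample_size = 2;   // draft_min_sample_size_lax[1], since JARVIS_NGRAM_STATIC == 2
    const int min_percent     = 50;  // draft_min_percent_lax[1]

    const int sum_count_static = 3;  // this 2-gram was seen 3 times in the static corpus
    const int max_count_static = 2;  // its most frequent continuation was seen twice

    const bool too_few  = sum_count_static < min_sample_size;                   // 3 < 2   -> false
    const bool too_flat = 100*max_count_static < min_percent*sum_count_static;  // 200 < 150 -> false

    printf("draft aborted: %s\n", (too_few || too_flat) ? "yes" : "no");        // prints "no"
}

With these counts neither check fires, so the most frequent continuation would be drafted; the strict thresholds apply the same test with larger constants when only the context cache agrees.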
@@ -1,34 +1,34 @@
#pragma once

-#include "llama.h"
+#include "jarvis.h"

#include <unordered_map>
#include <string>
#include <vector>

-#define LLAMA_NGRAM_MIN 1
+#define JARVIS_NGRAM_MIN 1
-#define LLAMA_NGRAM_MAX 4
+#define JARVIS_NGRAM_MAX 4
-#define LLAMA_NGRAM_STATIC 2
+#define JARVIS_NGRAM_STATIC 2

// Data structures to map n-grams to empirical token probabilities:

struct common_ngram {
-llama_token tokens[LLAMA_NGRAM_MAX];
+jarvis_token tokens[JARVIS_NGRAM_MAX];

common_ngram() {
-for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
+for (int i = 0; i < JARVIS_NGRAM_MAX; ++i) {
tokens[i] = -1;
}
}

-common_ngram(const llama_token * input, const int ngram_size) {
+common_ngram(const jarvis_token * input, const int ngram_size) {
-for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
+for (int i = 0; i < JARVIS_NGRAM_MAX; ++i) {
tokens[i] = i < ngram_size ? input[i] : -1;
}
}

bool operator==(const common_ngram & other) const {
-for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
+for (int i = 0; i < JARVIS_NGRAM_MAX; ++i) {
if (tokens[i] != other.tokens[i]) {
return false;
}

@@ -38,7 +38,7 @@ struct common_ngram {
};

struct common_token_hash_function {
-size_t operator()(const llama_token token) const {
+size_t operator()(const jarvis_token token) const {
// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
return token * 11400714819323198485llu;
}

@@ -47,7 +47,7 @@ struct common_token_hash_function {
struct common_ngram_hash_function {
size_t operator()(const common_ngram & ngram) const {
size_t hash = common_token_hash_function{}(ngram.tokens[0]);
-for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
+for (int i = 1; i < JARVIS_NGRAM_MAX; ++i) {
hash ^= common_token_hash_function{}(ngram.tokens[i]);
}
return hash;

@@ -55,7 +55,7 @@ struct common_ngram_hash_function {
};

// token -> number of times token has been seen
-typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;
+typedef std::unordered_map<jarvis_token, int32_t> common_ngram_cache_part;

// n-gram -> empirical distribution of following tokens
typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;

@@ -71,7 +71,7 @@ typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_h
// In order to get correct results inp_data can ONLY BE APPENDED TO.
// Changes in the middle need a complete rebuild.
void common_ngram_cache_update(
-common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
+common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<jarvis_token> & inp_data, int nnew, bool print_progress);

// Try to draft tokens from ngram caches.
// inp: the tokens generated so far.

@@ -82,7 +82,7 @@ void common_ngram_cache_update(
// nc_dynamic: ngram cache based on previous user generations.
// nc_static: ngram cache generated from a large text corpus, used for validation.
void common_ngram_cache_draft(
-std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
+std::vector<jarvis_token> & inp, std::vector<jarvis_token> & draft, int n_draft, int ngram_min, int ngram_max,
common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);

// Save an ngram cache to a file.
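The header above wires the two typedefs together: the outer unordered_map is keyed by common_ngram through the Fibonacci-style hash functor, and each entry holds the empirical counts of the tokens that followed that n-gram. A minimal sketch of how the types compose (not from the diff; the include path "ngram-cache.h" and the availability of jarvis_token from jarvis.h are assumptions):

#include "ngram-cache.h"   // assumed path for the header shown above
#include <cstdio>

int main() {
    std::vector<jarvis_token> text = { 10, 20, 30, 20, 30, 40 };

    common_ngram_cache cache;

    // Count which token follows each 2-gram; this is the core of what
    // common_ngram_cache_update does for every size between ngram_min and ngram_max.
    const int ngram_size = 2;
    for (size_t i = ngram_size; i < text.size(); ++i) {
        common_ngram ngram(&text[i - ngram_size], ngram_size);
        cache[ngram][text[i]] += 1;
    }

    // Look up the empirical distribution after the 2-gram {20, 30}.
    jarvis_token key[2] = { 20, 30 };
    common_ngram_cache_part & part = cache[common_ngram(key, 2)];
    for (const auto & kv : part) {
        printf("token %d seen %d times\n", kv.first, kv.second); // tokens 20 and 40, once each
    }
}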
@@ -6,7 +6,7 @@
#include <unordered_map>

// the ring buffer works similarly to std::deque, but with a fixed capacity
-// TODO: deduplicate with llama-impl.h
+// TODO: deduplicate with jarvis-impl.h
template<typename T>
struct ring_buffer {
ring_buffer(size_t cap) : capacity(cap), data(cap) {}

@@ -101,24 +101,24 @@ struct ring_buffer {
struct common_sampler {
common_sampler_params params;

-struct llama_sampler * grmr;
+struct jarvis_sampler * grmr;
-struct llama_sampler * chain;
+struct jarvis_sampler * chain;

-ring_buffer<llama_token> prev;
+ring_buffer<jarvis_token> prev;

-std::vector<llama_token_data> cur;
+std::vector<jarvis_token_data> cur;

-llama_token_data_array cur_p;
+jarvis_token_data_array cur_p;

-void set_logits(struct llama_context * ctx, int idx) {
+void set_logits(struct jarvis_context * ctx, int idx) {
-const auto * logits = llama_get_logits_ith(ctx, idx);
+const auto * logits = jarvis_get_logits_ith(ctx, idx);

-const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+const int n_vocab = jarvis_n_vocab(jarvis_get_model(ctx));

cur.resize(n_vocab);

-for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+for (jarvis_token token_id = 0; token_id < n_vocab; token_id++) {
-cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+cur[token_id] = jarvis_token_data{token_id, logits[token_id], 0.0f};
}

cur_p = { cur.data(), cur.size(), -1, false };

@@ -141,31 +141,31 @@ std::string common_sampler_params::print() const {
return std::string(result);
}

-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
+struct common_sampler * common_sampler_init(const struct jarvis_model * model, const struct common_sampler_params & params) {
-llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
+jarvis_sampler_chain_params lparams = jarvis_sampler_chain_default_params();

lparams.no_perf = params.no_perf;

auto * result = new common_sampler {
/* .params = */ params,
-/* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
+/* .grmr = */ jarvis_sampler_init_grammar(model, params.grammar.c_str(), "root"),
-/* .chain = */ llama_sampler_chain_init(lparams),
+/* .chain = */ jarvis_sampler_chain_init(lparams),
-/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
+/* .prev = */ ring_buffer<jarvis_token>(std::max(32, params.n_prev)),
/* .cur = */ {},
/* .cur_p = */ {},
};

-llama_sampler_chain_add(result->chain,
+jarvis_sampler_chain_add(result->chain,
-llama_sampler_init_logit_bias(
+jarvis_sampler_init_logit_bias(
-llama_n_vocab(model),
+jarvis_n_vocab(model),
params.logit_bias.size(),
params.logit_bias.data()));

-llama_sampler_chain_add(result->chain,
+jarvis_sampler_chain_add(result->chain,
-llama_sampler_init_penalties(
+jarvis_sampler_init_penalties(
-llama_n_vocab (model),
+jarvis_n_vocab (model),
-llama_token_eos(model),
+jarvis_token_eos(model),
-llama_token_nl (model),
+jarvis_token_nl (model),
params.penalty_last_n,
params.penalty_repeat,
params.penalty_freq,

@@ -184,44 +184,44 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
c_breakers.push_back(str.c_str());
}

-llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
}
break;
case COMMON_SAMPLER_TYPE_TOP_K:
-llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
+jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_top_k (params.top_k));
break;
case COMMON_SAMPLER_TYPE_TOP_P:
-llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
+jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_top_p (params.top_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_MIN_P:
-llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
+jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_min_p (params.min_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_XTC:
-llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
break;
case COMMON_SAMPLER_TYPE_TFS_Z:
-llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
+jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_tail_free(params.tfs_z, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_TYPICAL_P:
-llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
+jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_typical (params.typ_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_TEMPERATURE:
-llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break;
case COMMON_SAMPLER_TYPE_INFILL:
-llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
+jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_infill (model));
break;
default:
GGML_ASSERT(false && "unknown sampler type");
}
}
-llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_dist(params.seed));
} else if (params.mirostat == 1) {
-llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_temp(params.temp));
-llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_mirostat(jarvis_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
} else if (params.mirostat == 2) {
-llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_temp(params.temp));
-llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
+jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
} else {
GGML_ASSERT(false && "unknown mirostat version");
}

@@ -231,53 +231,53 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

void common_sampler_free(struct common_sampler * gsmpl) {
if (gsmpl) {
-llama_sampler_free(gsmpl->grmr);
+jarvis_sampler_free(gsmpl->grmr);

-llama_sampler_free(gsmpl->chain);
+jarvis_sampler_free(gsmpl->chain);

delete gsmpl;
}
}

-void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+void common_sampler_accept(struct common_sampler * gsmpl, jarvis_token token, bool accept_grammar) {
if (accept_grammar) {
-llama_sampler_accept(gsmpl->grmr, token);
+jarvis_sampler_accept(gsmpl->grmr, token);
}

-llama_sampler_accept(gsmpl->chain, token);
+jarvis_sampler_accept(gsmpl->chain, token);

gsmpl->prev.push_back(token);
}

void common_sampler_reset(struct common_sampler * gsmpl) {
-llama_sampler_reset(gsmpl->grmr);
+jarvis_sampler_reset(gsmpl->grmr);

-llama_sampler_reset(gsmpl->chain);
+jarvis_sampler_reset(gsmpl->chain);
}

struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
return new common_sampler {
/* .params = */ gsmpl->params,
-/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
+/* .grmr = */ jarvis_sampler_clone(gsmpl->grmr),
-/* .chain = */ llama_sampler_clone(gsmpl->chain),
+/* .chain = */ jarvis_sampler_clone(gsmpl->chain),
/* .prev = */ gsmpl->prev,
/* .cur = */ gsmpl->cur,
/* .cur_p = */ gsmpl->cur_p,
};
}

-void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
+void common_perf_print(const struct jarvis_context * ctx, const struct common_sampler * gsmpl) {
// TODO: measure grammar performance

if (gsmpl) {
-llama_perf_sampler_print(gsmpl->chain);
+jarvis_perf_sampler_print(gsmpl->chain);
}
if (ctx) {
-llama_perf_context_print(ctx);
+jarvis_perf_context_print(ctx);
}
}

-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+jarvis_token common_sampler_sample(struct common_sampler * gsmpl, struct jarvis_context * ctx, int idx, bool grammar_first) {
gsmpl->set_logits(ctx, idx);

auto & grmr = gsmpl->grmr;

@@ -285,14 +285,14 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
auto & cur_p = gsmpl->cur_p; // initialized by set_logits

if (grammar_first) {
-llama_sampler_apply(grmr, &cur_p);
+jarvis_sampler_apply(grmr, &cur_p);
}

-llama_sampler_apply(chain, &cur_p);
+jarvis_sampler_apply(chain, &cur_p);

GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");

-const llama_token id = cur_p.data[cur_p.selected].id;
+const jarvis_token id = cur_p.data[cur_p.selected].id;

if (grammar_first) {
return id;

@@ -300,10 +300,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co

// check if it the sampled token fits the grammar
{
-llama_token_data single_token_data = { id, 1.0f, 0.0f };
+jarvis_token_data single_token_data = { id, 1.0f, 0.0f };
-llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
+jarvis_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };

-llama_sampler_apply(grmr, &single_token_data_array);
+jarvis_sampler_apply(grmr, &single_token_data_array);

const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
if (is_valid) {

@@ -315,8 +315,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
gsmpl->set_logits(ctx, idx);

-llama_sampler_apply(grmr, &cur_p);
+jarvis_sampler_apply(grmr, &cur_p);
-llama_sampler_apply(chain, &cur_p);
+jarvis_sampler_apply(chain, &cur_p);

GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");

@@ -324,31 +324,31 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
}

uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
-return llama_sampler_get_seed(gsmpl->chain);
+return jarvis_sampler_get_seed(gsmpl->chain);
}

// helpers

-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
+jarvis_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
return &gsmpl->cur_p;
}

-llama_token common_sampler_last(const struct common_sampler * gsmpl) {
+jarvis_token common_sampler_last(const struct common_sampler * gsmpl) {
return gsmpl->prev.rat(0);
}

std::string common_sampler_print(const struct common_sampler * gsmpl) {
std::string result = "logits ";

-for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
+for (int i = 0; i < jarvis_sampler_chain_n(gsmpl->chain); i++) {
-const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
+const auto * smpl = jarvis_sampler_chain_get(gsmpl->chain, i);
-result += std::string("-> ") + llama_sampler_name(smpl) + " ";
+result += std::string("-> ") + jarvis_sampler_name(smpl) + " ";
}

return result;
}

-std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
+std::string common_sampler_prev_str(common_sampler * gsmpl, jarvis_context * ctx_main, int n) {
n = std::min(n, (int) gsmpl->prev.size());

if (n <= 0) {

@@ -359,9 +359,9 @@ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_
result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab

for (int i = n - 1; i >= 0; i--) {
-const llama_token id = gsmpl->prev.rat(i);
+const jarvis_token id = gsmpl->prev.rat(i);

-GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
+GGML_ASSERT(id != JARVIS_TOKEN_NULL && "null token in the sampling history - should not happen");

result += common_token_to_piece(ctx_main, id);
}
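The re-sampling path above is the slow path: by default the chain samples first and the grammar only has to vet a single candidate, falling back to grammar-constrained re-sampling when that candidate is rejected. A minimal sketch of that single-token check, mirroring the code in common_sampler_sample (it assumes the types and jarvis_sampler_apply are available from jarvis.h, exactly as used above):

#include "jarvis.h"
#include <cmath>

// Returns true if `grmr` (a grammar sampler) would accept `id` as the next token.
// The grammar sampler marks a rejected token by setting its logit to -INFINITY.
static bool grammar_accepts(struct jarvis_sampler * grmr, jarvis_token id) {
    jarvis_token_data       single       = { id, 1.0f, 0.0f };
    jarvis_token_data_array single_array = { &single, 1, -1, false };

    jarvis_sampler_apply(grmr, &single_array);

    return single_array.data[0].logit != -INFINITY;
}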
@@ -1,13 +1,13 @@
#pragma once

-#include "llama.h"
+#include "jarvis.h"

#include "common.h"

#include <string>
#include <vector>

-// common_sampler extends llama_sampler with additional functionality:
+// common_sampler extends jarvis_sampler with additional functionality:
//
// - grammar support
// - custom sampler logic based on the parameters

@@ -24,7 +24,7 @@
// grammar constraints are applied to the full vocabulary and the token is resampled.
//
// The common_sampler also maintains a container with the last accepted tokens. In the future, this can
-// be moved into the core llama library.
+// be moved into the core jarvis library.
//
// For convenience, the common_sampler also maintains a container with the current candidate tokens.
// This can be used to access the probabilities of the rest of the non-sampled tokens.

@@ -34,19 +34,19 @@

struct common_sampler;

-// llama_sampler API overloads
+// jarvis_sampler API overloads

-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
+struct common_sampler * common_sampler_init(const struct jarvis_model * model, const struct common_sampler_params & params);

void common_sampler_free(struct common_sampler * gsmpl);

// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
-void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
+void common_sampler_accept(struct common_sampler * gsmpl, jarvis_token token, bool accept_grammar);
void common_sampler_reset (struct common_sampler * gsmpl);
struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);

// arguments can be nullptr to skip printing
-void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
+void common_perf_print(const struct jarvis_context * ctx, const struct common_sampler * gsmpl);

// extended sampling implementation:
//

@@ -58,23 +58,23 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+jarvis_token common_sampler_sample(struct common_sampler * gsmpl, struct jarvis_context * ctx, int idx, bool grammar_first = false);

uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);

// helpers

// access the internal list of current candidate tokens
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
+jarvis_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);

// get the last accepted token
-llama_token common_sampler_last(const struct common_sampler * gsmpl);
+jarvis_token common_sampler_last(const struct common_sampler * gsmpl);

// print the sampler chain into a string
std::string common_sampler_print(const struct common_sampler * gsmpl);

// get a string representation of the last accepted tokens
-std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
+std::string common_sampler_prev_str(common_sampler * gsmpl, jarvis_context * ctx, int n);

char common_sampler_type_to_chr(enum common_sampler_type cnstr);
std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
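As a usage note for the header above, the intended call pattern is init once, then sample and accept in a loop. A minimal sketch under stated assumptions (the include path "sampling.h" is assumed, the decode step for each position is elided because it is not part of this header, and idx == -1 is assumed to select the logits of the last evaluated position as in the upstream API):

#include "sampling.h"   // assumed path for the header shown above

void generate_n(struct jarvis_model * model, struct jarvis_context * ctx,
                const struct common_sampler_params & params, int n_predict) {
    struct common_sampler * smpl = common_sampler_init(model, params);

    for (int i = 0; i < n_predict; ++i) {
        // ... evaluate the model for the next position here (decode call not shown) ...

        // sample from the logits of the last evaluated position
        const jarvis_token id = common_sampler_sample(smpl, ctx, /*idx =*/ -1);

        // record the token in both the sampling chain and the grammar state
        common_sampler_accept(smpl, id, /*accept_grammar =*/ true);
    }

    common_perf_print(ctx, smpl);
    common_sampler_free(smpl);
}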
@@ -34,7 +34,7 @@ struct train_state * init_train_state() {
state->opt = new struct ggml_opt_context;
state->opt->ctx = NULL;
state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
-state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
+state->opt->params.graph_size = JARVIS_TRAIN_MAX_NODES;
state->opt->loss_after = 0.0f;

return state;

@@ -213,7 +213,7 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
}

int64_t get_example_targets_batch(
-struct llama_context * lctx,
+struct jarvis_context * lctx,
struct ggml_tensor * tokens_input,
struct ggml_tensor * target_probs,
int64_t example_id,

@@ -221,7 +221,7 @@ int64_t get_example_targets_batch(
const size_t * samples_begin,
const size_t * samples_size,
size_t samples_count,
-const llama_token * train_data,
+const jarvis_token * train_data,
size_t n_train_data,
bool separate_with_eos,
bool separate_with_bos,

@@ -241,8 +241,8 @@ int64_t get_example_targets_batch(
int64_t used_samples = 0;

ggml_set_f32(target_probs, 0.0f);
-llama_token bos = llama_token_bos(llama_get_model(lctx));
+jarvis_token bos = jarvis_token_bos(jarvis_get_model(lctx));
-llama_token eos = llama_token_eos(llama_get_model(lctx));
+jarvis_token eos = jarvis_token_eos(jarvis_get_model(lctx));
// printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
for (int k=0; k<n_batch; ++k) {
// printf("%s: batch %d\n", __func__, k);

@@ -259,7 +259,7 @@ int64_t get_example_targets_batch(
bool sample_separation_eos = !separate_with_eos;
bool sample_separation_bos = !separate_with_bos;
for (int64_t i=0; i<n_tokens; ++i) {
-llama_token token = eos;
+jarvis_token token = eos;
if (sample_offs >= sample_size && fill_with_next_samples) {
if (!sample_separation_eos) {
// insert eos token to separate samples

@@ -281,7 +281,7 @@ int64_t get_example_targets_batch(
}
// note: no else-if here
if (sample_offs < sample_size) {
-token = clamp(train_data[sample_begin+sample_offs], 0, (llama_token) (n_vocab - 1));
+token = clamp(train_data[sample_begin+sample_offs], 0, (jarvis_token) (n_vocab - 1));
++sample_offs;
}
ggml_set_f32_nd(target_probs, token, (int) i, (int) k, 0, +1.0f);

@@ -712,12 +712,12 @@ void save_train_state_gguf(struct gguf_context * fctx, struct train_state * trai
}


-struct llama_file {
+struct jarvis_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;

-llama_file(const char * fname, const char * mode) {
+jarvis_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
if (fp == NULL) {
size = 0;

@@ -788,7 +788,7 @@ struct llama_file {
write_raw(&val, sizeof(val));
}

-~llama_file() {
+~jarvis_file() {
if (fp) {
std::fclose(fp);
}

@@ -823,16 +823,16 @@ static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nu
}

size_t tokenize_file(
-struct llama_context * lctx,
+struct jarvis_context * lctx,
const char * filename,
const std::string & sample_start,
bool include_sample_start,
bool overlapping_samples,
unsigned context_length,
-std::vector<llama_token> & out_tokens,
+std::vector<jarvis_token> & out_tokens,
std::vector<size_t> & out_samples_begin,
std::vector<size_t> & out_samples_size) {
-struct llama_file f(filename, "rb");
+struct jarvis_file f(filename, "rb");

if (f.size == 0) {
out_tokens.clear();

@@ -844,7 +844,7 @@ size_t tokenize_file(
}

// account for possible leading whitespace that will be added by tokenizer
-// e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
+// e.g. '\t' will be tokenized by jarvis spm tokenizer to [29871, 12]
const int n_max_tokens_overhead = 1;

std::vector<char> buf;

@@ -862,8 +862,8 @@ size_t tokenize_file(
// tokenize all data at once
out_tokens.resize(buf.size() + n_max_tokens_overhead);

-int n_tokens = llama_tokenize(
+int n_tokens = jarvis_tokenize(
-llama_get_model(lctx),
+jarvis_get_model(lctx),
buf.data(),
(int) buf.size(),
out_tokens.data(),

@@ -871,8 +871,8 @@ size_t tokenize_file(
false, false);
if (n_tokens < 0) {
out_tokens.resize(-n_tokens);
-n_tokens = llama_tokenize(
+n_tokens = jarvis_tokenize(
-llama_get_model(lctx),
+jarvis_get_model(lctx),
buf.data(),
(int) buf.size(),
out_tokens.data(),

@@ -915,7 +915,7 @@ size_t tokenize_file(
out_samples_size.resize(out_samples_begin.size(), 0);

std::vector<char> buf_sample;
-std::vector<llama_token> tok_sample;
+std::vector<jarvis_token> tok_sample;

const size_t sample_begin_offset = (include_sample_start ? 0 : sample_start.size());
size_t found_too_big_sample = 0;

@@ -925,11 +925,11 @@ size_t tokenize_file(
size_t found_max_sample_size = 0;

size_t max_token_text_size = 0;
-int n_vocab = llama_n_vocab(llama_get_model(lctx));
+int n_vocab = jarvis_n_vocab(jarvis_get_model(lctx));
-for (llama_token token=0; token < n_vocab; ++token) {
+for (jarvis_token token=0; token < n_vocab; ++token) {
max_token_text_size = std::max(
max_token_text_size,
-strlen(llama_token_get_text(llama_get_model(lctx), token)));
+strlen(jarvis_token_get_text(jarvis_get_model(lctx), token)));
}

// upper bound of context byte length.

@@ -957,7 +957,7 @@ size_t tokenize_file(
}

if (sample_size > 0) {
-// llama_tokenize expects zero terminated string,
+// jarvis_tokenize expects zero terminated string,
// copy sample into buffer and zero terminate it.
buf_sample.resize(sample_size);
memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size);

@@ -966,7 +966,7 @@ size_t tokenize_file(

// tokenize the sample
tok_sample.resize(buf_sample.size() + n_max_tokens_overhead);
-int n_tokens = llama_tokenize(llama_get_model(lctx),
+int n_tokens = jarvis_tokenize(jarvis_get_model(lctx),
buf_sample.data(),
(int) buf_sample.size(),
tok_sample.data(),

@@ -974,7 +974,7 @@ size_t tokenize_file(
false, false);
if (n_tokens < 0) {
tok_sample.resize(-n_tokens);
-n_tokens = llama_tokenize(llama_get_model(lctx),
+n_tokens = jarvis_tokenize(jarvis_get_model(lctx),
buf_sample.data(),
(int) buf_sample.size(),
tok_sample.data(),

@@ -1365,7 +1365,7 @@ bool consume_common_train_arg(
*invalid_param = true;
return true;
}
-if (llama_supports_gpu_offload()) {
+if (jarvis_supports_gpu_offload()) {
params->n_gpu_layers = std::stoi(argv[i]);
} else {
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
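The tokenize calls above rely on a size-probe convention: when the output buffer is too small, the tokenizer returns the negative of the required token count, and the caller resizes and retries. A minimal sketch of that convention (hypothetical helper; the buffer-capacity argument between out_tokens.data() and the two trailing false flags is elided in the hunk above and is an assumption here):

#include "jarvis.h"
#include <string>
#include <vector>

static std::vector<jarvis_token> tokenize_text(const struct jarvis_model * model, const std::string & text) {
    std::vector<jarvis_token> tokens(8); // deliberately small so the retry path is exercised

    int n_tokens = jarvis_tokenize(model, text.data(), (int) text.size(),
                                   tokens.data(), (int) tokens.size(),  // assumed capacity argument
                                   false, false);                        // trailing flags as in the calls above
    if (n_tokens < 0) {
        tokens.resize(-n_tokens);  // negative return = required token count
        n_tokens = jarvis_tokenize(model, text.data(), (int) text.size(),
                                   tokens.data(), (int) tokens.size(),
                                   false, false);
    }

    tokens.resize(n_tokens);
    return tokens;
}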
@@ -7,9 +7,9 @@
#include <vector>

#include "ggml.h"
-#include "llama.h"
+#include "jarvis.h"

-#define LLAMA_TRAIN_MAX_NODES 16384
+#define JARVIS_TRAIN_MAX_NODES 16384

typedef std::string mt19937_state;

@@ -92,9 +92,9 @@ struct train_opt_callback_data {
struct train_state * train;
save_train_files_callback save_cb;
void * save_data;
-struct llama_context * lctx;
+struct jarvis_context * lctx;
int last_save_iter;
-llama_token * tokens_data;
+jarvis_token * tokens_data;
size_t tokens_size;
size_t * samples_begin;
size_t * samples_size;

@@ -146,18 +146,18 @@ void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);

size_t tokenize_file(
-struct llama_context * lctx,
+struct jarvis_context * lctx,
const char * filename,
const std::string & sample_start,
bool include_sample_start,
bool overlapping_samples,
unsigned context_length,
-std::vector<llama_token> & out_tokens,
+std::vector<jarvis_token> & out_tokens,
std::vector<size_t> & out_samples_begin,
std::vector<size_t> & out_samples_size);

int64_t get_example_targets_batch(
-struct llama_context * lctx,
+struct jarvis_context * lctx,
struct ggml_tensor * tokens_input,
struct ggml_tensor * target_probs,
int64_t example_id,

@@ -165,7 +165,7 @@ int64_t get_example_targets_batch(
const size_t * samples_begin,
const size_t * samples_size,
size_t samples_count,
-const llama_token * train_data,
+const jarvis_token * train_data,
size_t n_train_data,
bool separate_with_eos,
bool separate_with_bos,
@ -49,7 +49,7 @@ class Model:
|
||||||
_model_classes: dict[str, type[Model]] = {}
|
_model_classes: dict[str, type[Model]] = {}
|
||||||
|
|
||||||
dir_model: Path
|
dir_model: Path
|
||||||
ftype: gguf.LlamaFileType
|
ftype: gguf.JarvisFileType
|
||||||
fname_out: Path
|
fname_out: Path
|
||||||
is_big_endian: bool
|
is_big_endian: bool
|
||||||
endianess: gguf.GGUFEndian
|
endianess: gguf.GGUFEndian
|
||||||
|
@ -69,7 +69,7 @@ class Model:
|
||||||
# subclasses should define this!
|
# subclasses should define this!
|
||||||
model_arch: gguf.MODEL_ARCH
|
model_arch: gguf.MODEL_ARCH
|
||||||
|
|
||||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
|
def __init__(self, dir_model: Path, ftype: gguf.JarvisFileType, fname_out: Path, is_big_endian: bool = False,
|
||||||
use_temp_file: bool = False, eager: bool = False,
|
use_temp_file: bool = False, eager: bool = False,
|
||||||
metadata_override: Path | None = None, model_name: str | None = None,
|
metadata_override: Path | None = None, model_name: str | None = None,
|
||||||
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
|
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
|
||||||
|
@ -96,15 +96,15 @@ class Model:
|
||||||
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
|
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
|
||||||
|
|
||||||
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
|
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
|
||||||
if self.ftype == gguf.LlamaFileType.GUESSED:
|
if self.ftype == gguf.JarvisFileType.GUESSED:
|
||||||
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
|
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
|
||||||
_, first_tensor = next(self.get_tensors())
|
_, first_tensor = next(self.get_tensors())
|
||||||
if first_tensor.dtype == torch.float16:
|
if first_tensor.dtype == torch.float16:
|
||||||
logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
|
logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
|
||||||
self.ftype = gguf.LlamaFileType.MOSTLY_F16
|
self.ftype = gguf.JarvisFileType.MOSTLY_F16
|
||||||
else:
|
else:
|
||||||
logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
|
logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
|
||||||
self.ftype = gguf.LlamaFileType.MOSTLY_BF16
|
self.ftype = gguf.JarvisFileType.MOSTLY_BF16
|
||||||
|
|
||||||
# Configure GGUF Writer
|
# Configure GGUF Writer
|
||||||
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
|
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
|
||||||
|
@ -308,7 +308,7 @@ class Model:
|
||||||
if n_dims <= 1 or new_name.endswith("_norm.weight"):
|
if n_dims <= 1 or new_name.endswith("_norm.weight"):
|
||||||
data_qtype = gguf.GGMLQuantizationType.F32
|
data_qtype = gguf.GGMLQuantizationType.F32
|
||||||
|
|
||||||
# Conditions should closely match those in llama_model_quantize_internal in llama.cpp
|
# Conditions should closely match those in jarvis_model_quantize_internal in jarvis.cpp
|
||||||
# Some tensor types are always in float32
|
# Some tensor types are always in float32
|
||||||
if data_qtype is False and (
|
if data_qtype is False and (
|
||||||
any(
|
any(
|
||||||
|
@ -337,25 +337,25 @@ class Model:
|
||||||
)
|
)
|
||||||
):
|
):
|
||||||
if self.ftype in (
|
if self.ftype in (
|
||||||
gguf.LlamaFileType.MOSTLY_TQ1_0,
|
gguf.JarvisFileType.MOSTLY_TQ1_0,
|
||||||
gguf.LlamaFileType.MOSTLY_TQ2_0,
|
gguf.JarvisFileType.MOSTLY_TQ2_0,
|
||||||
):
|
):
|
||||||
# TODO: use Q4_K and Q6_K
|
# TODO: use Q4_K and Q6_K
|
||||||
data_qtype = gguf.GGMLQuantizationType.F16
|
data_qtype = gguf.GGMLQuantizationType.F16
|
||||||
|
|
||||||
# No override (data_qtype is False), or wants to be quantized (data_qtype is True)
|
# No override (data_qtype is False), or wants to be quantized (data_qtype is True)
|
||||||
if isinstance(data_qtype, bool):
|
if isinstance(data_qtype, bool):
|
||||||
if self.ftype == gguf.LlamaFileType.ALL_F32:
|
if self.ftype == gguf.JarvisFileType.ALL_F32:
|
||||||
data_qtype = gguf.GGMLQuantizationType.F32
|
data_qtype = gguf.GGMLQuantizationType.F32
|
||||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
|
elif self.ftype == gguf.JarvisFileType.MOSTLY_F16:
|
||||||
data_qtype = gguf.GGMLQuantizationType.F16
|
data_qtype = gguf.GGMLQuantizationType.F16
|
||||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
|
elif self.ftype == gguf.JarvisFileType.MOSTLY_BF16:
|
||||||
data_qtype = gguf.GGMLQuantizationType.BF16
|
data_qtype = gguf.GGMLQuantizationType.BF16
|
||||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
|
elif self.ftype == gguf.JarvisFileType.MOSTLY_Q8_0:
|
||||||
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
||||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
|
elif self.ftype == gguf.JarvisFileType.MOSTLY_TQ1_0:
|
||||||
data_qtype = gguf.GGMLQuantizationType.TQ1_0
|
data_qtype = gguf.GGMLQuantizationType.TQ1_0
|
||||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
|
elif self.ftype == gguf.JarvisFileType.MOSTLY_TQ2_0:
|
||||||
data_qtype = gguf.GGMLQuantizationType.TQ2_0
|
data_qtype = gguf.GGMLQuantizationType.TQ2_0
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown file type: {self.ftype.name}")
|
raise ValueError(f"Unknown file type: {self.ftype.name}")
|
||||||
|
@ -394,7 +394,7 @@ class Model:
|
||||||
if self.metadata.size_label is None and total_params > 0:
|
if self.metadata.size_label is None and total_params > 0:
|
||||||
self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
|
self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
|
||||||
|
|
||||||
# Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
|
# Extract the encoding scheme from the file type name. e.g. 'gguf.JarvisFileType.MOSTLY_Q8_0' --> 'Q8_0'
|
||||||
output_type: str = self.ftype.name.partition("_")[2]
|
output_type: str = self.ftype.name.partition("_")[2]
|
||||||
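For readers unfamiliar with `str.partition`, a quick check of what the line above returns:

```python
# partition("_") splits on the first underscore and keeps everything after it.
name = "MOSTLY_Q8_0"
print(name.partition("_"))        # ('MOSTLY', '_', 'Q8_0')
print(name.partition("_")[2])     # 'Q8_0'
print("ALL_F32".partition("_")[2])  # 'F32'
```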
|
|
||||||
# Filename Output
|
# Filename Output
|
||||||
|
@ -537,13 +537,13 @@ class Model:
|
||||||
|
|
||||||
# NOTE: this function is generated by convert_hf_to_gguf_update.py
|
# NOTE: this function is generated by convert_hf_to_gguf_update.py
|
||||||
# do not modify it manually!
|
# do not modify it manually!
|
||||||
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
|
# ref: https://github.com/ggerganov/jarvis.cpp/pull/6920
|
||||||
# Marker: Start get_vocab_base_pre
|
# Marker: Start get_vocab_base_pre
|
||||||
def get_vocab_base_pre(self, tokenizer) -> str:
|
def get_vocab_base_pre(self, tokenizer) -> str:
|
||||||
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
||||||
# is specific for the BPE pre-tokenizer used by the model
|
# is specific for the BPE pre-tokenizer used by the model
|
||||||
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
|
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
|
||||||
# use in llama.cpp to implement the same pre-tokenizer
|
# use in jarvis.cpp to implement the same pre-tokenizer
|
||||||
|
|
||||||
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
||||||
|
|
||||||
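A minimal sketch of the fingerprinting idea described in the comments above: tokenize the check string with the model's tokenizer, then hash the resulting token ids so that tokenizers with different BPE pre-tokenization produce different identifiers. The function name and the exact hashing details are assumptions for illustration.

```python
from hashlib import sha256

def pre_tokenizer_fingerprint(tokenizer, chktxt: str) -> str:
    # assumption: hashing the stringified token ids is enough to tell pre-tokenizers apart
    chktok = tokenizer.encode(chktxt)  # list of token ids
    return sha256(str(chktok).encode()).hexdigest()
```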
|
@ -559,8 +559,8 @@ class Model:
|
||||||
# or pull the latest version of the model from Huggingface
|
# or pull the latest version of the model from Huggingface
|
||||||
# don't edit the hashes manually!
|
# don't edit the hashes manually!
|
||||||
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
|
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
|
||||||
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
# ref: https://huggingface.co/meta-jarvis/Meta-Jarvis-3-8B
|
||||||
res = "llama-bpe"
|
res = "jarvis-bpe"
|
||||||
if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
|
if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
|
||||||
# ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
|
# ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
|
||||||
res = "deepseek-llm"
|
res = "deepseek-llm"
|
||||||
|
@ -616,7 +616,7 @@ class Model:
|
||||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
|
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
|
||||||
res = "jina-v2-de"
|
res = "jina-v2-de"
|
||||||
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
|
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
|
||||||
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
|
# ref: https://huggingface.co/abacusai/Smaug-Jarvis-3-70B-Instruct
|
||||||
res = "smaug-bpe"
|
res = "smaug-bpe"
|
||||||
if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
|
if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
|
||||||
# ref: https://huggingface.co/LumiOpen/Poro-34B-chat
|
# ref: https://huggingface.co/LumiOpen/Poro-34B-chat
|
||||||
|
@ -666,7 +666,7 @@ class Model:
|
||||||
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
|
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
|
||||||
logger.warning("** - the pre-tokenization config has changed upstream")
|
logger.warning("** - the pre-tokenization config has changed upstream")
|
||||||
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
|
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
|
||||||
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
|
logger.warning("** ref: https://github.com/ggerganov/jarvis.cpp/pull/6920")
|
||||||
logger.warning("**")
|
logger.warning("**")
|
||||||
logger.warning(f"** chkhsh: {chkhsh}")
|
logger.warning(f"** chkhsh: {chkhsh}")
|
||||||
logger.warning("**************************************************************************************")
|
logger.warning("**************************************************************************************")
|
||||||
|
@ -746,7 +746,7 @@ class Model:
|
||||||
def _set_vocab_sentencepiece(self, add_to_gguf=True):
|
def _set_vocab_sentencepiece(self, add_to_gguf=True):
|
||||||
tokens, scores, toktypes = self._create_vocab_sentencepiece()
|
tokens, scores, toktypes = self._create_vocab_sentencepiece()
|
||||||
|
|
||||||
self.gguf_writer.add_tokenizer_model("llama")
|
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||||
self.gguf_writer.add_tokenizer_pre("default")
|
self.gguf_writer.add_tokenizer_pre("default")
|
||||||
self.gguf_writer.add_token_list(tokens)
|
self.gguf_writer.add_token_list(tokens)
|
||||||
self.gguf_writer.add_token_scores(scores)
|
self.gguf_writer.add_token_scores(scores)
|
||||||
|
@ -835,8 +835,8 @@ class Model:
|
||||||
|
|
||||||
return tokens, scores, toktypes
|
return tokens, scores, toktypes
|
||||||
|
|
||||||
def _set_vocab_llama_hf(self):
|
def _set_vocab_jarvis_hf(self):
|
||||||
vocab = gguf.LlamaHfVocab(self.dir_model)
|
vocab = gguf.JarvisHfVocab(self.dir_model)
|
||||||
tokens = []
|
tokens = []
|
||||||
scores = []
|
scores = []
|
||||||
toktypes = []
|
toktypes = []
|
||||||
|
@ -848,7 +848,7 @@ class Model:
|
||||||
|
|
||||||
assert len(tokens) == vocab.vocab_size
|
assert len(tokens) == vocab.vocab_size
|
||||||
|
|
||||||
self.gguf_writer.add_tokenizer_model("llama")
|
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||||
self.gguf_writer.add_tokenizer_pre("default")
|
self.gguf_writer.add_tokenizer_pre("default")
|
||||||
self.gguf_writer.add_token_list(tokens)
|
self.gguf_writer.add_token_list(tokens)
|
||||||
self.gguf_writer.add_token_scores(scores)
|
self.gguf_writer.add_token_scores(scores)
|
||||||
|
@ -857,7 +857,7 @@ class Model:
|
||||||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||||
special_vocab.add_to_gguf(self.gguf_writer)
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
|
|
||||||
def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
|
def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "jarvis-spm"], vocab_size: int):
|
||||||
tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
|
tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
|
||||||
logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
|
logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
|
||||||
vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
|
vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
|
||||||
|
@ -875,7 +875,7 @@ class Model:
|
||||||
assert field # token list
|
assert field # token list
|
||||||
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
|
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
|
||||||
|
|
||||||
if model_name == "llama-spm":
|
if model_name == "jarvis-spm":
|
||||||
field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
|
field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
|
||||||
assert field # token scores
|
assert field # token scores
|
||||||
self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
|
self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
|
||||||
|
@ -884,7 +884,7 @@ class Model:
|
||||||
assert field # token types
|
assert field # token types
|
||||||
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
|
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
|
||||||
|
|
||||||
if model_name != "llama-spm":
|
if model_name != "jarvis-spm":
|
||||||
field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
|
field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
|
||||||
assert field # token merges
|
assert field # token merges
|
||||||
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
|
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
|
||||||
|
@ -1226,7 +1226,7 @@ class XverseModel(Model):
|
||||||
tokens.append(token_text)
|
tokens.append(token_text)
|
||||||
toktypes.append(toktype)
|
toktypes.append(toktype)
|
||||||
|
|
||||||
self.gguf_writer.add_tokenizer_model("llama")
|
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||||
self.gguf_writer.add_tokenizer_pre("default")
|
self.gguf_writer.add_tokenizer_pre("default")
|
||||||
self.gguf_writer.add_token_list(tokens)
|
self.gguf_writer.add_token_list(tokens)
|
||||||
self.gguf_writer.add_token_types(toktypes)
|
self.gguf_writer.add_token_types(toktypes)
|
||||||
|
@ -1515,21 +1515,21 @@ class StableLMModel(Model):
|
||||||
raise ValueError(f"Unprocessed norms: {norms}")
|
raise ValueError(f"Unprocessed norms: {norms}")
|
||||||
|
|
||||||
|
|
||||||
@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
|
@Model.register("LLaMAForCausalLM", "JarvisForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
|
||||||
class LlamaModel(Model):
|
class JarvisModel(Model):
|
||||||
model_arch = gguf.MODEL_ARCH.LLAMA
|
model_arch = gguf.MODEL_ARCH.JARVIS
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
try:
|
try:
|
||||||
self._set_vocab_sentencepiece()
|
self._set_vocab_sentencepiece()
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
try:
|
try:
|
||||||
self._set_vocab_llama_hf()
|
self._set_vocab_jarvis_hf()
|
||||||
except (FileNotFoundError, TypeError):
|
except (FileNotFoundError, TypeError):
|
||||||
# Llama 3
|
# Jarvis 3
|
||||||
self._set_vocab_gpt2()
|
self._set_vocab_gpt2()
|
||||||
|
|
||||||
# Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
|
# Apply to CodeJarvis only (and ignore for Jarvis 3 with a vocab size of 128256)
|
||||||
if self.hparams.get("vocab_size", 32000) == 32016:
|
if self.hparams.get("vocab_size", 32000) == 32016:
|
||||||
special_vocab = gguf.SpecialVocab(
|
special_vocab = gguf.SpecialVocab(
|
||||||
self.dir_model, load_merges=False,
|
self.dir_model, load_merges=False,
|
||||||
|
@ -1583,9 +1583,9 @@ class LlamaModel(Model):
|
||||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||||
|
|
||||||
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
||||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
data_torch = JarvisModel.permute(data_torch, n_head, n_head)
|
||||||
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
||||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head)
|
||||||
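The `permute` helper called above is not shown in this hunk. A sketch of what such a rotary-layout permute typically does — regrouping interleaved rotary pairs so the first half of each head holds the even components and the second half the odd ones — is given below; the body is an assumption that merely matches the call sites above.

```python
import torch
from torch import Tensor

def permute(weights: Tensor, n_head: int, n_head_kv: int | None) -> Tensor:
    # sketch: for k_proj the effective head count is the kv head count
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))
```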
|
|
||||||
# process the experts separately
|
# process the experts separately
|
||||||
if name.find("block_sparse_moe.experts") != -1:
|
if name.find("block_sparse_moe.experts") != -1:
|
||||||
|
@ -1625,7 +1625,7 @@ class LlamaModel(Model):
|
||||||
|
|
||||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
||||||
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
if rope_scaling.get("rope_type", '').lower() == "jarvis3":
|
||||||
base = self.hparams.get("rope_theta", 10000.0)
|
base = self.hparams.get("rope_theta", 10000.0)
|
||||||
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||||
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
||||||
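To make the inverse-frequency formula above concrete, a small standalone check of the shape and range of the resulting values (numbers are illustrative only):

```python
import torch

base, dim = 10000.0, 128
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
print(freqs.shape)        # torch.Size([64]) -- one frequency per rotary pair
print(freqs[0].item())    # 1.0 for the fastest-rotating pair
print(freqs[-1].item())   # ~1.15e-4, approaching 1/base for the slowest pair
```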
|
@ -1793,7 +1793,7 @@ class DbrxModel(Model):
|
||||||
|
|
||||||
# Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
|
# Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
|
||||||
# original implementation expects (n_expert, n_ff, n_embd) for all experts weights
|
# original implementation expects (n_expert, n_ff, n_embd) for all experts weights
|
||||||
# But llama.cpp moe graph works differently
|
# But jarvis.cpp moe graph works differently
|
||||||
# AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
|
# AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
|
||||||
# so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
|
# so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
|
||||||
exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
|
exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
|
||||||
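A toy illustration of the layout note above (the dimensions are made up): a stacked expert weight viewed as 3D in pytorch keeps `(n_expert, n_ff, n_embd)`, while the same data described on the ggml side lists its axes in reverse order.

```python
import torch

n_expert, n_ff, n_embd = 4, 6, 8
w = torch.arange(n_expert * n_ff * n_embd, dtype=torch.float32)
w3 = w.view(n_expert, n_ff, n_embd)   # pytorch shape: (n_expert, n_ff, n_embd)
print(w3.shape)                       # torch.Size([4, 6, 8])
# ggml_tensor->ne lists dimensions fastest-varying first, i.e. {n_embd, n_ff, n_expert}
print(tuple(reversed(w3.shape)))      # (8, 6, 4)
```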
|
@ -1842,7 +1842,7 @@ class MiniCPMModel(Model):
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
self._set_vocab_llama_hf()
|
self._set_vocab_jarvis_hf()
|
||||||
|
|
||||||
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
|
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
|
||||||
if n_kv_head is not None and n_head != n_kv_head:
|
if n_kv_head is not None and n_head != n_kv_head:
|
||||||
|
@ -2188,7 +2188,7 @@ class Phi3MiniModel(Model):
|
||||||
if foken_data.get("special"):
|
if foken_data.get("special"):
|
||||||
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
||||||
|
|
||||||
self.gguf_writer.add_tokenizer_model("llama")
|
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||||
self.gguf_writer.add_tokenizer_pre("default")
|
self.gguf_writer.add_tokenizer_pre("default")
|
||||||
self.gguf_writer.add_token_list(tokens)
|
self.gguf_writer.add_token_list(tokens)
|
||||||
self.gguf_writer.add_token_scores(scores)
|
self.gguf_writer.add_token_scores(scores)
|
||||||
|
@ -2456,7 +2456,7 @@ class InternLM2Model(Model):
|
||||||
if foken_data.get("special"):
|
if foken_data.get("special"):
|
||||||
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
||||||
|
|
||||||
self.gguf_writer.add_tokenizer_model("llama")
|
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||||
self.gguf_writer.add_tokenizer_pre("default")
|
self.gguf_writer.add_tokenizer_pre("default")
|
||||||
self.gguf_writer.add_token_list(tokens)
|
self.gguf_writer.add_token_list(tokens)
|
||||||
self.gguf_writer.add_token_scores(scores)
|
self.gguf_writer.add_token_scores(scores)
|
||||||
|
@ -2468,7 +2468,7 @@ class InternLM2Model(Model):
|
||||||
if chat_eos_token_id is not None:
|
if chat_eos_token_id is not None:
|
||||||
# For the chat model, we replace the eos with '<|im_end|>'.
|
# For the chat model, we replace the eos with '<|im_end|>'.
|
||||||
# TODO: this is a hack, should be fixed
|
# TODO: this is a hack, should be fixed
|
||||||
# https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
|
# https://github.com/ggerganov/jarvis.cpp/pull/6745#issuecomment-2067687048
|
||||||
special_vocab.special_token_ids["eos"] = chat_eos_token_id
|
special_vocab.special_token_ids["eos"] = chat_eos_token_id
|
||||||
logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
|
logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
|
||||||
" in chat mode so that the conversation can end normally.")
|
" in chat mode so that the conversation can end normally.")
|
||||||
|
@ -2505,8 +2505,8 @@ class InternLM2Model(Model):
|
||||||
q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
|
q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
|
||||||
|
|
||||||
# The model weights of q and k require an additional reshape.
|
# The model weights of q and k require an additional reshape.
|
||||||
q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
|
q = JarvisModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
|
||||||
k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
|
k = JarvisModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
|
||||||
v = v.reshape((-1, v.shape[-1]))
|
v = v.reshape((-1, v.shape[-1]))
|
||||||
|
|
||||||
return [
|
return [
|
||||||
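A toy demonstration of the qkv slicing above, with shapes chosen purely for illustration: within one kv group, the first `q_per_kv` slots are queries and the last two slots are k and v.

```python
import torch

q_per_kv, head_dim = 4, 8
qkv = torch.zeros(3, q_per_kv + 2, head_dim)   # (groups, q_per_kv + 2, head_dim) -- assumed layout
q, k, v = qkv[:, :q_per_kv], qkv[:, -2], qkv[:, -1]
print(q.shape, k.shape, v.shape)               # (3, 4, 8) (3, 8) (3, 8)
```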
|
@ -2769,7 +2769,7 @@ class GemmaModel(Model):
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
del bid # unused
|
del bid # unused
|
||||||
|
|
||||||
# lm_head is not used in llama.cpp, while autoawq will include this tensor in model
|
# lm_head is not used in jarvis.cpp, while autoawq will include this tensor in model
|
||||||
# To prevent errors, skip loading lm_head.weight.
|
# To prevent errors, skip loading lm_head.weight.
|
||||||
if name == "lm_head.weight":
|
if name == "lm_head.weight":
|
||||||
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
|
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
|
||||||
|
@ -2816,7 +2816,7 @@ class Gemma2Model(Model):
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
del bid # unused
|
del bid # unused
|
||||||
|
|
||||||
# lm_head is not used in llama.cpp, while autoawq will include this tensor in model
|
# lm_head is not used in jarvis.cpp, while autoawq will include this tensor in model
|
||||||
# To prevent errors, skip loading lm_head.weight.
|
# To prevent errors, skip loading lm_head.weight.
|
||||||
if name == "lm_head.weight":
|
if name == "lm_head.weight":
|
||||||
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
|
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
|
||||||
|
@ -2894,7 +2894,7 @@ class Rwkv6Model(Model):
|
||||||
self.gguf_writer.add_feed_forward_length(intermediate_size)
|
self.gguf_writer.add_feed_forward_length(intermediate_size)
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
# required by llama.cpp, unused
|
# required by jarvis.cpp, unused
|
||||||
self.gguf_writer.add_head_count(0)
|
self.gguf_writer.add_head_count(0)
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
@ -3024,7 +3024,7 @@ class OlmoModel(Model):
|
||||||
self.gguf_writer.add_clamp_kqv(clip_qkv)
|
self.gguf_writer.add_clamp_kqv(clip_qkv)
|
||||||
|
|
||||||
# Same as super class, but permuting q_proj, k_proj
|
# Same as super class, but permuting q_proj, k_proj
|
||||||
# Copied from: LlamaModel
|
# Copied from: JarvisModel
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
del bid # unused
|
del bid # unused
|
||||||
|
|
||||||
|
@ -3032,9 +3032,9 @@ class OlmoModel(Model):
|
||||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||||
|
|
||||||
if name.endswith("q_proj.weight"):
|
if name.endswith("q_proj.weight"):
|
||||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
data_torch = JarvisModel.permute(data_torch, n_head, n_head)
|
||||||
if name.endswith("k_proj.weight"):
|
if name.endswith("k_proj.weight"):
|
||||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head)
|
||||||
|
|
||||||
return [(self.map_tensor_name(name), data_torch)]
|
return [(self.map_tensor_name(name), data_torch)]
|
||||||
|
|
||||||
|
@ -3174,12 +3174,12 @@ class OpenELMModel(Model):
|
||||||
assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
|
assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
|
||||||
assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
|
assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
|
||||||
|
|
||||||
# Uses the tokenizer from meta-llama/Llama-2-7b-hf
|
# Uses the tokenizer from meta-jarvis/Jarvis-2-7b-hf
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
try:
|
try:
|
||||||
self._set_vocab_sentencepiece()
|
self._set_vocab_sentencepiece()
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"])
|
self._set_vocab_builtin("jarvis-spm", self.hparams["vocab_size"])
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
n_embd = self._n_embd
|
n_embd = self._n_embd
|
||||||
|
@ -3300,7 +3300,7 @@ class ArcticModel(Model):
|
||||||
toktypes[token_id] = token_type
|
toktypes[token_id] = token_type
|
||||||
scores[token_id] = token_score
|
scores[token_id] = token_score
|
||||||
|
|
||||||
self.gguf_writer.add_tokenizer_model("llama")
|
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||||
self.gguf_writer.add_tokenizer_pre("default")
|
self.gguf_writer.add_tokenizer_pre("default")
|
||||||
self.gguf_writer.add_token_list(tokens)
|
self.gguf_writer.add_token_list(tokens)
|
||||||
self.gguf_writer.add_token_scores(scores)
|
self.gguf_writer.add_token_scores(scores)
|
||||||
|
@ -3322,9 +3322,9 @@ class ArcticModel(Model):
|
||||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||||
|
|
||||||
if name.endswith("q_proj.weight"):
|
if name.endswith("q_proj.weight"):
|
||||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
data_torch = JarvisModel.permute(data_torch, n_head, n_head)
|
||||||
if name.endswith("k_proj.weight"):
|
if name.endswith("k_proj.weight"):
|
||||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head)
|
||||||
|
|
||||||
# process the experts separately
|
# process the experts separately
|
||||||
if name.find("block_sparse_moe.experts") != -1:
|
if name.find("block_sparse_moe.experts") != -1:
|
||||||
|
@ -3882,7 +3882,7 @@ class ChatGLMModel(Model):
|
||||||
scores.append(score)
|
scores.append(score)
|
||||||
toktypes.append(toktype)
|
toktypes.append(toktype)
|
||||||
|
|
||||||
self.gguf_writer.add_tokenizer_model("llama")
|
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||||
# glm3 needs prefix and suffix formatted as:
|
# glm3 needs prefix and suffix formatted as:
|
||||||
# prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
|
# prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
|
||||||
self.gguf_writer.add_tokenizer_pre("chatglm-spm")
|
self.gguf_writer.add_tokenizer_pre("chatglm-spm")
|
||||||
|
@ -4087,7 +4087,7 @@ class ExaoneModel(Model):
|
||||||
|
|
||||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
||||||
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
if rope_scaling.get("rope_type", '').lower() == "jarvis3":
|
||||||
base = self.hparams.get("rope_theta", 10000.0)
|
base = self.hparams.get("rope_theta", 10000.0)
|
||||||
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||||
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
||||||
|
@ -4116,12 +4116,12 @@ class ExaoneModel(Model):
|
||||||
|
|
||||||
|
|
||||||
@Model.register("GraniteForCausalLM")
|
@Model.register("GraniteForCausalLM")
|
||||||
class GraniteModel(LlamaModel):
|
class GraniteModel(JarvisModel):
|
||||||
"""Conversion for IBM's GraniteForCausalLM"""
|
"""Conversion for IBM's GraniteForCausalLM"""
|
||||||
model_arch = gguf.MODEL_ARCH.GRANITE
|
model_arch = gguf.MODEL_ARCH.GRANITE
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
"""Granite uses standard llama parameters with the following differences:
|
"""Granite uses standard jarvis parameters with the following differences:
|
||||||
|
|
||||||
- No head_dim support
|
- No head_dim support
|
||||||
- New multiplier params:
|
- New multiplier params:
|
||||||
|
@ -4196,9 +4196,9 @@ class ChameleonModel(Model):
|
||||||
hidden_dim = self.hparams.get("hidden_size")
|
hidden_dim = self.hparams.get("hidden_size")
|
||||||
|
|
||||||
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
||||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
data_torch = JarvisModel.permute(data_torch, n_head, n_head)
|
||||||
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
||||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head)
|
||||||
if name.endswith(("q_norm.weight", "q_norm.bias")):
|
if name.endswith(("q_norm.weight", "q_norm.bias")):
|
||||||
data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
|
data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
|
||||||
if name.endswith(("k_norm.weight", "k_norm.bias")):
|
if name.endswith(("k_norm.weight", "k_norm.bias")):
|
||||||
|
@ -4379,14 +4379,14 @@ def main() -> None:
|
||||||
logger.error(f'Error: {args.model} is not a directory')
|
logger.error(f'Error: {args.model} is not a directory')
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
ftype_map: dict[str, gguf.LlamaFileType] = {
|
ftype_map: dict[str, gguf.JarvisFileType] = {
|
||||||
"f32": gguf.LlamaFileType.ALL_F32,
|
"f32": gguf.JarvisFileType.ALL_F32,
|
||||||
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
"f16": gguf.JarvisFileType.MOSTLY_F16,
|
||||||
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
"bf16": gguf.JarvisFileType.MOSTLY_BF16,
|
||||||
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
"q8_0": gguf.JarvisFileType.MOSTLY_Q8_0,
|
||||||
"tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
|
"tq1_0": gguf.JarvisFileType.MOSTLY_TQ1_0,
|
||||||
"tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
|
"tq2_0": gguf.JarvisFileType.MOSTLY_TQ2_0,
|
||||||
"auto": gguf.LlamaFileType.GUESSED,
|
"auto": gguf.JarvisFileType.GUESSED,
|
||||||
}
|
}
|
||||||
|
|
||||||
is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
|
is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
|
||||||
|
|
|
@ -5,10 +5,10 @@
|
||||||
# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
|
# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
|
||||||
#
|
#
|
||||||
# This is necessary in order to analyze the type of pre-tokenizer used by the model and
|
# This is necessary in order to analyze the type of pre-tokenizer used by the model and
|
||||||
# provide the necessary information to llama.cpp via the GGUF header in order to implement
|
# provide the necessary information to jarvis.cpp via the GGUF header in order to implement
|
||||||
# the same pre-tokenizer.
|
# the same pre-tokenizer.
|
||||||
#
|
#
|
||||||
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
|
# ref: https://github.com/ggerganov/jarvis.cpp/pull/6920
|
||||||
#
|
#
|
||||||
# Instructions:
|
# Instructions:
|
||||||
#
|
#
|
||||||
|
@ -18,9 +18,9 @@
|
||||||
# python3 convert_hf_to_gguf_update.py <huggingface_token>
|
# python3 convert_hf_to_gguf_update.py <huggingface_token>
|
||||||
#
|
#
|
||||||
# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
|
# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
|
||||||
# - Update llama.cpp with the new pre-tokenizer if necessary
|
# - Update jarvis.cpp with the new pre-tokenizer if necessary
|
||||||
#
|
#
|
||||||
# TODO: generate tokenizer tests for llama.cpp
|
# TODO: generate tokenizer tests for jarvis.cpp
|
||||||
#
|
#
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
@ -65,8 +65,8 @@ else:
|
||||||
|
|
||||||
# TODO: add models here, base models preferred
|
# TODO: add models here, base models preferred
|
||||||
models = [
|
models = [
|
||||||
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
|
{"name": "jarvis-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-jarvis/Jarvis-2-7b-hf", },
|
||||||
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
|
{"name": "jarvis-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-jarvis/Meta-Jarvis-3-8B", },
|
||||||
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
|
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
|
||||||
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
|
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
|
||||||
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
|
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
|
||||||
|
@ -86,7 +86,7 @@ models = [
|
||||||
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
||||||
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||||
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
|
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Jarvis-3-70B-Instruct", },
|
||||||
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
|
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
|
||||||
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
|
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
|
||||||
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
|
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
|
||||||
|
@ -215,7 +215,7 @@ src_func = f"""
|
||||||
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
||||||
# is specific for the BPE pre-tokenizer used by the model
|
# is specific for the BPE pre-tokenizer used by the model
|
||||||
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
|
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
|
||||||
# use in llama.cpp to implement the same pre-tokenizer
|
# use in jarvis.cpp to implement the same pre-tokenizer
|
||||||
|
|
||||||
chktxt = {repr(CHK_TXT)}
|
chktxt = {repr(CHK_TXT)}
|
||||||
|
|
||||||
|
@ -239,7 +239,7 @@ src_func = f"""
|
||||||
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
|
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
|
||||||
logger.warning("** - the pre-tokenization config has changed upstream")
|
logger.warning("** - the pre-tokenization config has changed upstream")
|
||||||
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
|
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
|
||||||
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
|
logger.warning("** ref: https://github.com/ggerganov/jarvis.cpp/pull/6920")
|
||||||
logger.warning("**")
|
logger.warning("**")
|
||||||
logger.warning(f"** chkhsh: {{chkhsh}}")
|
logger.warning(f"** chkhsh: {{chkhsh}}")
|
||||||
logger.warning("**************************************************************************************")
|
logger.warning("**************************************************************************************")
|
||||||
|
@ -311,7 +311,7 @@ tests = [
|
||||||
"3333333",
|
"3333333",
|
||||||
"33333333",
|
"33333333",
|
||||||
"333333333",
|
"333333333",
|
||||||
"Cửa Việt", # llama-bpe fails on this
|
"Cửa Việt", # jarvis-bpe fails on this
|
||||||
" discards",
|
" discards",
|
||||||
CHK_TXT,
|
CHK_TXT,
|
||||||
]
|
]
|
||||||
|
|
|
@ -223,13 +223,13 @@ class GGMLToGGUF:
|
||||||
assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
|
assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
|
||||||
logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
|
logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
|
||||||
self.n_kv_head = n_kv_head
|
self.n_kv_head = n_kv_head
|
||||||
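Illustrative only, under the assumption that the GQA parameter is the ratio of attention heads to key/value heads: a head count divisible by it yields the guess logged above, and anything else leaves the guess undetermined.

```python
def guess_n_kv_head(n_head: int, gqa: int) -> int | None:
    # assumption: n_kv_head is simply n_head divided by the GQA factor when it divides evenly
    if gqa <= 0 or n_head % gqa != 0:
        return None  # caller asserts and aborts in this case
    return n_head // gqa

print(guess_n_kv_head(64, 8))  # 8
```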
self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
|
self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.JARVIS, ggml_model.hyperparameters.n_layer)
|
||||||
|
|
||||||
def save(self):
|
def save(self):
|
||||||
logger.info('* Preparing to save GGUF file')
|
logger.info('* Preparing to save GGUF file')
|
||||||
gguf_writer = gguf.GGUFWriter(
|
gguf_writer = gguf.GGUFWriter(
|
||||||
self.cfg.output,
|
self.cfg.output,
|
||||||
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
|
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.JARVIS],
|
||||||
use_temp_file = False)
|
use_temp_file = False)
|
||||||
self.add_params(gguf_writer)
|
self.add_params(gguf_writer)
|
||||||
self.add_vocab(gguf_writer)
|
self.add_vocab(gguf_writer)
|
||||||
|
@ -286,7 +286,7 @@ class GGMLToGGUF:
|
||||||
|
|
||||||
def add_vocab(self, gguf_writer):
|
def add_vocab(self, gguf_writer):
|
||||||
hp = self.model.hyperparameters
|
hp = self.model.hyperparameters
|
||||||
gguf_writer.add_tokenizer_model('llama')
|
gguf_writer.add_tokenizer_model('jarvis')
|
||||||
gguf_writer.add_tokenizer_pre('default')
|
gguf_writer.add_tokenizer_pre('default')
|
||||||
tokens = []
|
tokens = []
|
||||||
scores = []
|
scores = []
|
||||||
|
@ -358,7 +358,7 @@ class GGMLToGGUF:
|
||||||
|
|
||||||
|
|
||||||
def handle_metadata(cfg, hp):
|
def handle_metadata(cfg, hp):
|
||||||
import examples.convert_legacy_llama as convert
|
import examples.convert_legacy_jarvis as convert
|
||||||
|
|
||||||
assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
|
assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
|
||||||
hf_config_path = cfg.model_metadata_dir / "config.json"
|
hf_config_path = cfg.model_metadata_dir / "config.json"
|
||||||
|
|
|
@ -271,12 +271,12 @@ if __name__ == '__main__':
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||||
|
|
||||||
ftype_map: dict[str, gguf.LlamaFileType] = {
|
ftype_map: dict[str, gguf.JarvisFileType] = {
|
||||||
"f32": gguf.LlamaFileType.ALL_F32,
|
"f32": gguf.JarvisFileType.ALL_F32,
|
||||||
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
"f16": gguf.JarvisFileType.MOSTLY_F16,
|
||||||
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
"bf16": gguf.JarvisFileType.MOSTLY_BF16,
|
||||||
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
"q8_0": gguf.JarvisFileType.MOSTLY_Q8_0,
|
||||||
"auto": gguf.LlamaFileType.GUESSED,
|
"auto": gguf.JarvisFileType.GUESSED,
|
||||||
}
|
}
|
||||||
|
|
||||||
ftype = ftype_map[args.outtype]
|
ftype = ftype_map[args.outtype]
|
||||||
|
@ -372,9 +372,9 @@ if __name__ == '__main__':
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
dest = list(super().modify_tensors(data_torch, name, bid))
|
dest = list(super().modify_tensors(data_torch, name, bid))
|
||||||
# some archs may have the same tensor for lm_head and output (tie word embeddings)
|
# some archs may have the same tensor for lm_head and output (tie word embeddings)
|
||||||
# in this case, adapters targeting lm_head will fail when using llama-export-lora
|
# in this case, adapters targeting lm_head will fail when using jarvis-export-lora
|
||||||
# therefore, we ignore them for now
|
# therefore, we ignore them for now
|
||||||
# see: https://github.com/ggerganov/llama.cpp/issues/9065
|
# see: https://github.com/ggerganov/jarvis.cpp/issues/9065
|
||||||
if name == "lm_head.weight" and len(dest) == 0:
|
if name == "lm_head.weight" and len(dest) == 0:
|
||||||
raise ValueError("lm_head is present in adapter, but is ignored in base model")
|
raise ValueError("lm_head is present in adapter, but is ignored in base model")
|
||||||
for dest_name, dest_data in dest:
|
for dest_name, dest_data in dest:
|
||||||
|
|
|
@ -5,14 +5,14 @@
|
||||||
|
|
||||||
[Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid.
|
[Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid.
|
||||||
|
|
||||||
With Termux, you can install and run `llama.cpp` as if the environment were Linux. Once in the Termux shell:
|
With Termux, you can install and run `jarvis.cpp` as if the environment were Linux. Once in the Termux shell:
|
||||||
|
|
||||||
```
|
```
|
||||||
$ apt update && apt upgrade -y
|
$ apt update && apt upgrade -y
|
||||||
$ apt install git cmake
|
$ apt install git cmake
|
||||||
```
|
```
|
||||||
|
|
||||||
Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake.
|
Then, follow the [build instructions](https://github.com/ggerganov/jarvis.cpp/blob/master/docs/build.md), specifically for CMake.
|
||||||
|
|
||||||
Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance:
|
Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance:
|
||||||
|
|
||||||
|
@ -20,22 +20,22 @@ Once the binaries are built, download your model of choice (e.g., from Hugging F
|
||||||
$ curl -L {model-url} -o ~/{model}.gguf
|
$ curl -L {model-url} -o ~/{model}.gguf
|
||||||
```
|
```
|
||||||
|
|
||||||
Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:
|
Then, if you are not already in the repo directory, `cd` into `jarvis.cpp` and:
|
||||||
|
|
||||||
```
|
```
|
||||||
$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
|
$ ./build/bin/jarvis-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
|
||||||
```
|
```
|
||||||
|
|
||||||
Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
|
Here, we show `jarvis-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
|
||||||
|
|
||||||
To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:
|
To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:
|
||||||
|
|
||||||
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
|
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
|
||||||
|
|
||||||
## Cross-compile using Android NDK
|
## Cross-compile using Android NDK
|
||||||
It's possible to build `llama.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis).
|
It's possible to build `jarvis.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis).
|
||||||
|
|
||||||
Once you're ready and have cloned `llama.cpp`, invoke the following in the project directory:
|
Once you're ready and have cloned `jarvis.cpp`, invoke the following in the project directory:
|
||||||
|
|
||||||
```
|
```
|
||||||
$ cmake \
|
$ cmake \
|
||||||
|
@ -45,15 +45,15 @@ $ cmake \
|
||||||
-DCMAKE_C_FLAGS="-march=armv8.7a" \
|
-DCMAKE_C_FLAGS="-march=armv8.7a" \
|
||||||
-DCMAKE_CXX_FLAGS="-march=armv8.7a" \
|
-DCMAKE_CXX_FLAGS="-march=armv8.7a" \
|
||||||
-DGGML_OPENMP=OFF \
|
-DGGML_OPENMP=OFF \
|
||||||
-DGGML_LLAMAFILE=OFF \
|
-DGGML_JARVISFILE=OFF \
|
||||||
-B build-android
|
-B build-android
|
||||||
```
|
```
|
||||||
|
|
||||||
Notes:
|
Notes:
|
||||||
- While later versions of the Android NDK ship with OpenMP, it must still be installed by CMake as a dependency, which is not supported at this time.
|
- While later versions of the Android NDK ship with OpenMP, it must still be installed by CMake as a dependency, which is not supported at this time.
|
||||||
- `llamafile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/llamafile/issues/325)
|
- `jarvisfile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/jarvisfile/issues/325)
|
||||||
|
|
||||||
The above command should configure `llama.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `llama.cpp` includes runtime checks for available CPU features it can use.
|
The above command should configure `jarvis.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `jarvis.cpp` includes runtime checks for available CPU features it can use.
|
||||||
|
|
||||||
Feel free to adjust the Android ABI for your target. Once the project is configured:
|
Feel free to adjust the Android ABI for your target. Once the project is configured:
|
||||||
|
|
||||||
|
@ -65,17 +65,17 @@ $ cmake --install build-android --prefix {install-dir} --config Release
|
||||||
After installing, go ahead and download the model of your choice to your host system. Then:
|
After installing, go ahead and download the model of your choice to your host system. Then:
|
||||||
|
|
||||||
```
|
```
|
||||||
$ adb shell "mkdir /data/local/tmp/llama.cpp"
|
$ adb shell "mkdir /data/local/tmp/jarvis.cpp"
|
||||||
$ adb push {install-dir} /data/local/tmp/llama.cpp/
|
$ adb push {install-dir} /data/local/tmp/jarvis.cpp/
|
||||||
$ adb push {model}.gguf /data/local/tmp/llama.cpp/
|
$ adb push {model}.gguf /data/local/tmp/jarvis.cpp/
|
||||||
$ adb shell
|
$ adb shell
|
||||||
```
|
```
|
||||||
|
|
||||||
In the `adb shell`:
|
In the `adb shell`:
|
||||||
|
|
||||||
```
|
```
|
||||||
$ cd /data/local/tmp/llama.cpp
|
$ cd /data/local/tmp/jarvis.cpp
|
||||||
$ LD_LIBRARY_PATH=lib ./bin/llama-simple -m {model}.gguf -c {context-size} -p "{your-prompt}"
|
$ LD_LIBRARY_PATH=lib ./bin/jarvis-simple -m {model}.gguf -c {context-size} -p "{your-prompt}"
|
||||||
```
|
```
|
||||||
|
|
||||||
That's it!
|
That's it!
|
||||||
|
|
|
@ -25,13 +25,13 @@ sudo make install
|
||||||
|
|
||||||
We recommend using OpenMP, since it makes it easier to control which cores are used.
|
We recommend using OpenMP, since it makes it easier to control which cores are used.
|
||||||
|
|
||||||
### llama.cpp compilation
|
### jarvis.cpp compilation
|
||||||
|
|
||||||
Makefile:
|
Makefile:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
make GGML_BLIS=1 -j
|
make GGML_BLIS=1 -j
|
||||||
# make GGML_BLIS=1 llama-benchmark-matmult
|
# make GGML_BLIS=1 jarvis-benchmark-matmult
|
||||||
```
|
```
|
||||||
|
|
||||||
CMake:
|
CMake:
|
||||||
|
@ -43,7 +43,7 @@ cmake -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME ..
|
||||||
make -j
|
make -j
|
||||||
```
|
```
|
||||||
|
|
||||||
### llama.cpp execution
|
### jarvis.cpp execution
|
||||||
|
|
||||||
According to the BLIS documentation, we could set the following
|
According to the BLIS documentation, we could set the following
|
||||||
environment variables to modify the behavior of openmp:
|
environment variables to modify the behavior of openmp:
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# llama.cpp for CANN
|
# jarvis.cpp for CANN
|
||||||
|
|
||||||
- [Background](#background)
|
- [Background](#background)
|
||||||
- [News](#news)
|
- [News](#news)
|
||||||
|
@ -17,9 +17,9 @@
|
||||||
|
|
||||||
**CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform.
|
**CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform.
|
||||||
|
|
||||||
**Llama.cpp + CANN**
|
**Jarvis.cpp + CANN**
|
||||||
|
|
||||||
The llama.cpp CANN backend is designed to support the Ascend NPU. It utilizes the AscendC and ACLNN capabilities integrated into the CANN Toolkit and its kernels to drive the Ascend NPU directly.
|
The jarvis.cpp CANN backend is designed to support the Ascend NPU. It utilizes the AscendC and ACLNN capabilities integrated into the CANN Toolkit and its kernels to drive the Ascend NPU directly.
|
||||||
|
|
||||||
## News
|
## News
|
||||||
|
|
||||||
|
@ -78,11 +78,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
||||||
| GritLM-7B | √ | √ | √ |
|
| GritLM-7B | √ | √ | √ |
|
||||||
| internlm2_5-7b-chat | √ | √ | √ |
|
| internlm2_5-7b-chat | √ | √ | √ |
|
||||||
| koala-7B-HF | √ | √ | √ |
|
| koala-7B-HF | √ | √ | √ |
|
||||||
| Llama-2-7b-chat-hf | √ | √ | √ |
|
| Jarvis-2-7b-chat-hf | √ | √ | √ |
|
||||||
| Llama-3-Smaug-8B | √ | √ | √ |
|
| Jarvis-3-Smaug-8B | √ | √ | √ |
|
||||||
| Llama2-Chinese-7b-Chat | √ | √ | √ |
|
| Jarvis2-Chinese-7b-Chat | √ | √ | √ |
|
||||||
| Llama3-8B | √ | √ | √ |
|
| Jarvis3-8B | √ | √ | √ |
|
||||||
| Llama3-8b-chinese | √ | √ | √ |
|
| Jarvis3-8b-chinese | √ | √ | √ |
|
||||||
| mamba-130m-hf | √ | √ | √ |
|
| mamba-130m-hf | √ | √ | √ |
|
||||||
| Mistral-7B-Instruct-v0.2 | √ | √ | √ |
|
| Mistral-7B-Instruct-v0.2 | √ | √ | √ |
|
||||||
| Mixtral-8x7B-Instruct-v0.1 | x | √ | √ |
|
| Mixtral-8x7B-Instruct-v0.1 | x | √ | √ |
|
||||||
|
@ -120,9 +120,9 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
||||||
## Docker
|
## Docker
|
||||||
|
|
||||||
### Build Images
|
### Build Images
|
||||||
You can get an image with llama.cpp in one command.
|
You can get an image with jarvis.cpp in one command.
|
||||||
```sh
|
```sh
|
||||||
docker build -t llama-cpp-cann -f .devops/llama-cli-cann.Dockerfile .
|
docker build -t jarvis-cpp-cann -f .devops/jarvis-cli-cann.Dockerfile .
|
||||||
```
|
```
|
||||||
|
|
||||||
### Run container
|
### Run container
|
||||||
|
@ -133,7 +133,7 @@ npu-smi info
|
||||||
|
|
||||||
# Select the cards that you want to use, and make sure they are not in use by someone else.
|
# Select the cards that you want to use, and make sure they are not in use by someone else.
|
||||||
# The following uses the cards of device0.
|
# The following uses the cards of device0.
|
||||||
docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
|
docker run --name jarviscpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it jarvis-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
|
||||||
```
|
```
|
||||||
|
|
||||||
*Notes:*
|
*Notes:*
|
||||||
|
@ -208,7 +208,7 @@ docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager
|
||||||
|
|
||||||
Upon a successful installation, CANN is enabled for the available Ascend devices.
|
Upon a successful installation, CANN is enabled for the available Ascend devices.
|
||||||
|
|
||||||
### II. Build llama.cpp
|
### II. Build jarvis.cpp
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
|
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
|
||||||
|
@ -242,13 +242,13 @@ cmake --build build --config release
|
||||||
- Use device 0:
|
- Use device 0:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
./build/bin/jarvis-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||||
```
|
```
|
||||||
|
|
||||||
- Use multiple devices:
|
- Use multiple devices:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
./build/bin/jarvis-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||||
```
|
```
|
||||||
|
|
||||||
### **GitHub contribution**:
|
### **GitHub contribution**:
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# llama.cpp for SYCL
|
# jarvis.cpp for SYCL
|
||||||
|
|
||||||
- [Background](#background)
|
- [Background](#background)
|
||||||
- [Recommended Release](#recommended-release)
|
- [Recommended Release](#recommended-release)
|
||||||
|
@ -24,9 +24,9 @@
|
||||||
- **oneAPI LevelZero**: A high-performance, low-level interface for fine-grained control over Intel iGPUs and dGPUs.
|
- **oneAPI LevelZero**: A high-performance, low-level interface for fine-grained control over Intel iGPUs and dGPUs.
|
||||||
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
|
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
|
||||||
|
|
||||||
### Llama.cpp + SYCL
|
### Jarvis.cpp + SYCL
|
||||||
|
|
||||||
The llama.cpp SYCL backend is designed primarily to support **Intel GPUs**. Thanks to the cross-platform nature of SYCL, it also supports GPUs from other vendors: Nvidia and AMD.
|
The jarvis.cpp SYCL backend is designed primarily to support **Intel GPUs**. Thanks to the cross-platform nature of SYCL, it also supports GPUs from other vendors: Nvidia and AMD.
|
||||||
|
|
||||||
## Recommended Release
|
## Recommended Release
|
||||||
|
|
||||||
|
@ -36,7 +36,7 @@ The following release is verified with good quality:
|
||||||
|
|
||||||
|Commit ID|Tag|Release|Verified Platform|
|
|Commit ID|Tag|Release|Verified Platform|
|
||||||
|-|-|-|-|
|
|-|-|-|-|
|
||||||
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
|
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[jarvis-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/jarvis.cpp/releases/download/b3038/jarvis-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
|
||||||
|
|
||||||
|
|
||||||
## News
|
## News
|
||||||
|
@ -46,7 +46,7 @@ The following release is verified with good quality:
|
||||||
- Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
|
- Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
|
||||||
|
|
||||||
- 2024.5
|
- 2024.5
|
||||||
- Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
|
- Performance is increased: 34 -> 37 tokens/s of jarvis-2-7b.Q4_0 on Arc770.
|
||||||
- Arch Linux is verified successfully.
|
- Arch Linux is verified successfully.
|
||||||
|
|
||||||
- 2024.4
|
- 2024.4
|
||||||
|
@ -54,8 +54,8 @@ The following release is verified with good quality:
|
||||||
|
|
||||||
- 2024.3
|
- 2024.3
|
||||||
- Release binary files of Windows.
|
- Release binary files of Windows.
|
||||||
- A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
|
- A blog is published: **Run LLM on all Intel GPUs Using jarvis.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-jarvis-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-jarvis-cpp-fd2e2dcbd9bd).
|
||||||
- New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
|
- New base line is ready: [tag b2437](https://github.com/ggerganov/jarvis.cpp/tree/b2437).
|
||||||
- Support multiple cards: **--split-mode**: [none|layer]; [row] is not supported yet and is still in development.
- Support assigning the main GPU with **--main-gpu**, replacing $GGML_SYCL_DEVICE.
|
||||||
- Support detecting all GPUs with level-zero and same top **Max compute units**.
|
- Support detecting all GPUs with level-zero and same top **Max compute units**.
|
||||||
|
@ -100,9 +100,9 @@ SYCL backend supports Intel GPU Family:
|
||||||
*Notes:*
|
*Notes:*
|
||||||
|
|
||||||
- **Memory**
|
- **Memory**
|
||||||
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
|
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/jarvis-cli`.
|
||||||
|
|
||||||
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For example, the *llama-2-7b.Q4_0* model requires at least 8.0GB for an integrated GPU and 4.0GB for a discrete GPU.
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For example, the *jarvis-2-7b.Q4_0* model requires at least 8.0GB for an integrated GPU and 4.0GB for a discrete GPU.
|
||||||
|
|
||||||
- **Execution Unit (EU)**
|
- **Execution Unit (EU)**
|
||||||
- If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use.
|
- If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use.
|
||||||
|
@ -130,14 +130,14 @@ The docker build option is currently limited to *intel GPU* targets.
|
||||||
### Build image
|
### Build image
|
||||||
```sh
|
```sh
|
||||||
# Using FP16
|
# Using FP16
|
||||||
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
|
docker build -t jarvis-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/jarvis-cli-intel.Dockerfile .
|
||||||
```
|
```
|
||||||
|
|
||||||
*Notes*:
|
*Notes*:
|
||||||
|
|
||||||
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.
|
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.
|
||||||
|
|
||||||
You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
|
You can also use the `.devops/jarvis-server-intel.Dockerfile`, which builds the *"server"* alternative.
|
||||||
|
|
||||||
### Run container
|
### Run container
|
||||||
|
|
||||||
|
@ -145,7 +145,7 @@ You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *
|
||||||
# First, find all the DRI cards
|
# First, find all the DRI cards
|
||||||
ls -la /dev/dri
|
ls -la /dev/dri
|
||||||
# Then, pick the card that you want to use (here for e.g. /dev/dri/card1).
|
# Then, pick the card that you want to use (here for e.g. /dev/dri/card1).
|
||||||
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 jarvis-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||||
```
|
```
|
||||||
|
|
||||||
*Notes:*
|
*Notes:*
|
||||||
|
@ -276,7 +276,7 @@ For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:
|
||||||
[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9]
|
[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9]
|
||||||
```
|
```
|
||||||
|
|
||||||
### II. Build llama.cpp
|
### II. Build jarvis.cpp
|
||||||
|
|
||||||
#### Intel GPU
|
#### Intel GPU
|
||||||
|
|
||||||
|
@ -309,7 +309,7 @@ export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
|
||||||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
|
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
|
||||||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
|
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
|
||||||
|
|
||||||
# Build LLAMA with Nvidia BLAS acceleration through SYCL
|
# Build JARVIS with Nvidia BLAS acceleration through SYCL
|
||||||
|
|
||||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||||
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||||
|
@ -329,7 +329,7 @@ export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
|
||||||
export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
|
export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
|
||||||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
|
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
|
||||||
|
|
||||||
# Build LLAMA with rocBLAS acceleration through SYCL
|
# Build JARVIS with rocBLAS acceleration through SYCL
|
||||||
|
|
||||||
## AMD
|
## AMD
|
||||||
# Use FP32, FP16 is not supported
|
# Use FP32, FP16 is not supported
|
||||||
|
@ -344,7 +344,7 @@ cmake --build build --config Release -j -v
|
||||||
|
|
||||||
#### Retrieve and prepare model
|
#### Retrieve and prepare model
|
||||||
|
|
||||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as an example.
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [jarvis-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Jarvis-2-7B-GGUF/blob/main/jarvis-2-7b.Q4_0.gguf) model as an example.
|
||||||
|
|
||||||
##### Check device
|
##### Check device
|
||||||
|
|
||||||
|
@ -359,7 +359,7 @@ source /opt/intel/oneapi/setvars.sh
|
||||||
Similar to the native `sycl-ls`, available SYCL devices can be queried as follows:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
./build/bin/llama-ls-sycl-device
|
./build/bin/jarvis-ls-sycl-device
|
||||||
```
|
```
|
||||||
|
|
||||||
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *Intel GPUs* it would look like the following:
|
||||||
|
@ -390,12 +390,12 @@ Choose one of following methods to run.
|
||||||
- Use device 0:
|
- Use device 0:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
./examples/sycl/run-llama2.sh 0
|
./examples/sycl/run-jarvis2.sh 0
|
||||||
```
|
```
|
||||||
- Use multiple devices:
|
- Use multiple devices:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
./examples/sycl/run-llama2.sh
|
./examples/sycl/run-jarvis2.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Command line
|
2. Command line
|
||||||
|
@ -418,13 +418,13 @@ Examples:
|
||||||
- Use device 0:
|
- Use device 0:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
ZES_ENABLE_SYSMAN=1 ./build/bin/jarvis-cli -m models/jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||||
```
|
```
|
||||||
|
|
||||||
- Use multiple devices:
|
- Use multiple devices:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
ZES_ENABLE_SYSMAN=1 ./build/bin/jarvis-cli -m models/jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||||
```
|
```
|
||||||
|
|
||||||
*Notes:*
|
*Notes:*
|
||||||
|
@ -492,7 +492,7 @@ a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can
|
||||||
b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
|
b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
|
||||||
|
|
||||||
|
|
||||||
### II. Build llama.cpp
|
### II. Build jarvis.cpp
|
||||||
|
|
||||||
You can download the release package for Windows directly; it includes the binaries and the required oneAPI DLL files.
|
||||||
|
|
||||||
|
@ -506,7 +506,7 @@ Choose one of following methods to build from source code.
|
||||||
|
|
||||||
2. CMake
|
2. CMake
|
||||||
|
|
||||||
On the oneAPI command line window, step into the llama.cpp main directory and run the following:
|
On the oneAPI command line window, step into the jarvis.cpp main directory and run the following:
|
||||||
|
|
||||||
```
|
```
|
||||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||||
|
@ -524,34 +524,34 @@ Or, use CMake presets to build:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
cmake --preset x64-windows-sycl-release
|
cmake --preset x64-windows-sycl-release
|
||||||
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
cmake --build build-x64-windows-sycl-release -j --target jarvis-cli
|
||||||
|
|
||||||
cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
|
cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
|
||||||
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
cmake --build build-x64-windows-sycl-release -j --target jarvis-cli
|
||||||
|
|
||||||
cmake --preset x64-windows-sycl-debug
|
cmake --preset x64-windows-sycl-debug
|
||||||
cmake --build build-x64-windows-sycl-debug -j --target llama-cli
|
cmake --build build-x64-windows-sycl-debug -j --target jarvis-cli
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Visual Studio
|
3. Visual Studio
|
||||||
|
|
||||||
You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
|
You can use Visual Studio to open jarvis.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
|
||||||
|
|
||||||
*Notes:*
|
*Notes:*
|
||||||
|
|
||||||
- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.
|
- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target jarvis-cli`.
|
||||||
|
|
||||||
### III. Run the inference
|
### III. Run the inference
|
||||||
|
|
||||||
#### Retrieve and prepare model
|
#### Retrieve and prepare model
|
||||||
|
|
||||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as an example.
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [jarvis-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Jarvis-2-7B-GGUF/blob/main/jarvis-2-7b.Q4_0.gguf) model as an example.
|
||||||
|
|
||||||
##### Check device
|
##### Check device
|
||||||
|
|
||||||
1. Enable oneAPI running environment
|
1. Enable oneAPI running environment
|
||||||
|
|
||||||
On the oneAPI command line window, run the following and step into the llama.cpp directory:
|
On the oneAPI command line window, run the following and step into the jarvis.cpp directory:
|
||||||
```
|
```
|
||||||
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
|
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
|
||||||
```
|
```
|
||||||
|
@ -561,7 +561,7 @@ On the oneAPI command line window, run the following and step into the llama.cpp
|
||||||
Similar to the native `sycl-ls`, available SYCL devices can be queried as follows:
|
||||||
|
|
||||||
```
|
```
|
||||||
build\bin\llama-ls-sycl-device.exe
|
build\bin\jarvis-ls-sycl-device.exe
|
||||||
```
|
```
|
||||||
|
|
||||||
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *Intel GPUs* it would look like the following:
|
||||||
|
@ -589,7 +589,7 @@ Choose one of following methods to run.
|
||||||
1. Script
|
1. Script
|
||||||
|
|
||||||
```
|
```
|
||||||
examples\sycl\win-run-llama2.bat
|
examples\sycl\win-run-jarvis2.bat
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Command line
|
2. Command line
|
||||||
|
@ -613,13 +613,13 @@ Examples:
|
||||||
- Use device 0:
|
- Use device 0:
|
||||||
|
|
||||||
```
|
```
|
||||||
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
|
build\bin\jarvis-cli.exe -m models\jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
|
||||||
```
|
```
|
||||||
|
|
||||||
- Use multiple devices:
|
- Use multiple devices:
|
||||||
|
|
||||||
```
|
```
|
||||||
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
|
build\bin\jarvis-cli.exe -m models\jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
@ -682,13 +682,13 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||||
```
|
```
|
||||||
Otherwise, please double-check the GPU driver installation steps.
|
Otherwise, please double-check the GPU driver installation steps.
|
||||||
|
|
||||||
- Can I report an Ollama issue on Intel GPU to the llama.cpp SYCL backend?
- Can I report an Ollama issue on Intel GPU to the jarvis.cpp SYCL backend?

No. We can't support Ollama issues directly, because we aren't familiar with Ollama.

We suggest reproducing the issue on llama.cpp and reporting it there; we will support it.
We suggest reproducing the issue on jarvis.cpp and reporting it there; we will support it.

The same applies to other projects that include the llama.cpp SYCL backend.
The same applies to other projects that include the jarvis.cpp SYCL backend.
|
||||||
|
|
||||||
- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`
|
- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`
|
||||||
|
|
||||||
|
|
|
@ -1,13 +1,13 @@
|
||||||
# Build llama.cpp locally
|
# Build jarvis.cpp locally
|
||||||
|
|
||||||
**To get the Code:**
|
**To get the Code:**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/ggerganov/llama.cpp
|
git clone https://github.com/ggerganov/jarvis.cpp
|
||||||
cd llama.cpp
|
cd jarvis.cpp
|
||||||
```
|
```
|
||||||
|
|
||||||
In order to build llama.cpp you have four different options.
|
In order to build jarvis.cpp you have four different options.
|
||||||
|
|
||||||
- Using `make`:
|
- Using `make`:
|
||||||
- On Linux or MacOS:
|
- On Linux or MacOS:
|
||||||
|
@ -21,17 +21,17 @@ In order to build llama.cpp you have four different options.
|
||||||
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
|
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
|
||||||
2. Extract `w64devkit` on your pc.
|
2. Extract `w64devkit` on your pc.
|
||||||
3. Run `w64devkit.exe`.
|
3. Run `w64devkit.exe`.
|
||||||
4. Use the `cd` command to reach the `llama.cpp` folder.
|
4. Use the `cd` command to reach the `jarvis.cpp` folder.
|
||||||
5. From here you can run:
|
5. From here you can run:
|
||||||
```bash
|
```bash
|
||||||
make
|
make
|
||||||
```
|
```
|
||||||
|
|
||||||
- Notes:
|
- Notes:
|
||||||
- For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
|
- For `Q4_0_4_4` quantization type build, add the `GGML_NO_JARVISFILE=1` flag. For example, use `make GGML_NO_JARVISFILE=1`.
|
||||||
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
|
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
|
||||||
- For faster repeated compilation, install [ccache](https://ccache.dev/).
|
- For faster repeated compilation, install [ccache](https://ccache.dev/).
|
||||||
- For debug builds, run `make LLAMA_DEBUG=1`
|
- For debug builds, run `make JARVIS_DEBUG=1`
|
||||||
|
|
||||||
- Using `CMake`:
|
- Using `CMake`:
|
||||||
|
|
||||||
|
@ -42,7 +42,7 @@ In order to build llama.cpp you have four different options.
|
||||||
|
|
||||||
**Notes**:
|
**Notes**:
|
||||||
|
|
||||||
- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
|
- For `Q4_0_4_4` quantization type build, add the `-DGGML_JARVISFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_JARVISFILE=OFF`.
|
||||||
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
|
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
|
||||||
- For faster repeated compilation, install [ccache](https://ccache.dev/).
|
- For faster repeated compilation, install [ccache](https://ccache.dev/).
|
||||||
- For debug builds, there are two cases:
|
- For debug builds, there are two cases:
|
||||||
|
@ -118,7 +118,7 @@ This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS i
|
||||||
4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
|
4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
|
||||||
5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
|
5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
|
||||||
6. Run `w64devkit.exe`.
|
6. Run `w64devkit.exe`.
|
||||||
7. Use the `cd` command to reach the `llama.cpp` folder.
|
7. Use the `cd` command to reach the `jarvis.cpp` folder.
|
||||||
8. From here you can run:
|
8. From here you can run:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
@ -140,13 +140,13 @@ Check [BLIS.md](./backend/BLIS.md) for more information.
|
||||||
|
|
||||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
|
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
|
||||||
|
|
||||||
llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
|
jarvis.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
|
||||||
|
|
||||||
For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
|
For detailed info, please refer to [jarvis.cpp for SYCL](./backend/SYCL.md).
|
||||||
|
|
||||||
### Intel oneMKL
|
### Intel oneMKL
|
||||||
|
|
||||||
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
|
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [jarvis.cpp for SYCL](./backend/SYCL.md).
|
||||||
|
|
||||||
- Using manual oneAPI installation:
|
- Using manual oneAPI installation:
|
||||||
By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you have already sourced the Intel environment script and pass `-DGGML_BLAS=ON` to cmake, the MKL version of BLAS will automatically be selected. Otherwise, please install oneAPI and follow the steps below:
|
||||||
|
@ -159,7 +159,7 @@ Building through oneAPI compilers will make avx_vnni instruction set available f
|
||||||
- Using oneAPI docker image:
|
- Using oneAPI docker image:
|
||||||
If you do not want to source the environment variables and install oneAPI manually, you can also build the code using the Intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
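As a rough sketch of what that container-based build could look like (the image tag, mount path, and CMake flags here are assumptions for illustration, not taken from this guide):

```bash
# Sketch: build inside the oneAPI base image; image tag and flags are assumptions
docker run -it --rm -v "$(pwd):/src" -w /src intel/oneapi-basekit:latest bash -c '
  source /opt/intel/oneapi/setvars.sh &&
  cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp \
        -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx &&
  cmake --build build --config Release -j'
```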
|
||||||
|
|
||||||
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
|
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-jarvis2-on-intel-cpu.html) for more information.
|
||||||
|
|
||||||
### CUDA
|
### CUDA
|
||||||
|
|
||||||
|
@ -300,7 +300,7 @@ Libs: -lvulkan-1
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
```
|
```
|
||||||
Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
|
Switch into the `jarvis.cpp` directory and run `make GGML_VULKAN=1`.
|
||||||
|
|
||||||
#### MSYS2
|
#### MSYS2
|
||||||
Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
|
Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
|
||||||
|
@ -311,7 +311,7 @@ Install [MSYS2](https://www.msys2.org/) and then run the following commands in a
|
||||||
mingw-w64-ucrt-x86_64-vulkan-devel \
|
mingw-w64-ucrt-x86_64-vulkan-devel \
|
||||||
mingw-w64-ucrt-x86_64-shaderc
|
mingw-w64-ucrt-x86_64-shaderc
|
||||||
```
|
```
|
||||||
Switch into `llama.cpp` directory and build using CMake.
|
Switch into `jarvis.cpp` directory and build using CMake.
|
||||||
```sh
|
```sh
|
||||||
cmake -B build -DGGML_VULKAN=ON
|
cmake -B build -DGGML_VULKAN=ON
|
||||||
cmake --build build --config Release
|
cmake --build build --config Release
|
||||||
|
@ -323,10 +323,10 @@ You don't need to install Vulkan SDK. It will be installed inside the container.
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
# Build the image
|
# Build the image
|
||||||
docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
|
docker build -t jarvis-cpp-vulkan -f .devops/jarvis-cli-vulkan.Dockerfile .
|
||||||
|
|
||||||
# Then, use it:
|
# Then, use it:
|
||||||
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 jarvis-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||||
```
|
```
|
||||||
|
|
||||||
**Without docker**:
|
**Without docker**:
|
||||||
|
@ -348,13 +348,13 @@ Alternatively your package manager might be able to provide the appropriate libr
|
||||||
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
|
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
|
||||||
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
|
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
|
||||||
|
|
||||||
Then, build llama.cpp using the cmake command below:
|
Then, build jarvis.cpp using the cmake command below:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cmake -B build -DGGML_VULKAN=1
|
cmake -B build -DGGML_VULKAN=1
|
||||||
cmake --build build --config Release
|
cmake --build build --config Release
|
||||||
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
|
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
|
||||||
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
./bin/jarvis-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
||||||
|
|
||||||
# You should see in the output, ggml_vulkan detected your GPU. For example:
|
# You should see in the output, ggml_vulkan detected your GPU. For example:
|
||||||
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
|
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
|
||||||
|
@ -367,7 +367,7 @@ For more information about Ascend NPU in [Ascend Community](https://www.hiascend
|
||||||
|
|
||||||
Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
|
Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
|
||||||
|
|
||||||
Go to `llama.cpp` directory and build using CMake.
|
Go to `jarvis.cpp` directory and build using CMake.
|
||||||
```bash
|
```bash
|
||||||
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
|
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
|
||||||
cmake --build build --config release
|
cmake --build build --config release
|
||||||
|
@ -375,15 +375,15 @@ cmake --build build --config release
|
||||||
|
|
||||||
You can test with:
|
You can test with:
|
||||||
|
|
||||||
`./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
|
`./build/jarvis-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
|
||||||
|
|
||||||
If the following info is output on screen, you are using `llama.cpp` with the CANN backend:
If the following info is output on screen, you are using `jarvis.cpp` with the CANN backend:
|
||||||
```bash
|
```bash
|
||||||
llm_load_tensors: CANN buffer size = 13313.00 MiB
|
llm_load_tensors: CANN buffer size = 13313.00 MiB
|
||||||
llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
|
jarvis_new_context_with_model: CANN compute buffer size = 1260.81 MiB
|
||||||
```
|
```
|
||||||
|
|
||||||
For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
|
For detailed info, such as model/device supports, CANN install, please refer to [jarvis.cpp for CANN](./backend/CANN.md).
|
||||||
|
|
||||||
### Android
|
### Android
|
||||||
|
|
||||||
|
@ -391,6 +391,6 @@ To read documentation for how to build on Android, [click here](./android.md)
|
||||||
|
|
||||||
### Arm CPU optimized mulmat kernels
|
### Arm CPU optimized mulmat kernels
|
||||||
|
|
||||||
Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
|
Jarvis.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
|
||||||
|
|
||||||
To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
|
To support `Q4_0_4_4`, you must build with `GGML_NO_JARVISFILE=1` (`make`) or `-DGGML_JARVISFILE=OFF` (`cmake`).
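For illustration, the end-to-end flow could look like the following sketch (the `jarvis-quantize` binary name and the model paths are assumptions, not taken from this page):

```bash
# Sketch: build without the jarvisfile kernels, then quantize an f16 model to Q4_0_4_4 (paths are placeholders)
make GGML_NO_JARVISFILE=1 -j 8
./jarvis-quantize path/to/model-f16.gguf path/to/model-Q4_0_4_4.gguf Q4_0_4_4
```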
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
# Add a new model architecture to `llama.cpp`
|
# Add a new model architecture to `jarvis.cpp`
|
||||||
|
|
||||||
Adding a model requires a few steps:
|
||||||
|
|
||||||
1. Convert the model to GGUF
|
1. Convert the model to GGUF
|
||||||
2. Define the model architecture in `llama.cpp`
|
2. Define the model architecture in `jarvis.cpp`
|
||||||
3. Build the GGML graph implementation
|
3. Build the GGML graph implementation
|
||||||
|
|
||||||
After following these steps, you can open a PR.
|
||||||
|
@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
|
||||||
### 1. Convert the model to GGUF
|
### 1. Convert the model to GGUF
|
||||||
|
|
||||||
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
|
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
|
||||||
Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
|
Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_jarvis.py](/examples/convert_legacy_jarvis.py) (for `jarvis/jarvis2` models in `.pth` format).
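For example, a typical invocation might look like this (a sketch, assuming the script keeps its upstream `--outfile` and `--outtype` options; the paths are placeholders):

```bash
# Sketch: convert a Hugging Face checkpoint directory to an f16 GGUF file (paths are placeholders)
python convert_hf_to_gguf.py /path/to/hf-model --outfile model-f16.gguf --outtype f16
```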
|
||||||
|
|
||||||
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
|
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
|
||||||
|
|
||||||
|
@ -81,26 +81,26 @@ Depending on the model configuration, tokenizer, code and tensors layout, you wi
|
||||||
|
|
||||||
NOTE: Tensor names must end with the `.weight` suffix; that is the convention, and several tools like `quantize` expect this suffix to identify the weights.
|
||||||
|
|
||||||
### 2. Define the model architecture in `llama.cpp`
|
### 2. Define the model architecture in `jarvis.cpp`
|
||||||
|
|
||||||
The model params and tensors layout must be defined in `llama.cpp`:
|
The model params and tensors layout must be defined in `jarvis.cpp`:
|
||||||
1. Define a new `llm_arch`
|
1. Define a new `llm_arch`
|
||||||
2. Define the tensors layout in `LLM_TENSOR_NAMES`
|
2. Define the tensors layout in `LLM_TENSOR_NAMES`
|
||||||
3. Add any non standard metadata in `llm_load_hparams`
|
3. Add any non standard metadata in `llm_load_hparams`
|
||||||
4. Create the tensors for inference in `llm_load_tensors`
|
4. Create the tensors for inference in `llm_load_tensors`
|
||||||
5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
|
5. If the model has a RoPE operation, add the rope type in `jarvis_rope_type`
|
||||||
|
|
||||||
NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
|
NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
|
||||||
|
|
||||||
### 3. Build the GGML graph implementation
|
### 3. Build the GGML graph implementation
|
||||||
|
|
||||||
This is the most fun part: you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
This is the most fun part: you have to provide the inference graph implementation of the new model architecture in `jarvis_build_graph`.
|
||||||
|
|
||||||
Have a look at existing implementations such as `build_llama`, `build_dbrx` or `build_bert`.
Have a look at existing implementations such as `build_jarvis`, `build_dbrx` or `build_bert`.
|
||||||
|
|
||||||
When implementing a new graph, please note that the underlying `ggml` backends might not support all of its operations; support for missing backend operations can be added in another PR.
|
||||||
|
|
||||||
Note: to debug the inference graph, you can use [llama-eval-callback](/examples/eval-callback/).
Note: to debug the inference graph, you can use [jarvis-eval-callback](/examples/eval-callback/).
|
||||||
|
|
||||||
## GGUF specification
|
## GGUF specification
|
||||||
|
|
||||||
|
@ -108,12 +108,12 @@ https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
|
||||||
|
|
||||||
## Resources
|
## Resources
|
||||||
|
|
||||||
- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
|
- YaRN RoPE scaling https://github.com/ggerganov/jarvis.cpp/pull/2268
|
||||||
- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009
|
- support Baichuan serial models https://github.com/ggerganov/jarvis.cpp/pull/3009
|
||||||
- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
|
- support attention bias https://github.com/ggerganov/jarvis.cpp/pull/4283
|
||||||
- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
|
- Mixtral support https://github.com/ggerganov/jarvis.cpp/pull/4406
|
||||||
- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
|
- BERT embeddings https://github.com/ggerganov/jarvis.cpp/pull/5423
|
||||||
- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
|
- Grok-1 support https://github.com/ggerganov/jarvis.cpp/pull/6204
|
||||||
- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
|
- Command R Plus support https://github.com/ggerganov/jarvis.cpp/pull/6491
|
||||||
- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
|
- support arch DBRX https://github.com/ggerganov/jarvis.cpp/pull/6515
|
||||||
- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948
|
- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/jarvis.cpp/discussions/2948
|
||||||
|
|
|
@ -51,7 +51,7 @@ rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
|
||||||
Set up and trigger a build in debug mode. You may adapt the arguments as needed, but in this case these are sane defaults.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
|
cmake -DCMAKE_BUILD_TYPE=Debug -DJARVIS_CUDA=1 -DJARVIS_FATAL_WARNINGS=ON ..
|
||||||
make -j
|
make -j
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -71,12 +71,12 @@ This may return output similar to below (focusing on key lines to pay attention
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
...
|
...
|
||||||
1: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
|
1: Test command: ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf"
|
||||||
1: Working Directory: .
|
1: Working Directory: .
|
||||||
Labels: main
|
Labels: main
|
||||||
Test #1: test-tokenizer-0-llama-spm
|
Test #1: test-tokenizer-0-jarvis-spm
|
||||||
...
|
...
|
||||||
4: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-falcon.gguf"
|
4: Test command: ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-falcon.gguf"
|
||||||
4: Working Directory: .
|
4: Working Directory: .
|
||||||
Labels: main
|
Labels: main
|
||||||
Test #4: test-tokenizer-0-falcon
|
Test #4: test-tokenizer-0-falcon
|
||||||
|
@ -86,8 +86,8 @@ Labels: main
|
||||||
#### Step 4: Identify Test Command for Debugging
|
#### Step 4: Identify Test Command for Debugging
|
||||||
|
|
||||||
So for test #1 above, we can identify two pieces of relevant information:
|
||||||
* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
|
* Test Binary: `~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0`
|
||||||
* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`
|
* Test GGUF Model: `~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf`
|
||||||
|
|
||||||
#### Step 5: Run GDB on test command
|
#### Step 5: Run GDB on test command
|
||||||
|
|
||||||
|
@ -100,5 +100,5 @@ gdb --args ${Test Binary} ${Test GGUF Model}
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
gdb --args ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
|
gdb --args ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf"
|
||||||
```
|
```
|
||||||
|
|
|
@ -1,23 +1,23 @@
|
||||||
# Token generation performance troubleshooting
|
# Token generation performance troubleshooting
|
||||||
|
|
||||||
## Verifying that the model is running on the GPU with CUDA
|
## Verifying that the model is running on the GPU with CUDA
|
||||||
Make sure you compiled llama with the correct env variables according to [this guide](/docs/build.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
|
Make sure you compiled jarvis with the correct env variables according to [this guide](/docs/build.md#cuda), so that jarvis accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running jarvis, you may configure `N` to be very large, and jarvis will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
|
||||||
```shell
|
```shell
|
||||||
./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
|
./jarvis-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
|
||||||
```
|
```
|
||||||
|
|
||||||
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
|
When running jarvis, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
|
||||||
```shell
|
```shell
|
||||||
llama_model_load_internal: [cublas] offloading 60 layers to GPU
|
jarvis_model_load_internal: [cublas] offloading 60 layers to GPU
|
||||||
llama_model_load_internal: [cublas] offloading output layer to GPU
|
jarvis_model_load_internal: [cublas] offloading output layer to GPU
|
||||||
llama_model_load_internal: [cublas] total VRAM used: 17223 MB
|
jarvis_model_load_internal: [cublas] total VRAM used: 17223 MB
|
||||||
... rest of inference
|
... rest of inference
|
||||||
```
|
```
|
||||||
|
|
||||||
If you see these lines, then the GPU is being used.
|
If you see these lines, then the GPU is being used.
|
||||||
|
|
||||||
## Verifying that the CPU is not oversaturated
|
## Verifying that the CPU is not oversaturated
|
||||||
llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
|
jarvis accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
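For example, a quick way to probe this on your machine (a minimal sketch; the model path and prompt are placeholders):

```shell
# Sketch: start with a single thread, then double until token generation stops speeding up
./jarvis-cli -m "path/to/model.gguf" -p "Hello" -n 64 -t 1
./jarvis-cli -m "path/to/model.gguf" -p "Hello" -n 64 -t 2
./jarvis-cli -m "path/to/model.gguf" -p "Hello" -n 64 -t 4
```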
|
||||||
|
|
||||||
# Example of runtime flags effect on inference speed benchmark
|
# Example of runtime flags effect on inference speed benchmark
|
||||||
These runs were tested on the following machine:
|
These runs were tested on the following machine:
|
||||||
|
@ -27,7 +27,7 @@ RAM: 32GB
|
||||||
|
|
||||||
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
|
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
|
||||||
|
|
||||||
Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
|
Run command: `./jarvis-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
|
||||||
|
|
||||||
Result:
|
Result:
|
||||||
|
|
||||||
|
|
|
@ -2,26 +2,26 @@
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
* Docker must be installed and running on your system.
|
* Docker must be installed and running on your system.
|
||||||
* Create a folder to store big models & intermediate files (e.g. /llama/models)
* Create a folder to store big models & intermediate files (e.g. /jarvis/models)
|
||||||
|
|
||||||
## Images
|
## Images
|
||||||
We have three Docker images available for this project:
|
We have three Docker images available for this project:
|
||||||
|
|
||||||
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
|
1. `ghcr.io/ggerganov/jarvis.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
|
||||||
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
2. `ghcr.io/ggerganov/jarvis.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||||
3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
3. `ghcr.io/ggerganov/jarvis.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||||
|
|
||||||
Additionally, the following images are available, similar to the above:
|
||||||
|
|
||||||
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
|
- `ghcr.io/ggerganov/jarvis.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||||
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
|
- `ghcr.io/ggerganov/jarvis.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||||
- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
|
- `ghcr.io/ggerganov/jarvis.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||||
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
- `ghcr.io/ggerganov/jarvis.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||||
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
- `ghcr.io/ggerganov/jarvis.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||||
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
- `ghcr.io/ggerganov/jarvis.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||||
- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
|
- `ghcr.io/ggerganov/jarvis.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||||
- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
|
- `ghcr.io/ggerganov/jarvis.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||||
- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
|
- `ghcr.io/ggerganov/jarvis.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||||
|
|
||||||
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library), you'll need to build the images locally for now.
|
||||||
|
|
||||||
|
@ -32,25 +32,25 @@ The easiest way to download the models, convert them to ggml and optimize them i
|
||||||
Replace `/path/to/models` below with the actual path where you downloaded the models.
|
Replace `/path/to/models` below with the actual path where you downloaded the models.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
|
docker run -v /path/to/models:/models ghcr.io/ggerganov/jarvis.cpp:full --all-in-one "/models/" 7B
|
||||||
```
|
```
|
||||||
|
|
||||||
On completion, you are ready to play!
|
On completion, you are ready to play!
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
docker run -v /path/to/models:/models ghcr.io/ggerganov/jarvis.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
||||||
```
|
```
|
||||||
|
|
||||||
or with a light image:
|
or with a light image:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
docker run -v /path/to/models:/models ghcr.io/ggerganov/jarvis.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
||||||
```
|
```
|
||||||
|
|
||||||
or with a server image:
|
or with a server image:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
|
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/jarvis.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
|
||||||
```
|
```
|
||||||
|
|
||||||
## Docker With CUDA
|
## Docker With CUDA
|
||||||
|
@ -60,9 +60,9 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
|
||||||
## Building Docker locally
|
## Building Docker locally
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
|
docker build -t local/jarvis.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
|
||||||
docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
|
docker build -t local/jarvis.cpp:light-cuda -f .devops/jarvis-cli-cuda.Dockerfile .
|
||||||
docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
|
docker build -t local/jarvis.cpp:server-cuda -f .devops/jarvis-server-cuda.Dockerfile .
|
||||||
```
|
```
|
||||||
|
|
||||||
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
|
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
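For illustration only, overriding a build argument could look like the sketch below (the `CUDA_VERSION` and `CUDA_DOCKER_ARCH` argument names and values are assumptions; check the Dockerfile for the arguments it actually defines):

```bash
# Sketch: override assumed build arguments for a specific CUDA version and GPU architecture
docker build -t local/jarvis.cpp:light-cuda \
  --build-arg CUDA_VERSION=12.4.0 \
  --build-arg CUDA_DOCKER_ARCH=86 \
  -f .devops/jarvis-cli-cuda.Dockerfile .
```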
|
||||||
|
@ -74,18 +74,18 @@ The defaults are:
|
||||||
|
|
||||||
The resulting images are essentially the same as the non-CUDA images:
|
||||||
|
|
||||||
1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
|
1. `local/jarvis.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
|
||||||
2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
|
2. `local/jarvis.cpp:light-cuda`: This image only includes the main executable file.
|
||||||
3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
|
3. `local/jarvis.cpp:server-cuda`: This image only includes the server executable file.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
After building locally, usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
docker run --gpus all -v /path/to/models:/models local/jarvis.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
docker run --gpus all -v /path/to/models:/models local/jarvis.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
docker run --gpus all -v /path/to/models:/models local/jarvis.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
||||||
```
|
```
|
||||||
|
|
||||||
## Docker With MUSA
|
## Docker With MUSA
|
||||||
|
@ -95,9 +95,9 @@ Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/
|
||||||
## Building Docker locally
|
## Building Docker locally
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile .
|
docker build -t local/jarvis.cpp:full-musa -f .devops/full-musa.Dockerfile .
|
||||||
docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile .
|
docker build -t local/jarvis.cpp:light-musa -f .devops/jarvis-cli-musa.Dockerfile .
|
||||||
docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile .
|
docker build -t local/jarvis.cpp:server-musa -f .devops/jarvis-server-musa.Dockerfile .
|
||||||
```
|
```
|
||||||
|
|
||||||
You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture.
|
You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture.
|
||||||
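
As with CUDA, build arguments can be overridden on the `docker build` command line. A minimal sketch, assuming the MUSA Dockerfiles expose a `MUSA_VERSION` build argument (check the `ARG` lines in the `.devops/*-musa.Dockerfile` files for the actual names and defaults):

```bash
# Hypothetical override of the MUSA SDK version used for the build
docker build -t local/jarvis.cpp:light-musa \
  --build-arg MUSA_VERSION=rc3.1.0 \
  -f .devops/jarvis-cli-musa.Dockerfile .
```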
@@ -108,16 +108,16 @@ The defaults are:

The resulting images are essentially the same as the non-MUSA images:

-1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+1. `local/jarvis.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
-2. `local/llama.cpp:light-musa`: This image only includes the main executable file.
+2. `local/jarvis.cpp:light-musa`: This image only includes the main executable file.
-3. `local/llama.cpp:server-musa`: This image only includes the server executable file.
+3. `local/jarvis.cpp:server-musa`: This image only includes the server executable file.

## Usage

After building locally, usage is similar to the non-MUSA examples, but you'll need to set `mthreads` as the default Docker runtime. This can be done by executing `(cd /usr/bin/musa && sudo ./docker setup $PWD)` and verifying the changes by executing `docker info | grep mthreads` on the host machine. You will also want to use the `--n-gpu-layers` flag.

```bash
-docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run -v /path/to/models:/models local/jarvis.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run -v /path/to/models:/models local/jarvis.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+docker run -v /path/to/models:/models local/jarvis.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
```

@@ -1,39 +1,39 @@
-# Install pre-built version of llama.cpp
+# Install pre-built version of jarvis.cpp

## Homebrew

On Mac and Linux, the homebrew package manager can be used via

```sh
-brew install llama.cpp
+brew install jarvis.cpp
```
-The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
+The formula is automatically updated with new `jarvis.cpp` releases. More info: https://github.com/ggerganov/jarvis.cpp/discussions/7668
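
Once installed, the bundled binaries are available on `PATH`. A quick smoke test might look like the following sketch (this assumes the formula installs a `jarvis-cli` executable and that you already have a GGUF model on disk; the path is a placeholder):

```sh
jarvis-cli -m /path/to/model.gguf -p "Building a website can be done in 10 simple steps:" -n 64
```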

## Nix

On Mac and Linux, the Nix package manager can be used via

```sh
-nix profile install nixpkgs#llama-cpp
+nix profile install nixpkgs#jarvis-cpp
```
For flake enabled installs.

Or

```sh
-nix-env --file '<nixpkgs>' --install --attr llama-cpp
+nix-env --file '<nixpkgs>' --install --attr jarvis-cpp
```

For non-flake enabled installs.

-This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
+This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/jarvis-cpp/package.nix#L164).

## Flox

-On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
+On Mac and Linux, Flox can be used to install jarvis.cpp within a Flox environment via

```sh
-flox install llama-cpp
+flox install jarvis-cpp
```

-Flox follows the nixpkgs build of llama.cpp.
+Flox follows the nixpkgs build of jarvis.cpp.

@@ -13,10 +13,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN)
else()
add_subdirectory(cvector-generator)
-add_subdirectory(baby-llama)
+add_subdirectory(baby-jarvis)
add_subdirectory(batched-bench)
add_subdirectory(batched)
-add_subdirectory(convert-llama2c-to-ggml)
+add_subdirectory(convert-jarvis2c-to-ggml)
add_subdirectory(embedding)
add_subdirectory(eval-callback)
add_subdirectory(export-lora)
@@ -27,7 +27,7 @@ else()
add_subdirectory(gritlm)
add_subdirectory(imatrix)
add_subdirectory(infill)
-add_subdirectory(llama-bench)
+add_subdirectory(jarvis-bench)
add_subdirectory(llava)
add_subdirectory(lookahead)
add_subdirectory(lookup)
@@ -41,7 +41,7 @@ else()
if (GGML_RPC)
add_subdirectory(rpc)
endif()
-if (LLAMA_BUILD_SERVER)
+if (JARVIS_BUILD_SERVER)
add_subdirectory(server)
endif()
if (GGML_SYCL)

@@ -2,7 +2,7 @@
set -e

AI_NAME="${AI_NAME:-Miku}"
-MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
+MODEL="${MODEL:-./models/jarvis-2-7b-chat.ggmlv3.q4_K_M.bin}"
USER_NAME="${USER_NAME:-Anon}"

# Uncomment and adjust to the number of CPU cores you want to use.
@@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD")
fi

-./llama-cli "${GEN_OPTIONS[@]}" \
+./jarvis-cli "${GEN_OPTIONS[@]}" \
--model "$MODEL" \
--in-prefix " " \
--in-suffix "${AI_NAME}:" \

@@ -1,5 +1,5 @@
-set(TARGET llama-baby-llama)
+set(TARGET jarvis-baby-jarvis)
-add_executable(${TARGET} baby-llama.cpp)
+add_executable(${TARGET} baby-jarvis.cpp)
install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

@@ -11,8 +11,8 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

-#ifdef LLAMA_DEFAULT_RMS_EPS
+#ifdef JARVIS_DEFAULT_RMS_EPS
-constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
+constexpr float rms_norm_eps = JARVIS_DEFAULT_RMS_EPS;
#else
constexpr float rms_norm_eps = 5e-6f;
#endif
@@ -71,7 +71,7 @@ static struct ggml_tensor * randomize_tensor(
return tensor;
}

-struct llama_hparams {
+struct jarvis_hparams {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input?
uint32_t n_embd = 4096;
@@ -80,17 +80,17 @@ struct llama_hparams {
uint32_t n_layer = 32;
uint32_t n_rot = 64;

-bool operator!=(const llama_hparams & other) const {
+bool operator!=(const jarvis_hparams & other) const {
-return memcmp(this, &other, sizeof(llama_hparams));
+return memcmp(this, &other, sizeof(jarvis_hparams));
}
};

-static uint32_t get_n_ff(const struct llama_hparams* hparams) {
+static uint32_t get_n_ff(const struct jarvis_hparams* hparams) {
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
return n_ff;
}

-struct llama_hparams_lora {
+struct jarvis_hparams_lora {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input?
uint32_t n_embd = 4096;
@@ -100,12 +100,12 @@ struct llama_hparams_lora {
uint32_t n_rot = 64;
uint32_t n_lora = 64;

-bool operator!=(const llama_hparams_lora & other) const {
+bool operator!=(const jarvis_hparams_lora & other) const {
-return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0;
+return memcmp(this, &other, sizeof(jarvis_hparams_lora)) != 0;
}
};

-struct llama_layer {
+struct jarvis_layer {
// normalization
struct ggml_tensor * attention_norm;

@@ -124,7 +124,7 @@ struct llama_layer {
struct ggml_tensor * w3;
};

-struct llama_layer_lora {
+struct jarvis_layer_lora {
// normalization
struct ggml_tensor * attention_norm;

@@ -148,34 +148,34 @@ struct llama_layer_lora {
};

-struct llama_kv_cache {
+struct jarvis_kv_cache {
struct ggml_context * ctx = NULL;

struct ggml_tensor * k;
struct ggml_tensor * v;

-// llama_ctx_buffer buf;
+// jarvis_ctx_buffer buf;

int n; // number of tokens currently in the cache
};

-struct llama_model {
+struct jarvis_model {
struct ggml_context * ctx = NULL;

-llama_hparams hparams;
+jarvis_hparams hparams;

struct ggml_tensor * tok_embeddings;

struct ggml_tensor * norm;
struct ggml_tensor * output;

-std::vector<llama_layer> layers;
+std::vector<jarvis_layer> layers;
};

-struct llama_model_lora {
+struct jarvis_model_lora {
struct ggml_context * ctx = NULL;

-llama_hparams_lora hparams;
+jarvis_hparams_lora hparams;

struct ggml_tensor * tok_embeddings;

@@ -183,10 +183,10 @@ struct llama_model_lora {
struct ggml_tensor * outputa;
struct ggml_tensor * outputb;

-std::vector<llama_layer_lora> layers;
+std::vector<jarvis_layer_lora> layers;
};

-static void init_model(struct llama_model * model) {
+static void init_model(struct jarvis_model * model) {
const auto & hparams = model->hparams;

const uint32_t n_embd = hparams.n_embd;
@@ -223,7 +223,7 @@ static void init_model(struct llama_model * model) {
}


-static void init_model_lora(struct llama_model_lora * model) {
+static void init_model_lora(struct jarvis_model_lora * model) {
const auto & hparams = model->hparams;

const uint32_t n_embd = hparams.n_embd;
@@ -266,7 +266,7 @@ static void init_model_lora(struct llama_model_lora * model) {
}
}

-static void set_param_model(struct llama_model * model) {
+static void set_param_model(struct jarvis_model * model) {
const auto& hparams = model->hparams;

const uint32_t n_layer = hparams.n_layer;
@@ -292,7 +292,7 @@ static void set_param_model(struct llama_model * model) {
}
}

-static void set_param_model_lora(struct llama_model_lora * model) {
+static void set_param_model_lora(struct jarvis_model_lora * model) {
const auto& hparams = model->hparams;

const uint32_t n_layer = hparams.n_layer;
@@ -323,7 +323,7 @@ static void set_param_model_lora(struct llama_model_lora * model) {
}
}

-static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
+static void randomize_model(struct jarvis_model * model, int seed, float mean, float std, float min, float max) {
const auto & hparams = model->hparams;

const uint32_t n_layer = hparams.n_layer;
@@ -355,7 +355,7 @@ static void randomize_model(struct llama_model * model, int seed, float mean, fl


static void randomize_model_lora(
-struct llama_model_lora * model, int seed, float mean, float std, float min, float max
+struct jarvis_model_lora * model, int seed, float mean, float std, float min, float max
) {
const auto & hparams = model->hparams;

@@ -391,7 +391,7 @@ static void randomize_model_lora(
free_random_normal_distribution(rnd);
}

-static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
+static void init_kv_cache(struct jarvis_kv_cache* cache, struct jarvis_model * model, int n_batch) {
const auto & hparams = model->hparams;

const uint32_t n_ctx = hparams.n_ctx;
@@ -425,7 +425,7 @@ static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * mod
cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
}

-static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
+static bool init_kv_cache_lora(struct jarvis_kv_cache* cache, struct jarvis_model_lora * model, int n_batch) {
const auto & hparams = model->hparams;

const uint32_t n_ctx = hparams.n_ctx;
@@ -462,8 +462,8 @@ static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_
}
static struct ggml_tensor * forward(
-struct llama_model * model,
+struct jarvis_model * model,
-struct llama_kv_cache * cache,
+struct jarvis_kv_cache * cache,
struct ggml_context * ctx0,
struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input,
@@ -472,7 +472,7 @@ static struct ggml_tensor * forward(
) {
const int N = n_tokens;

-struct llama_kv_cache& kv_self = *cache;
+struct jarvis_kv_cache& kv_self = *cache;
const auto & hparams = model->hparams;
const int n_ctx = hparams.n_ctx;
const int n_embd = hparams.n_embd;
@@ -692,8 +692,8 @@
}

static struct ggml_tensor * forward_batch(
-struct llama_model * model,
+struct jarvis_model * model,
-struct llama_kv_cache * cache,
+struct jarvis_kv_cache * cache,
struct ggml_context * ctx0,
struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input,
@@ -703,7 +703,7 @@
) {
const int N = n_tokens;

-struct llama_kv_cache& kv_self = *cache;
+struct jarvis_kv_cache& kv_self = *cache;
const auto & hparams = model->hparams;
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
@@ -989,8 +989,8 @@
}

static struct ggml_tensor * forward_lora(
-struct llama_model_lora * model,
+struct jarvis_model_lora * model,
-struct llama_kv_cache * cache,
+struct jarvis_kv_cache * cache,
struct ggml_context * ctx0,
struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input,
@@ -999,7 +999,7 @@
) {
const int N = n_tokens;

-struct llama_kv_cache& kv_self = *cache;
+struct jarvis_kv_cache& kv_self = *cache;
const auto & hparams = model->hparams;

const int n_ctx = hparams.n_ctx;
@@ -1444,7 +1444,7 @@ int main(int argc, char ** argv) {
lcparams.mem_buffer = NULL;
lcparams.no_alloc = false;

-struct llama_model model;
+struct jarvis_model model;
model.hparams.n_vocab = 8;
model.hparams.n_ctx = 8;
model.hparams.n_embd = 32;
@@ -1467,7 +1467,7 @@
randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f);

/*
-struct llama_model_lora model_lora;
+struct jarvis_model_lora model_lora;
// model.hparams.n_vocab = 6;
// model.hparams.n_ctx = 64;
// model.hparams.n_embd = 128;
@@ -1501,7 +1501,7 @@
*/
int n_batch = 8;
// key + value cache for the self attention
-struct llama_kv_cache kv_self;
+struct jarvis_kv_cache kv_self;
printf("init_kv_cache\n");
kv_self.ctx = model.ctx;
init_kv_cache(&kv_self, &model, n_batch);
@@ -1533,7 +1533,7 @@
int n_past = 0;

struct ggml_cgraph * gf = NULL;
-gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
+gf = ggml_new_graph_custom(ctx0, JARVIS_TRAIN_MAX_NODES, true);

get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);

@@ -1601,7 +1601,7 @@
struct ggml_context * ctx0 = ggml_init(params);

struct ggml_cgraph * gf = NULL;
-gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
+gf = ggml_new_graph_custom(ctx0, JARVIS_TRAIN_MAX_NODES, true);

int n_past = 0;
struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);

@@ -5,7 +5,7 @@
#
# Usage:
#
-# cd llama.cpp
+# cd jarvis.cpp
# make -j
#
# ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]
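
A concrete invocation of the script might look like the following sketch (the model path is a placeholder; any base, non-instruct GGUF model should work):

```bash
cd jarvis.cpp
make -j
./examples/base-translate.sh ./models/jarvis-7b/ggml-model-q8_0.gguf "How many languages do you speak?"
```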
@@ -21,7 +21,7 @@ if [ $# -gt 2 ]; then
eargs="${@:3}"
fi

-ftmp="__llama.cpp_example_tmp__.txt"
+ftmp="__jarvis.cpp_example_tmp__.txt"
trap "rm -f $ftmp" EXIT

echo "Translate from English to French:
@@ -58,4 +58,4 @@ echo "$2
model=$1

# generate the most likely continuation until the string "===" is found
-./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
+./jarvis-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs

@@ -1,5 +1,5 @@
-set(TARGET llama-batched-bench)
+set(TARGET jarvis-batched-bench)
add_executable(${TARGET} batched-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

@@ -1,6 +1,6 @@
-# llama.cpp/example/batched-bench
+# jarvis.cpp/example/batched-bench

-Benchmark the batched decoding performance of `llama.cpp`
+Benchmark the batched decoding performance of `jarvis.cpp`

## Usage

@@ -10,16 +10,16 @@ There are 2 modes of operation:
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)

```bash
-./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
+./jarvis-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]

# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
+./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99

# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
+./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps

# custom set of batches
-./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
+./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
```

## Sample results

@@ -1,7 +1,7 @@
#include "arg.h"
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "jarvis.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
@ -17,7 +17,7 @@ static void print_usage(int, char ** argv) {
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
common_params params;
|
common_params params;
|
||||||
|
|
||||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
|
if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_BENCH, print_usage)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -31,42 +31,42 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// init LLM
|
// init LLM
|
||||||
|
|
||||||
llama_backend_init();
|
jarvis_backend_init();
|
||||||
llama_numa_init(params.numa);
|
jarvis_numa_init(params.numa);
|
||||||
|
|
||||||
// initialize the model
|
// initialize the model
|
||||||
|
|
||||||
llama_model_params model_params = common_model_params_to_llama(params);
|
jarvis_model_params model_params = common_model_params_to_jarvis(params);
|
||||||
|
|
||||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
jarvis_model * model = jarvis_load_model_from_file(params.model.c_str(), model_params);
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_context_params ctx_params = common_context_params_to_llama(params);
|
jarvis_context_params ctx_params = common_context_params_to_jarvis(params);
|
||||||
|
|
||||||
// ensure enough sequences are available
|
// ensure enough sequences are available
|
||||||
ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
|
ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
|
||||||
|
|
||||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
jarvis_context * ctx = jarvis_new_context_with_model(model, ctx_params);
|
||||||
|
|
||||||
if (ctx == NULL) {
|
if (ctx == NULL) {
|
||||||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
fprintf(stderr , "%s: error: failed to create the jarvis_context\n" , __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int32_t n_kv_max = llama_n_ctx(ctx);
|
const int32_t n_kv_max = jarvis_n_ctx(ctx);
|
||||||
|
|
||||||
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
|
jarvis_batch batch = jarvis_batch_init(n_kv_max, 0, 1);
|
||||||
|
|
||||||
// decode in batches of ctx_params.n_batch tokens
|
// decode in batches of ctx_params.n_batch tokens
|
||||||
auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
|
auto decode_helper = [](jarvis_context * ctx, jarvis_batch & batch, int32_t n_batch) {
|
||||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
|
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
|
||||||
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
||||||
|
|
||||||
llama_batch batch_view = {
|
jarvis_batch batch_view = {
|
||||||
n_tokens,
|
n_tokens,
|
||||||
batch.token + i,
|
batch.token + i,
|
||||||
nullptr,
|
nullptr,
|
||||||
|
@ -76,13 +76,13 @@ int main(int argc, char ** argv) {
|
||||||
batch.logits + i,
|
batch.logits + i,
|
||||||
};
|
};
|
||||||
|
|
||||||
const int ret = llama_decode(ctx, batch_view);
|
const int ret = jarvis_decode(ctx, batch_view);
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
|
LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_synchronize(ctx);
|
jarvis_synchronize(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
@ -95,7 +95,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
LOG_ERR("%s: jarvis_decode() failed\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -132,16 +132,16 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
const auto t_pp_start = ggml_time_us();
|
const auto t_pp_start = ggml_time_us();
|
||||||
|
|
||||||
llama_kv_cache_clear(ctx);
|
jarvis_kv_cache_clear(ctx);
|
||||||
|
|
||||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
LOG_ERR("%s: jarvis_decode() failed\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (is_pp_shared) {
|
if (is_pp_shared) {
|
||||||
for (int32_t i = 1; i < pl; ++i) {
|
for (int32_t i = 1; i < pl; ++i) {
|
||||||
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
jarvis_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -157,7 +157,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
LOG_ERR("%s: jarvis_decode() failed\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -189,14 +189,14 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("\n");
|
LOG("\n");
|
||||||
llama_perf_context_print(ctx);
|
jarvis_perf_context_print(ctx);
|
||||||
|
|
||||||
llama_batch_free(batch);
|
jarvis_batch_free(batch);
|
||||||
|
|
||||||
llama_free(ctx);
|
jarvis_free(ctx);
|
||||||
llama_free_model(model);
|
jarvis_free_model(model);
|
||||||
|
|
||||||
llama_backend_free();
|
jarvis_backend_free();
|
||||||
|
|
||||||
LOG("\n\n");
|
LOG("\n\n");
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
.PHONY: build

build:
-xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
+xcodebuild -scheme jarvis-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
-rm -f ./llama-batched-swift
+rm -f ./jarvis-batched-swift
-ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift
+ln -s ./build/Build/Products/Debug/jarvis-batched-swift ./jarvis-batched-swift

@@ -4,17 +4,17 @@
import PackageDescription

let package = Package(
-name: "llama-batched-swift",
+name: "jarvis-batched-swift",
platforms: [.macOS(.v12)],
dependencies: [
-.package(name: "llama", path: "../../"),
+.package(name: "jarvis", path: "../../"),
],
targets: [
// Targets are the basic building blocks of a package, defining a module or a test suite.
// Targets can depend on other targets in this package and products from dependencies.
.executableTarget(
-name: "llama-batched-swift",
+name: "jarvis-batched-swift",
-dependencies: ["llama"],
+dependencies: ["jarvis"],
path: "Sources",
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
),
Some files were not shown because too many files have changed in this diff.