Changed "llama" to "jarvis"
This commit is contained in:
parent 4dfbcf9646
commit 52ab617954
372 changed files with 8788 additions and 8788 deletions
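The diff below is a mechanical, repository-wide rename of "llama" to "jarvis" across build scripts, Dockerfiles, packaging specs, Nix expressions, and CI workflows. A change of this size is normally scripted rather than edited by hand; the following sketch shows one way such a rename could be driven with grep and sed. It is an illustration only, and the case variants handled and the .git exclusion are assumptions, not details taken from the commit.

#!/bin/bash
# Illustrative helper (not part of this commit): rename llama -> jarvis in a working tree.
set -euo pipefail

# Rewrite file contents, skipping the .git directory and binary files.
grep -rIl --exclude-dir=.git -e llama -e Llama -e LLAMA . | while read -r f; do
    sed -i 's/llama/jarvis/g; s/Llama/Jarvis/g; s/LLAMA/JARVIS/g' "$f"
done

# Rename files and directories whose names contain "llama", deepest paths first.
find . -depth -not -path './.git/*' -name '*llama*' | while read -r p; do
    mv "$p" "$(dirname "$p")/$(basename "$p" | sed 's/llama/jarvis/g')"
done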
@@ -7,16 +7,16 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
 checkout scm // Clone the repo on Runner
 }
 }
-stage('Compiling llama.cpp'){
+stage('Compiling jarvis.cpp'){
 sh'''#!/bin/bash
-make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
+make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling jarvis for RISC-V
 '''
 }
-stage('Running llama.cpp'){
+stage('Running jarvis.cpp'){
 sh'''#!/bin/bash
 module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
-qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
-cat llama_log.txt # Printing results
+qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./jarvis-cli -m /home/alitariq/codejarvis-7b.Q4_K_M.gguf -p "Anything" -n 9 > jarvis_log.txt # Running jarvis.cpp on vector qemu-riscv64
+cat jarvis_log.txt # Printing results
 '''
 }
 }

@@ -26,7 +26,7 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
 export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
 fi && \
-cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+cmake -B build -DGGML_CUDA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
 cmake --build build --config Release -j$(nproc) && \
 cp build/bin/* .

@@ -19,7 +19,7 @@ WORKDIR /app

 COPY . .

-RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+RUN cmake -B build -DGGML_MUSA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
 cmake --build build --config Release -j$(nproc) && \
 cp build/bin/* .

@@ -9,7 +9,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# List from https://github.com/ggerganov/jarvis.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 ARG ROCM_DOCKER_ARCH="\
 gfx803 \
@@ -41,7 +41,7 @@ ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

 # Enable cURL
-ENV LLAMA_CURL=1
+ENV JARVIS_CURL=1
 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev

@@ -15,7 +15,7 @@ WORKDIR /app

 COPY . .

-ENV LLAMA_CURL=1
+ENV JARVIS_CURL=1


 RUN make -j$(nproc)

@@ -23,11 +23,11 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 RUN echo "Building with static libs" && \
 source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
 cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
-cmake --build build --config Release --target llama-cli
+cmake --build build --config Release --target jarvis-cli

 # TODO: use image with NNRT
 FROM cosdt/cann:$ASCEND_VERSION AS runtime
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli

 ENV LC_ALL=C.utf8

@@ -41,4 +41,4 @@ ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
 ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
 ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}

-ENTRYPOINT ["/llama-cli" ]
+ENTRYPOINT ["/jarvis-cli" ]

@@ -23,7 +23,7 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
 export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
 fi && \
 cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-cmake --build build --config Release --target llama-cli -j$(nproc)
+cmake --build build --config Release --target jarvis-cli -j$(nproc)

 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

@@ -31,7 +31,7 @@ RUN apt-get update && \
 apt-get install -y libgomp1

 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/src/libjarvis.so /libjarvis.so
+COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli

-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

@@ -17,12 +17,12 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
 echo "Building with static libs" && \
 cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
 ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
-cmake --build build --config Release --target llama-cli
+cmake --build build --config Release --target jarvis-cli

 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

@@ -16,7 +16,7 @@ WORKDIR /app
 COPY . .

 RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-cmake --build build --config Release --target llama-cli -j$(nproc)
+cmake --build build --config Release --target jarvis-cli -j$(nproc)

 FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime

@@ -24,7 +24,7 @@ RUN apt-get update && \
 apt-get install -y libgomp1

 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/src/libjarvis.so /libjarvis.so
+COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli

-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

@@ -9,7 +9,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# List from https://github.com/ggerganov/jarvis.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 ARG ROCM_DOCKER_ARCH="\
 gfx803 \
@@ -40,6 +40,6 @@ ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

-RUN make -j$(nproc) llama-cli
+RUN make -j$(nproc) jarvis-cli

-ENTRYPOINT [ "/app/llama-cli" ]
+ENTRYPOINT [ "/app/jarvis-cli" ]

@@ -15,13 +15,13 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 WORKDIR /app
 COPY . .
 RUN cmake -B build -DGGML_VULKAN=1 && \
-cmake --build build --config Release --target llama-cli
+cmake --build build --config Release --target jarvis-cli

 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/llama-cli /llama-cli && \
+RUN cp /app/build/bin/jarvis-cli /jarvis-cli && \
 rm -rf /app

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

@@ -9,15 +9,15 @@ WORKDIR /app

 COPY . .

-RUN make -j$(nproc) llama-cli
+RUN make -j$(nproc) jarvis-cli

 FROM ubuntu:$UBUNTU_VERSION AS runtime

 RUN apt-get update && \
 apt-get install -y libgomp1

-COPY --from=build /app/llama-cli /llama-cli
+COPY --from=build /app/jarvis-cli /jarvis-cli

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

@@ -3,7 +3,7 @@
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

-# Notes for llama.cpp:
+# Notes for jarvis.cpp:
 # 1. Tags are currently based on hash - which will not sort asciibetically.
 # We need to declare standard versioning if people want to sort latest releases.
 # 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
@@ -12,44 +12,44 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 # It is up to the user to install the correct vendor-specific support.

-Name: llama.cpp-cuda
+Name: jarvis.cpp-cuda
 Version: %( date "+%%Y%%m%%d" )
 Release: 1%{?dist}
 Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
 License: MIT
-Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+Source0: https://github.com/ggerganov/jarvis.cpp/archive/refs/heads/master.tar.gz
 BuildRequires: coreutils make gcc-c++ git cuda-toolkit
 Requires: cuda-toolkit
-URL: https://github.com/ggerganov/llama.cpp
+URL: https://github.com/ggerganov/jarvis.cpp

 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0

 %description
-CPU inference for Meta's Lllama2 models using default options.
+CPU inference for Meta's Ljarvis2 models using default options.

 %prep
-%setup -n llama.cpp-master
+%setup -n jarvis.cpp-master

 %build
 make -j GGML_CUDA=1

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
-cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
+cp -p jarvis-cli %{buildroot}%{_bindir}/jarvis-cuda-cli
+cp -p jarvis-server %{buildroot}%{_bindir}/jarvis-cuda-server
+cp -p jarvis-simple %{buildroot}%{_bindir}/jarvis-cuda-simple

 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/jarviscuda.service
 [Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
+Description=Jarvis.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target

 [Service]
 Type=simple
-EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
+EnvironmentFile=/etc/sysconfig/jarvis
+ExecStart=/usr/bin/jarvis-cuda-server $JARVIS_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -58,8 +58,8 @@ WantedBy=default.target
 EOF

 mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/jarvis
+JARVIS_ARGS="-m /opt/jarvis2/ggml-model-f32.bin"
 EOF

 %clean
@@ -67,11 +67,11 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llama-cuda-cli
-%{_bindir}/llama-cuda-server
-%{_bindir}/llama-cuda-simple
-/usr/lib/systemd/system/llamacuda.service
-%config /etc/sysconfig/llama
+%{_bindir}/jarvis-cuda-cli
+%{_bindir}/jarvis-cuda-server
+%{_bindir}/jarvis-cuda-simple
+/usr/lib/systemd/system/jarviscuda.service
+%config /etc/sysconfig/jarvis

 %pre

@@ -3,7 +3,7 @@
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

-# Notes for llama.cpp:
+# Notes for jarvis.cpp:
 # 1. Tags are currently based on hash - which will not sort asciibetically.
 # We need to declare standard versioning if people want to sort latest releases.
 # In the meantime, YYYYMMDD format will be used.
@@ -13,45 +13,45 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 # It is up to the user to install the correct vendor-specific support.

-Name: llama.cpp
+Name: jarvis.cpp
 Version: %( date "+%%Y%%m%%d" )
 Release: 1%{?dist}
 Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
 License: MIT
-Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+Source0: https://github.com/ggerganov/jarvis.cpp/archive/refs/heads/master.tar.gz
 BuildRequires: coreutils make gcc-c++ git libstdc++-devel
 Requires: libstdc++
-URL: https://github.com/ggerganov/llama.cpp
+URL: https://github.com/ggerganov/jarvis.cpp

 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0

 %description
-CPU inference for Meta's Lllama2 models using default options.
+CPU inference for Meta's Ljarvis2 models using default options.
 Models are not included in this package and must be downloaded separately.

 %prep
-%setup -n llama.cpp-master
+%setup -n jarvis.cpp-master

 %build
 make -j

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
-cp -p llama-server %{buildroot}%{_bindir}/llama-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
+cp -p jarvis-cli %{buildroot}%{_bindir}/jarvis-cli
+cp -p jarvis-server %{buildroot}%{_bindir}/jarvis-server
+cp -p jarvis-simple %{buildroot}%{_bindir}/jarvis-simple

 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/jarvis.service
 [Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
+Description=Jarvis.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target

 [Service]
 Type=simple
-EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llama-server $LLAMA_ARGS
+EnvironmentFile=/etc/sysconfig/jarvis
+ExecStart=/usr/bin/jarvis-server $JARVIS_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -60,8 +60,8 @@ WantedBy=default.target
 EOF

 mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/jarvis
+JARVIS_ARGS="-m /opt/jarvis2/ggml-model-f32.bin"
 EOF

 %clean
@@ -69,11 +69,11 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llama-cli
-%{_bindir}/llama-server
-%{_bindir}/llama-simple
-/usr/lib/systemd/system/llama.service
-%config /etc/sysconfig/llama
+%{_bindir}/jarvis-cli
+%{_bindir}/jarvis-server
+%{_bindir}/jarvis-simple
+/usr/lib/systemd/system/jarvis.service
+%config /etc/sysconfig/jarvis

 %pre

@@ -22,8 +22,8 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
 export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
 fi && \
-cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-cmake --build build --config Release --target llama-server -j$(nproc)
+cmake -B build -DGGML_CUDA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+cmake --build build --config Release --target jarvis-server -j$(nproc)

 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

@@ -31,12 +31,12 @@ RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev libgomp1 curl

 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
-COPY --from=build /app/build/bin/llama-server /llama-server
+COPY --from=build /app/build/src/libjarvis.so /libjarvis.so
+COPY --from=build /app/build/bin/jarvis-server /jarvis-server

 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

@@ -15,20 +15,20 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
 export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
 fi && \
 echo "Building with dynamic libs" && \
-cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
-cmake --build build --config Release --target llama-server
+cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DJARVIS_CURL=ON ${OPT_SYCL_F16} && \
+cmake --build build --config Release --target jarvis-server

 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev curl

-COPY --from=build /app/build/bin/llama-server /llama-server
+COPY --from=build /app/build/bin/jarvis-server /jarvis-server

 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

@@ -15,8 +15,8 @@ WORKDIR /app

 COPY . .

-RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-cmake --build build --config Release --target llama-server -j$(nproc)
+RUN cmake -B build -DGGML_MUSA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+cmake --build build --config Release --target jarvis-server -j$(nproc)

 FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime

@@ -24,12 +24,12 @@ RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev libgomp1 curl

 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
-COPY --from=build /app/build/bin/llama-server /llama-server
+COPY --from=build /app/build/src/libjarvis.so /libjarvis.so
+COPY --from=build /app/build/bin/jarvis-server /jarvis-server

 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

@@ -9,7 +9,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# List from https://github.com/ggerganov/jarvis.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 ARG ROCM_DOCKER_ARCH="\
 gfx803 \
@@ -40,15 +40,15 @@ ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0

 # Enable cURL
-ENV LLAMA_CURL=1
+ENV JARVIS_CURL=1
 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev curl

-RUN make -j$(nproc) llama-server
+RUN make -j$(nproc) jarvis-server

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

-ENTRYPOINT [ "/app/llama-server" ]
+ENTRYPOINT [ "/app/jarvis-server" ]

@@ -14,18 +14,18 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
-cmake --build build --config Release --target llama-server
+RUN cmake -B build -DGGML_VULKAN=1 -DJARVIS_CURL=1 && \
+cmake --build build --config Release --target jarvis-server

 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/llama-server /llama-server && \
+RUN cp /app/build/bin/jarvis-server /jarvis-server && \
 rm -rf /app

 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

@@ -9,21 +9,21 @@ WORKDIR /app

 COPY . .

-ENV LLAMA_CURL=1
+ENV JARVIS_CURL=1

-RUN make -j$(nproc) llama-server
+RUN make -j$(nproc) jarvis-server

 FROM ubuntu:$UBUNTU_VERSION AS runtime

 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev libgomp1 curl

-COPY --from=build /app/llama-server /llama-server
+COPY --from=build /app/jarvis-server /jarvis-server

 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

@@ -6,10 +6,10 @@
 let
 inherit (config.packages) default;
 binaries = [
-"llama-cli"
-"llama-embedding"
-"llama-server"
-"llama-quantize"
+"jarvis-cli"
+"jarvis-embedding"
+"jarvis-server"
+"jarvis-quantize"
 ];
 mkApp = name: {
 type = "app";

@@ -2,14 +2,14 @@
 lib,
 dockerTools,
 buildEnv,
-llama-cpp,
+jarvis-cpp,
 interactive ? true,
 coreutils,
 }:

 # A tar that can be fed into `docker load`:
 #
-# $ nix build .#llamaPackages.docker
+# $ nix build .#jarvisPackages.docker
 # $ docker load < result

 # For details and variations cf.
@@ -19,16 +19,16 @@

 # Approximate (compressed) sizes, at the time of writing, are:
 #
-# .#llamaPackages.docker: 125M;
-# .#llamaPackagesCuda.docker: 537M;
-# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.
+# .#jarvisPackages.docker: 125M;
+# .#jarvisPackagesCuda.docker: 537M;
+# .#legacyPackages.aarch64-linux.jarvisPackagesXavier.docker: 415M.

 dockerTools.buildLayeredImage {
-name = llama-cpp.pname;
+name = jarvis-cpp.pname;
 tag = "latest";

 contents =
-[ llama-cpp ]
+[ jarvis-cpp ]
 ++ lib.optionals interactive [
 coreutils
 dockerTools.binSh

@@ -11,10 +11,10 @@
 {
 legacyPackages =
 let
-caps.llamaPackagesXavier = "7.2";
-caps.llamaPackagesOrin = "8.7";
-caps.llamaPackagesTX2 = "6.2";
-caps.llamaPackagesNano = "5.3";
+caps.jarvisPackagesXavier = "7.2";
+caps.jarvisPackagesOrin = "8.7";
+caps.jarvisPackagesTX2 = "6.2";
+caps.jarvisPackagesNano = "5.3";

 pkgsFor =
 cap:
@@ -31,9 +31,9 @@
 builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;

 packages = lib.optionalAttrs (system == "aarch64-linux") {
-jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
-jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
-jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
+jetson-xavier = config.legacyPackages.jarvisPackagesXavier.jarvis-cpp;
+jetson-orin = config.legacyPackages.jarvisPackagesOrin.jarvis-cpp;
+jetson-nano = config.legacyPackages.jarvisPackagesNano.jarvis-cpp;
 };
 };
 }

@@ -1,6 +1,6 @@
 {
 lib,
-llamaVersion,
+jarvisVersion,
 numpy,
 tqdm,
 sentencepiece,
@@ -12,7 +12,7 @@

 buildPythonPackage {
 pname = "gguf";
-version = llamaVersion;
+version = jarvisVersion;
 pyproject = true;
 nativeBuildInputs = [ poetry-core ];
 propagatedBuildInputs = [

@@ -33,7 +33,7 @@
 useRocm ? config.rocmSupport,
 enableCurl ? true,
 useVulkan ? false,
-llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
+jarvisVersion ? "0.0.0", # Arbitrary version, substituted by the flake

 # It's necessary to consistently use backendStdenv when building with CUDA support,
 # otherwise we get libstdc++ errors downstream.
@@ -103,8 +103,8 @@ let
 in

 effectiveStdenv.mkDerivation (finalAttrs: {
-pname = "llama-cpp${pnameSuffix}";
-version = llamaVersion;
+pname = "jarvis-cpp${pnameSuffix}";
+version = jarvisVersion;

 # Note: none of the files discarded here are visible in the sandbox or
 # affect the output hash. This also means they can be modified without
@@ -132,12 +132,12 @@ effectiveStdenv.mkDerivation (finalAttrs: {
 --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
 '';

-# With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
+# With PR#6015 https://github.com/ggerganov/jarvis.cpp/pull/6015,
 # `default.metallib` may be compiled with Metal compiler from XCode
 # and we need to escape sandbox on MacOS to access Metal compiler.
 # `xcrun` is used find the path of the Metal compiler, which is varible
 # and not on $PATH
-# see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
+# see https://github.com/ggerganov/jarvis.cpp/pull/6118 for discussion
 __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;

 nativeBuildInputs =
@@ -166,10 +166,10 @@ effectiveStdenv.mkDerivation (finalAttrs: {

 cmakeFlags =
 [
-(cmakeBool "LLAMA_BUILD_SERVER" true)
+(cmakeBool "JARVIS_BUILD_SERVER" true)
 (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
 (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-(cmakeBool "LLAMA_CURL" enableCurl)
+(cmakeBool "JARVIS_CURL" enableCurl)
 (cmakeBool "GGML_NATIVE" false)
 (cmakeBool "GGML_BLAS" useBlas)
 (cmakeBool "GGML_CUDA" useCuda)
@@ -205,7 +205,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
 # if they haven't been added yet.
 postInstall = ''
 mkdir -p $out/include
-cp $src/include/llama.h $out/include/
+cp $src/include/jarvis.h $out/include/
 '';

 meta = {
@@ -219,11 +219,11 @@ effectiveStdenv.mkDerivation (finalAttrs: {
 broken = (useMetalKit && !effectiveStdenv.isDarwin);

 description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-homepage = "https://github.com/ggerganov/llama.cpp/";
+homepage = "https://github.com/ggerganov/jarvis.cpp/";
 license = lib.licenses.mit;

 # Accommodates `nix run` and `lib.getExe`
-mainProgram = "llama-cli";
+mainProgram = "jarvis-cli";

 # These people might respond, on the best effort basis, if you ping them
 # in case of Nix-specific regressions or for reviewing Nix-specific PRs.

@@ -9,7 +9,7 @@
 }@inputs:

 let
-llama-python-deps = with python3Packages; [
+jarvis-python-deps = with python3Packages; [
 numpy
 sentencepiece
 transformers
@@ -18,7 +18,7 @@ let
 gguf-py
 tqdm

-# for scripts/compare-llama-bench.py
+# for scripts/compare-jarvis-bench.py
 gitpython
 tabulate

@@ -28,7 +28,7 @@ let

 ];

-llama-python-test-deps = with python3Packages; [
+jarvis-python-test-deps = with python3Packages; [
 # Server bench
 matplotlib

@@ -40,7 +40,7 @@ let
 in

 buildPythonPackage ({
-pname = "llama-scripts";
+pname = "jarvis-scripts";
 version = "0.0.0";
 pyproject = true;

@@ -61,6 +61,6 @@ buildPythonPackage ({
 src = lib.cleanSource ../../.;
 };
 nativeBuildInputs = [ poetry-core ];
-nativeCheckInputs = llama-python-test-deps;
-dependencies = llama-python-deps;
+nativeCheckInputs = jarvis-python-test-deps;
+dependencies = jarvis-python-deps;
 })

@@ -2,7 +2,7 @@
 lib,
 newScope,
 python3,
-llamaVersion ? "0.0.0",
+jarvisVersion ? "0.0.0",
 }:

 let
@@ -21,7 +21,7 @@ in
 # Cf. https://noogle.dev/f/lib/makeScope

 lib.makeScope newScope (self: {
-inherit llamaVersion;
+inherit jarvisVersion;
 gguf-py = self.callPackage ./package-gguf-py.nix {
 inherit
 buildPythonPackage
@@ -34,7 +34,7 @@ lib.makeScope newScope (self: {
 ;
 };
 python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
-llama-cpp = self.callPackage ./package.nix { };
+jarvis-cpp = self.callPackage ./package.nix { };
 docker = self.callPackage ./docker.nix { };
 docker-min = self.callPackage ./docker.nix { interactive = false; };
 sif = self.callPackage ./sif.nix { };

@@ -1,7 +1,7 @@
 {
 lib,
 singularity-tools,
-llama-cpp,
+jarvis-cpp,
 bashInteractive,
 interactive ? false,
 }:
@@ -10,8 +10,8 @@ let
 optionalInt = cond: x: if cond then x else 0;
 in
 singularity-tools.buildImage rec {
-inherit (llama-cpp) name;
-contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];
+inherit (jarvis-cpp) name;
+contents = [ jarvis-cpp ] ++ lib.optionals interactive [ bashInteractive ];

 # These are excessive (but safe) for most variants. Building singularity
 # images requires superuser privileges, so we build them inside a VM in a
@@ -22,6 +22,6 @@ singularity-tools.buildImage rec {
 # Expected image sizes:
 # - cpu/blas: 150M,
 # - cuda, all gencodes: 560M,
-diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
+diskSize = 4096 + optionalInt jarvis-cpp.useRocm 16384;
 memSize = diskSize;
 }

@@ -10,9 +10,9 @@ shift
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
 python3 ./convert_hf_to_gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-./llama-quantize "$@"
+./jarvis-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-./llama-cli "$@"
+./jarvis-cli "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
 echo "Converting PTH to GGML..."
 for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -20,17 +20,17 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
 echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
 else
 echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+./jarvis-quantize "$i" "${i/f16/q4_0}" q4_0
 fi
 done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-./llama-server "$@"
+./jarvis-server "$@"
 else
 echo "Unknown command: $arg1"
 echo "Available commands: "
 echo " --run (-r): Run a model previously converted into ggml"
 echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
-echo " --convert (-c): Convert a llama model into ggml"
+echo " --convert (-c): Convert a jarvis model into ggml"
 echo " ex: --outtype f16 \"/models/7B/\" "
 echo " --quantize (-q): Optimize with quantization process ggml"
 echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"

@@ -12,8 +12,8 @@ build*/

 models/*

-/llama-cli
-/llama-quantize
+/jarvis-cli
+/jarvis-quantize

 arm_neon.h
 compile_commands.json

@@ -24,7 +24,7 @@ insert_final_newline = unset
 [examples/server/public/*]
 indent_size = 2

-[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
+[examples/jarvis.swiftui/jarvis.swiftui.xcodeproj/*]
 indent_style = tab

 [examples/cvector-generator/*.txt]

.github/ISSUE_TEMPLATE/01-bug-low.yml (6 changes, vendored)
@@ -1,5 +1,5 @@
 name: Low Severity Bugs
-description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
+description: Used to report low severity bugs in jarvis.cpp (e.g. cosmetic issues, non critical UI glitches)
 title: "Bug: "
 labels: ["bug-unconfirmed", "low severity"]
 body:
@@ -8,7 +8,7 @@ body:
 value: |
 Thanks for taking the time to fill out this bug report!
 Please include information about your system, the steps to reproduce the bug,
-and the version of llama.cpp that you are using.
+and the version of jarvis.cpp that you are using.
 If possible, please provide a minimal code example that reproduces the bug.
 - type: textarea
 id: what-happened
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-$./llama-cli --version
+$./jarvis-cli --version
 version: 2999 (42b4109e)
 built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/ISSUE_TEMPLATE/02-bug-medium.yml (6 changes, vendored)
@@ -1,5 +1,5 @@
 name: Medium Severity Bug
-description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
+description: Used to report medium severity bugs in jarvis.cpp (e.g. Malfunctioning Features but generally still useable)
 title: "Bug: "
 labels: ["bug-unconfirmed", "medium severity"]
 body:
@@ -8,7 +8,7 @@ body:
 value: |
 Thanks for taking the time to fill out this bug report!
 Please include information about your system, the steps to reproduce the bug,
-and the version of llama.cpp that you are using.
+and the version of jarvis.cpp that you are using.
 If possible, please provide a minimal code example that reproduces the bug.
 - type: textarea
 id: what-happened
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-$./llama-cli --version
+$./jarvis-cli --version
 version: 2999 (42b4109e)
 built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/ISSUE_TEMPLATE/03-bug-high.yml (6 changes, vendored)
@@ -1,5 +1,5 @@
 name: High Severity Bug
-description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
+description: Used to report high severity bugs in jarvis.cpp (e.g. Malfunctioning features hindering important common workflow)
 title: "Bug: "
 labels: ["bug-unconfirmed", "high severity"]
 body:
@@ -8,7 +8,7 @@ body:
 value: |
 Thanks for taking the time to fill out this bug report!
 Please include information about your system, the steps to reproduce the bug,
-and the version of llama.cpp that you are using.
+and the version of jarvis.cpp that you are using.
 If possible, please provide a minimal code example that reproduces the bug.
 - type: textarea
 id: what-happened
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-$./llama-cli --version
+$./jarvis-cli --version
 version: 2999 (42b4109e)
 built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/ISSUE_TEMPLATE/04-bug-critical.yml (6 changes, vendored)
@@ -1,5 +1,5 @@
 name: Critical Severity Bug
-description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
+description: Used to report critical severity bugs in jarvis.cpp (e.g. Crashing, Corrupted, Dataloss)
 title: "Bug: "
 labels: ["bug-unconfirmed", "critical severity"]
 body:
@@ -8,7 +8,7 @@ body:
 value: |
 Thanks for taking the time to fill out this bug report!
 Please include information about your system, the steps to reproduce the bug,
-and the version of llama.cpp that you are using.
+and the version of jarvis.cpp that you are using.
 If possible, please provide a minimal code example that reproduces the bug.
 - type: textarea
 id: what-happened
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-$./llama-cli --version
+$./jarvis-cli --version
 version: 2999 (42b4109e)
 built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/ISSUE_TEMPLATE/05-enhancement.yml (12 changes, vendored)
@@ -1,12 +1,12 @@
 name: Enhancement
-description: Used to request enhancements for llama.cpp
+description: Used to request enhancements for jarvis.cpp
 title: "Feature Request: "
 labels: ["enhancement"]
 body:
 - type: markdown
 attributes:
 value: |
-[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
+[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/jarvis.cpp/discussions/categories/ideas)

 - type: checkboxes
 id: prerequisites
@@ -16,18 +16,18 @@ body:
 options:
 - label: I am running the latest code. Mention the version if possible as well.
 required: true
-- label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+- label: I carefully followed the [README.md](https://github.com/ggerganov/jarvis.cpp/blob/master/README.md).
 required: true
 - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
 required: true
-- label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
+- label: I reviewed the [Discussions](https://github.com/ggerganov/jarvis.cpp/discussions), and have a new and useful enhancement to share.
 required: true

 - type: textarea
 id: feature-description
 attributes:
 label: Feature Description
-description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
+description: Please provide a detailed written description of what you were trying to do, and what you expected `jarvis.cpp` to do as an enhancement.
 placeholder: Detailed description of the enhancement
 validations:
 required: true
@@ -36,7 +36,7 @@ body:
 id: motivation
 attributes:
 label: Motivation
-description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
+description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `jarvis.cpp` users.
 placeholder: Explanation of why this feature is needed and its benefits
 validations:
 required: true

.github/ISSUE_TEMPLATE/06-research.yml (2 changes, vendored)
@@ -6,7 +6,7 @@ body:
 - type: markdown
 attributes:
 value: |
-Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
+Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/jarvis.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)

 - type: checkboxes
 id: research-stage

.github/ISSUE_TEMPLATE/07-refactor.yml (4 changes, vendored)
@@ -6,8 +6,8 @@ body:
 - type: markdown
 attributes:
 value: |
-Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
-Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
+Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/jarvis.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
+Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/jarvis.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.

 - type: textarea
 id: background-description

.github/ISSUE_TEMPLATE/config.yml (6 changes, vendored)
@@ -1,11 +1,11 @@
 blank_issues_enabled: true
 contact_links:
 - name: Got an idea?
-url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
+url: https://github.com/ggerganov/jarvis.cpp/discussions/categories/ideas
 about: Pop it there. It may then become an enhancement ticket.
 - name: Got a question?
-url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
+url: https://github.com/ggerganov/jarvis.cpp/discussions/categories/q-a
 about: Ask a question there!
 - name: Want to contribute?
-url: https://github.com/ggerganov/llama.cpp/wiki/contribute
+url: https://github.com/ggerganov/jarvis.cpp/wiki/contribute
 about: Head to the contribution guide page of the wiki for areas you can help with

.github/labeler.yml (2 changes, vendored)
@@ -67,7 +67,7 @@ script:
 android:
 - changed-files:
 - any-glob-to-any-file:
-- examples/llama.android/**
+- examples/jarvis.android/**
 server:
 - changed-files:
 - any-glob-to-any-file:

.github/pull_request_template.md (2 changes, vendored)
@@ -1,6 +1,6 @@


-- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
+- [x] I have read the [contributing guidelines](https://github.com/ggerganov/jarvis.cpp/blob/master/CONTRIBUTING.md)
 - Self-reported review complexity:
 - [ ] Low
 - [ ] Medium

.github/workflows/bench.yml.disabled (26 changes, vendored)
@@ -1,5 +1,5 @@
 # TODO: there have been some issues with the workflow, so disabling for now
-# https://github.com/ggerganov/llama.cpp/issues/7893
+# https://github.com/ggerganov/jarvis.cpp/issues/7893
 #
 # Benchmark
 name: Benchmark
@@ -27,10 +27,10 @@ on:
 push:
 branches:
 - master
-paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+paths: ['jarvis.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
 pull_request_target:
 types: [opened, synchronize, reopened]
-paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+paths: ['jarvis.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
 schedule:
 - cron: '04 2 * * *'

@@ -113,16 +113,16 @@ jobs:
 set -eux
 cmake -B build \
 -DGGML_NATIVE=OFF \
--DLLAMA_BUILD_SERVER=ON \
--DLLAMA_CURL=ON \
--DLLAMA_CUBLAS=ON \
+-DJARVIS_BUILD_SERVER=ON \
+-DJARVIS_CURL=ON \
+-DJARVIS_CUBLAS=ON \
 -DCUDAToolkit_ROOT=/usr/local/cuda \
 -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
 -DCMAKE_CUDA_ARCHITECTURES=75 \
--DLLAMA_FATAL_WARNINGS=OFF \
--DLLAMA_ALL_WARNINGS=OFF \
+-DJARVIS_FATAL_WARNINGS=OFF \
+-DJARVIS_ALL_WARNINGS=OFF \
 -DCMAKE_BUILD_TYPE=Release;
-cmake --build build --config Release -j $(nproc) --target llama-server
+cmake --build build --config Release -j $(nproc) --target jarvis-server

 - name: Download the dataset
 id: download_dataset
@@ -240,7 +240,7 @@ jobs:
 message: |
 <p align="center">

-📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+📈 **jarvis.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀

 </p>

@@ -249,9 +249,9 @@ jobs:
 <summary>Expand details for performance related PR only</summary>

 - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
-- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
-- Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
-- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
+- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.JARVISCPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.JARVISCPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
+- Prompt processing (pp): avg=${{ env.JARVISCPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.JARVISCPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
+- Token generation (tg): avg=${{ env.JARVISCPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.JARVISCPP_TOKENS_SECOND_P_95_ }}tk/s
 - ${{ env.BENCH_GRAPH_XLABEL }}

148
.github/workflows/build.yml
vendored
148
.github/workflows/build.yml
vendored
|
@ -28,9 +28,9 @@ env:
|
|||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
JARVIS_LOG_COLORS: 1
|
||||
JARVIS_LOG_PREFIX: 1
|
||||
JARVIS_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
macOS-latest-cmake-arm64:
|
||||
|
@ -55,7 +55,7 @@ jobs:
|
|||
sysctl -a
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
|
||||
cmake -DJARVIS_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DJARVIS_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
|
||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
|
@ -82,14 +82,14 @@ jobs:
|
|||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
|
||||
zip -r jarvis-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
|
||||
name: llama-bin-macos-arm64.zip
|
||||
path: jarvis-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
|
||||
name: jarvis-bin-macos-arm64.zip
|
||||
|
||||
macOS-latest-cmake-x64:
|
||||
runs-on: macos-12
|
||||
|
@ -112,8 +112,8 @@ jobs:
|
|||
run: |
|
||||
sysctl -a
|
||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
|
||||
# https://github.com/ggerganov/jarvis.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
cmake -B build -DJARVIS_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DJARVIS_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
|
@ -140,20 +140,20 @@ jobs:
|
|||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
|
||||
zip -r jarvis-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
|
||||
name: llama-bin-macos-x64.zip
|
||||
path: jarvis-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
|
||||
name: jarvis-bin-macos-x64.zip
|
||||
|
||||
ubuntu-focal-make:
|
||||
runs-on: ubuntu-20.04
|
||||
env:
|
||||
LLAMA_NODE_AVAILABLE: true
|
||||
LLAMA_PYTHON_AVAILABLE: true
|
||||
JARVIS_NODE_AVAILABLE: true
|
||||
JARVIS_PYTHON_AVAILABLE: true
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
@ -177,7 +177,7 @@ jobs:
|
|||
- name: Build
|
||||
id: make_build
|
||||
env:
|
||||
LLAMA_FATAL_WARNINGS: 1
|
||||
JARVIS_FATAL_WARNINGS: 1
|
||||
run: |
|
||||
CC=gcc-8 make -j $(nproc)
|
||||
|
||||
|
@ -204,8 +204,8 @@ jobs:
|
|||
- name: Build
|
||||
id: make_build
|
||||
env:
|
||||
LLAMA_FATAL_WARNINGS: 1
|
||||
LLAMA_CURL: 1
|
||||
JARVIS_FATAL_WARNINGS: 1
|
||||
JARVIS_CURL: 1
|
||||
run: |
|
||||
CC=gcc-8 make -j $(nproc)
|
||||
|
||||
|
@ -230,7 +230,7 @@ jobs:
|
|||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
|
||||
cmake .. -DJARVIS_FATAL_WARNINGS=ON -DJARVIS_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
|
||||
cmake --build . --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
|
@ -239,16 +239,16 @@ jobs:
|
|||
cd build
|
||||
ctest -L 'main|curl' --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c conversion
|
||||
id: llama2c_test
|
||||
- name: Test jarvis2c conversion
|
||||
id: jarvis2c_test
|
||||
run: |
|
||||
cd build
|
||||
echo "Fetch tokenizer"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
|
||||
echo "Fetch llama2c model"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
|
||||
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
wget https://huggingface.co/karpathy/tinyjarviss/resolve/main/stories260K/tok512.bin
|
||||
echo "Fetch jarvis2c model"
|
||||
wget https://huggingface.co/karpathy/tinyjarviss/resolve/main/stories260K/stories260K.bin
|
||||
./bin/jarvis-convert-jarvis2c-to-ggml --copy-vocab-from-model ./tok512.bin --jarvis2c-model stories260K.bin --jarvis2c-output-model stories260K.gguf
|
||||
./bin/jarvis-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
|
@ -268,14 +268,14 @@ jobs:
|
|||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
|
||||
zip -r jarvis-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
|
||||
name: llama-bin-ubuntu-x64.zip
|
||||
path: jarvis-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
|
||||
name: jarvis-bin-ubuntu-x64.zip
|
||||
|
||||
ubuntu-latest-cmake-sanitizer:
|
||||
runs-on: ubuntu-latest
|
||||
|
@ -304,7 +304,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake .. -DJARVIS_FATAL_WARNINGS=ON -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

- name: Build (no OpenMP)

@ -313,7 +313,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
cmake .. -DJARVIS_FATAL_WARNINGS=ON -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

- name: Test

@ -487,7 +487,7 @@ jobs:

# TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
# ref: https://github.com/ggerganov/jarvis.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
macOS-latest-make:
runs-on: macos-latest

@ -505,7 +505,7 @@ jobs:
- name: Build
id: make_build
env:
LLAMA_FATAL_WARNINGS: 1
JARVIS_FATAL_WARNINGS: 1
run: |
GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)

@ -517,7 +517,7 @@ jobs:

# TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
# ref: https://github.com/ggerganov/jarvis.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
# would be great if we fix these
macOS-latest-cmake:
runs-on: macos-latest

@ -539,7 +539,7 @@ jobs:
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
cmake -DJARVIS_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

- name: Test

@ -570,9 +570,9 @@ jobs:
|
|||
cd build
|
||||
cmake -G Xcode .. \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DJARVIS_BUILD_EXAMPLES=OFF \
|
||||
-DJARVIS_BUILD_TESTS=OFF \
|
||||
-DJARVIS_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
|
@ -600,9 +600,9 @@ jobs:
|
|||
cd build
|
||||
cmake -G Xcode .. \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DJARVIS_BUILD_EXAMPLES=OFF \
|
||||
-DJARVIS_BUILD_TESTS=OFF \
|
||||
-DJARVIS_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=tvOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
|
@ -629,7 +629,7 @@ jobs:
|
|||
- name: xcodebuild for swift package
|
||||
id: xcodebuild
|
||||
run: |
|
||||
xcodebuild -scheme llama -destination "${{ matrix.destination }}"
|
||||
xcodebuild -scheme jarvis -destination "${{ matrix.destination }}"
|
||||
|
||||
- name: Build Swift Example
|
||||
id: make_build_swift_example
|
||||
|
@ -705,23 +705,23 @@ jobs:
|
|||
matrix:
|
||||
include:
|
||||
- build: 'noavx-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx2-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx512-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'openblas-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'kompute-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'vulkan-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'llvm-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'msvc-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
@ -807,7 +807,7 @@ jobs:
|
|||
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
|
||||
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
|
||||
cd build
|
||||
$env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
|
||||
$env:JARVIS_SKIP_TESTS_SLOW_ON_EMULATOR = 1
|
||||
& $sde -future -- ctest -L main -C Release --verbose --timeout 900
|
||||
|
||||
- name: Determine tag name
|
||||
|
@ -827,15 +827,15 @@ jobs:
|
|||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
|
||||
Copy-Item LICENSE .\build\bin\Release\jarvis.cpp.txt
|
||||
7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
|
||||
name: llama-bin-win-${{ matrix.build }}.zip
|
||||
path: jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
|
||||
name: jarvis-bin-win-${{ matrix.build }}.zip
|
||||
|
||||
windows-latest-cmake-cuda:
|
||||
runs-on: windows-2019
|
||||
|
@ -865,7 +865,7 @@ jobs:
|
|||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
|
||||
cmake .. -DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
|
||||
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
|
||||
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
|
@ -886,28 +886,28 @@ jobs:
|
|||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
|
||||
7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
|
||||
name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
path: jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
|
||||
name: jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
run: |
|
||||
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
|
||||
$dst='.\build\bin\cudart\'
|
||||
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
|
||||
7z a cudart-jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
|
||||
|
||||
- name: Upload Cuda runtime
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
path: cudart-jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
name: cudart-jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
windows-latest-cmake-sycl:
|
||||
runs-on: windows-latest
|
||||
|
@ -963,14 +963,14 @@ jobs:
|
|||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
|
||||
7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
|
||||
name: llama-bin-win-sycl-x64.zip
|
||||
path: jarvis-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
|
||||
name: jarvis-bin-win-sycl-x64.zip
|
||||
|
||||
windows-latest-cmake-hip:
|
||||
if: ${{ github.event.inputs.create_release != 'true' }}
|
||||
|
@ -1060,13 +1060,13 @@ jobs:
|
|||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
|
||||
7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
path: jarvis-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
name: jarvis-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
|
||||
ios-xcode-build:
|
||||
runs-on: macos-latest
|
||||
|
@ -1076,7 +1076,7 @@ jobs:
|
|||
uses: actions/checkout@v4
|
||||
|
||||
- name: Build Xcode project
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
|
||||
run: xcodebuild -project examples/jarvis.swiftui/jarvis.swiftui.xcodeproj -scheme jarvis.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
|
||||
|
||||
android-build:
|
||||
runs-on: ubuntu-latest
|
||||
|
@ -1098,7 +1098,7 @@ jobs:
|
|||
|
||||
- name: Build
|
||||
run: |
|
||||
cd examples/llama.android
|
||||
cd examples/jarvis.android
|
||||
|
||||
./gradlew build --no-daemon
|
||||
|
||||
|
@ -1261,7 +1261,7 @@ jobs:
|
|||
# sudo apt-get install cmake
|
||||
#
|
||||
# - name: Configure
|
||||
# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
#
|
||||
# - name: Build
|
||||
# run: |
|
||||
|
@ -1300,7 +1300,7 @@ jobs:
|
|||
# - name: Upload binaries
|
||||
# uses: actions/upload-artifact@v4
|
||||
# with:
|
||||
# name: llama-bin-${{ matrix.arch }}
|
||||
# name: jarvis-bin-${{ matrix.arch }}
|
||||
# path: build/bin/${{ matrix.build }}
|
||||
#
|
||||
# windows-blas:
|
||||
|
@ -1339,7 +1339,7 @@ jobs:
|
|||
# run: >
|
||||
# cmake -S . -B ./build -A ${{ matrix.arch }}
|
||||
# -DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
# -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }}
|
||||
# -DJARVIS_SUPPORT_OPENBLAS=${{ matrix.blas }}
|
||||
# -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
|
||||
#
|
||||
# - name: Build
|
||||
|
@ -1355,7 +1355,7 @@ jobs:
|
|||
# if: matrix.blas == 'ON'
|
||||
# uses: actions/upload-artifact@v4
|
||||
# with:
|
||||
# name: llama-blas-bin-${{ matrix.arch }}
|
||||
# name: jarvis-blas-bin-${{ matrix.arch }}
|
||||
# path: build/bin/${{ matrix.build }}
|
||||
#
|
||||
# emscripten:

20 .github/workflows/docker.yml vendored
@ -37,21 +37,21 @@ jobs:
strategy:
matrix:
config:
- { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light", dockerfile: ".devops/jarvis-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server", dockerfile: ".devops/jarvis-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-cuda", dockerfile: ".devops/jarvis-cli-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-cuda", dockerfile: ".devops/jarvis-server-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-musa", dockerfile: ".devops/jarvis-cli-musa.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-musa", dockerfile: ".devops/jarvis-server-musa.Dockerfile", platforms: "linux/amd64" }
- { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
#- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
#- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
#- { tag: "light-rocm", dockerfile: ".devops/jarvis-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
#- { tag: "server-rocm", dockerfile: ".devops/jarvis-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
#- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-intel", dockerfile: ".devops/jarvis-cli-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-intel", dockerfile: ".devops/jarvis-server-intel.Dockerfile", platforms: "linux/amd64" }
steps:
- name: Check out the repo
uses: actions/checkout@v4

2 .github/workflows/labeler.yml vendored
@ -11,7 +11,7 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
repository: "ggerganov/llama.cpp"
repository: "ggerganov/jarvis.cpp"
- uses: actions/labeler@v5
with:
configuration-path: '.github/labeler.yml'

6 .github/workflows/nix-ci-aarch64.yml vendored
@ -47,8 +47,8 @@ jobs:
extra-conf: |
extra-platforms = aarch64-linux
extra-system-features = nixos-test kvm
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
extra-substituters = https://jarvis-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = jarvis-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org

@ -56,7 +56,7 @@ jobs:
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: llama-cpp
name: jarvis-cpp
- name: Show all output paths
run: >
nix run github:nix-community/nix-eval-jobs

10 .github/workflows/nix-ci.yml vendored
@ -34,8 +34,8 @@ jobs:
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
extra-substituters = https://jarvis-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = jarvis-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org

@ -61,8 +61,8 @@ jobs:
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
extra-substituters = https://jarvis-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = jarvis-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org

@ -70,7 +70,7 @@ jobs:
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: llama-cpp
name: jarvis-cpp
- name: Build
run: >
nix run github:Mic92/nix-fast-build

32 .github/workflows/server.yml vendored
@ -21,10 +21,10 @@ on:
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']

env:
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
LLAMA_LOG_VERBOSITY: 10
JARVIS_LOG_COLORS: 1
JARVIS_LOG_PREFIX: 1
JARVIS_LOG_TIMESTAMPS: 1
JARVIS_LOG_VERBOSITY: 10

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}

@ -41,7 +41,7 @@ jobs:
include:
- build_type: Release
sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
fail-fast: false # While -DJARVIS_SANITIZE_THREAD=ON is broken

steps:
- name: Dependencies

@ -99,12 +99,12 @@ jobs:
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DJARVIS_BUILD_SERVER=ON \
-DJARVIS_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target jarvis-server

- name: Build
id: cmake_build

@ -112,11 +112,11 @@ jobs:
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DJARVIS_BUILD_SERVER=ON \
-DJARVIS_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target jarvis-server

- name: Tests
id: server_integration_tests

@ -155,8 +155,8 @@ jobs:
- name: Build
id: cmake_build
run: |
cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
cmake -B build -DJARVIS_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target jarvis-server

- name: Python setup
id: setup_python

@ -180,7 +180,7 @@ jobs:
run: |
cd examples/server/tests
$env:PYTHONIOENCODING = ":replace"
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags jarvis.cpp

- name: Slow tests
id: server_integration_tests_slow

8 .gitignore vendored
@ -48,8 +48,8 @@ build*
!build-info.sh
!build.zig
!docs/build.md
/libllama.so
/llama-*
/libjarvis.so
/jarvis-*
/vulkan-shaders-gen
android-ndk-*
arm_neon.h

@ -57,7 +57,7 @@ cmake-build-*
CMakeSettings.json
compile_commands.json
ggml-metal-embed.metal
llama-batched-swift
jarvis-batched-swift
/rpc-server
out/
tmp/

@ -118,7 +118,7 @@ poetry.toml
/tests/test-double-float
/tests/test-grad0
/tests/test-grammar-parser
/tests/test-llama-grammar
/tests/test-jarvis-grammar
/tests/test-opt
/tests/test-quantize-fns
/tests/test-quantize-perf

118 CMakeLists.txt
@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("llama.cpp" C CXX)
project("jarvis.cpp" C CXX)
include(CheckIncludeFileCXX)

#set(CMAKE_WARN_DEPRECATED YES)

@ -18,20 +18,20 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
|||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||
|
||||
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
||||
set(LLAMA_STANDALONE ON)
|
||||
set(JARVIS_STANDALONE ON)
|
||||
|
||||
include(git-vars)
|
||||
|
||||
# configure project version
|
||||
# TODO
|
||||
else()
|
||||
set(LLAMA_STANDALONE OFF)
|
||||
set(JARVIS_STANDALONE OFF)
|
||||
endif()
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
||||
|
||||
option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
|
||||
option(JARVIS_WASM_SINGLE_FILE "jarvis: embed WASM inside the generated jarvis.js" ON)
|
||||
else()
|
||||
if (MINGW)
|
||||
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
||||
|
@ -51,41 +51,41 @@ endif()
|
|||
#
|
||||
|
||||
# debug
|
||||
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
|
||||
option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
|
||||
option(JARVIS_ALL_WARNINGS "jarvis: enable all compiler warnings" ON)
|
||||
option(JARVIS_ALL_WARNINGS_3RD_PARTY "jarvis: enable all compiler warnings in 3rd party libs" OFF)
|
||||
|
||||
# build
|
||||
option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF)
|
||||
option(JARVIS_FATAL_WARNINGS "jarvis: enable -Werror flag" OFF)
|
||||
|
||||
# sanitizers
|
||||
option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
|
||||
option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
|
||||
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
|
||||
option(JARVIS_SANITIZE_THREAD "jarvis: enable thread sanitizer" OFF)
|
||||
option(JARVIS_SANITIZE_ADDRESS "jarvis: enable address sanitizer" OFF)
|
||||
option(JARVIS_SANITIZE_UNDEFINED "jarvis: enable undefined sanitizer" OFF)
|
||||
|
||||
# utils
|
||||
option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
|
||||
option(JARVIS_BUILD_COMMON "jarvis: build common utils library" ${JARVIS_STANDALONE})
|
||||
|
||||
# extra artifacts
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
||||
option(JARVIS_BUILD_TESTS "jarvis: build tests" ${JARVIS_STANDALONE})
|
||||
option(JARVIS_BUILD_EXAMPLES "jarvis: build examples" ${JARVIS_STANDALONE})
|
||||
option(JARVIS_BUILD_SERVER "jarvis: build server example" ${JARVIS_STANDALONE})
|
||||
|
||||
# 3rd party libs
|
||||
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
|
||||
option(JARVIS_CURL "jarvis: use libcurl to download model from an URL" OFF)
|
||||
|
||||
# Required for relocatable CMake package
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||
|
||||
# override ggml options
|
||||
set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
|
||||
set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
|
||||
set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
|
||||
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
|
||||
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
|
||||
set(GGML_SANITIZE_THREAD ${JARVIS_SANITIZE_THREAD})
|
||||
set(GGML_SANITIZE_ADDRESS ${JARVIS_SANITIZE_ADDRESS})
|
||||
set(GGML_SANITIZE_UNDEFINED ${JARVIS_SANITIZE_UNDEFINED})
|
||||
set(GGML_ALL_WARNINGS ${JARVIS_ALL_WARNINGS})
|
||||
set(GGML_FATAL_WARNINGS ${JARVIS_FATAL_WARNINGS})
|
||||
|
||||
# change the default for these ggml options
|
||||
if (NOT DEFINED GGML_LLAMAFILE)
|
||||
set(GGML_LLAMAFILE_DEFAULT ON)
|
||||
if (NOT DEFINED GGML_JARVISFILE)
|
||||
set(GGML_JARVISFILE_DEFAULT ON)
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED GGML_AMX)
|
||||
|
@ -97,23 +97,23 @@ if (NOT DEFINED GGML_CUDA_GRAPHS)
|
|||
endif()
|
||||
|
||||
# transition helpers
|
||||
function (llama_option_depr TYPE OLD NEW)
|
||||
function (jarvis_option_depr TYPE OLD NEW)
|
||||
if (${OLD})
|
||||
message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
|
||||
set(${NEW} ON PARENT_SCOPE)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
|
||||
llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
|
||||
llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
|
||||
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
|
||||
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
|
||||
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
|
||||
llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
|
||||
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
|
||||
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
|
||||
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
|
||||
jarvis_option_depr(FATAL_ERROR JARVIS_CUBLAS GGML_CUDA)
|
||||
jarvis_option_depr(WARNING JARVIS_CUDA GGML_CUDA)
|
||||
jarvis_option_depr(WARNING JARVIS_KOMPUTE GGML_KOMPUTE)
|
||||
jarvis_option_depr(WARNING JARVIS_METAL GGML_METAL)
|
||||
jarvis_option_depr(WARNING JARVIS_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
|
||||
jarvis_option_depr(WARNING JARVIS_NATIVE GGML_NATIVE)
|
||||
jarvis_option_depr(WARNING JARVIS_RPC GGML_RPC)
|
||||
jarvis_option_depr(WARNING JARVIS_SYCL GGML_SYCL)
|
||||
jarvis_option_depr(WARNING JARVIS_SYCL_F16 GGML_SYCL_F16)
|
||||
jarvis_option_depr(WARNING JARVIS_CANN GGML_CANN)
|
||||
|
||||
#
|
||||
# build the library
|
||||
|
@ -132,18 +132,18 @@ add_subdirectory(src)
|
|||
include(GNUInstallDirs)
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
|
||||
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
|
||||
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
|
||||
set(JARVIS_BUILD_NUMBER ${BUILD_NUMBER})
|
||||
set(JARVIS_BUILD_COMMIT ${BUILD_COMMIT})
|
||||
set(JARVIS_INSTALL_VERSION 0.0.${BUILD_NUMBER})
|
||||
|
||||
set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
|
||||
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
|
||||
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
|
||||
set(JARVIS_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
|
||||
set(JARVIS_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
|
||||
set(JARVIS_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
|
||||
|
||||
|
||||
# At the moment some compile definitions are placed within the ggml/src
|
||||
# directory but not exported on the `ggml` target. This could be improved by
|
||||
# determining _precisely_ which defines are necessary for the llama-config
|
||||
# determining _precisely_ which defines are necessary for the jarvis-config
|
||||
# package.
|
||||
#
|
||||
set(GGML_TRANSIENT_DEFINES)
|
||||
|
@ -158,25 +158,25 @@ if (GGML_TARGET_DEFINES)
|
|||
endif()
|
||||
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
|
||||
|
||||
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
|
||||
install(TARGETS llama LIBRARY PUBLIC_HEADER)
|
||||
set_target_properties(jarvis PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/jarvis.h)
|
||||
install(TARGETS jarvis LIBRARY PUBLIC_HEADER)
|
||||
|
||||
configure_package_config_file(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
|
||||
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama
|
||||
PATH_VARS LLAMA_INCLUDE_INSTALL_DIR
|
||||
LLAMA_LIB_INSTALL_DIR
|
||||
LLAMA_BIN_INSTALL_DIR )
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cmake/jarvis-config.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/jarvis-config.cmake
|
||||
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/jarvis
|
||||
PATH_VARS JARVIS_INCLUDE_INSTALL_DIR
|
||||
JARVIS_LIB_INSTALL_DIR
|
||||
JARVIS_BIN_INSTALL_DIR )
|
||||
|
||||
write_basic_package_version_file(
|
||||
${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
|
||||
VERSION ${LLAMA_INSTALL_VERSION}
|
||||
${CMAKE_CURRENT_BINARY_DIR}/jarvis-version.cmake
|
||||
VERSION ${JARVIS_INSTALL_VERSION}
|
||||
COMPATIBILITY SameMajorVersion)
|
||||
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/jarvis-config.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/jarvis-version.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/jarvis)
|
||||
|
||||
install(
|
||||
FILES convert_hf_to_gguf.py
|
||||
|
@ -190,27 +190,27 @@ install(
|
|||
WORLD_EXECUTE
|
||||
DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
|
||||
configure_file(cmake/llama.pc.in
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
|
||||
configure_file(cmake/jarvis.pc.in
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/jarvis.pc"
|
||||
@ONLY)
|
||||
|
||||
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
|
||||
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/jarvis.pc"
|
||||
DESTINATION lib/pkgconfig)
|
||||
|
||||
#
|
||||
# utils, programs, examples and tests
|
||||
#
|
||||
|
||||
if (LLAMA_BUILD_COMMON)
|
||||
if (JARVIS_BUILD_COMMON)
|
||||
add_subdirectory(common)
|
||||
endif()
|
||||
|
||||
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
|
||||
if (JARVIS_BUILD_COMMON AND JARVIS_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
|
||||
include(CTest)
|
||||
add_subdirectory(tests)
|
||||
endif()
|
||||
|
||||
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
|
||||
if (JARVIS_BUILD_COMMON AND JARVIS_BUILD_EXAMPLES)
|
||||
add_subdirectory(examples)
|
||||
add_subdirectory(pocs)
|
||||
endif()
@ -11,7 +11,7 @@
- Squash-merge PRs
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
- Optionally pick a `<module>` from here: https://github.com/ggerganov/jarvis.cpp/wiki/Modules

# Coding guidelines

@ -22,7 +22,7 @@
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/jarvis.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$



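The `ggml_mul_mat` shape convention above is easier to see with concrete dimensions. The following is a minimal sketch, assuming the ggml C API declared in `ggml.h` of this tree (`ggml_init`, `ggml_new_tensor_2d`, `ggml_mul_mat`, `ggml_free`); the tensor sizes are arbitrary and chosen only for illustration.

```c
#include <stdio.h>
#include "ggml.h"

// Illustrative only: C = ggml_mul_mat(ctx, A, B) computes C^T = A B^T,
// so A and B must agree on ne[0] (the "columns" / contraction dimension),
// and the result has ne[0] = A->ne[1] and ne[1] = B->ne[1].
int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // A: 4 columns x 3 rows, B: 4 columns x 2 rows (both share ne[0] == 4)
    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    struct ggml_tensor * B = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);

    struct ggml_tensor * C = ggml_mul_mat(ctx, A, B);

    // Expected: C->ne[0] == 3 (from A->ne[1]), C->ne[1] == 2 (from B->ne[1])
    printf("C shape: %lld x %lld\n", (long long) C->ne[0], (long long) C->ne[1]);

    ggml_free(ctx);
    return 0;
}
```

Built against either the llama or the jarvis name of the library (the ggml layer is unchanged by this rename), it should print `C shape: 3 x 2`.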
@ -30,4 +30,4 @@

The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:

https://github.com/ggerganov/llama.cpp/projects
https://github.com/ggerganov/jarvis.cpp/projects

@ -7,7 +7,7 @@ import java.util.Scanner;
public class LLMCLI {
public static void main(String[] args) {
// Path to the .exe file
String exePath = "bin/llama-cli.exe";
String exePath = "bin/jarvis-cli.exe";

System.out.println("Enter -h for help");
// Scanner to take user input for various commands

388 Makefile
@ -1,44 +1,44 @@
|
|||
# Define the default target now so that it is always the first target
|
||||
BUILD_TARGETS = \
|
||||
libllava.a \
|
||||
llama-baby-llama \
|
||||
llama-batched \
|
||||
llama-batched-bench \
|
||||
llama-bench \
|
||||
llama-cli \
|
||||
llama-convert-llama2c-to-ggml \
|
||||
llama-embedding \
|
||||
llama-eval-callback \
|
||||
llama-export-lora \
|
||||
llama-gbnf-validator \
|
||||
llama-gguf \
|
||||
llama-gguf-hash \
|
||||
llama-gguf-split \
|
||||
llama-gritlm \
|
||||
llama-imatrix \
|
||||
llama-infill \
|
||||
llama-llava-cli \
|
||||
llama-minicpmv-cli\
|
||||
llama-lookahead \
|
||||
llama-lookup \
|
||||
llama-lookup-create \
|
||||
llama-lookup-merge \
|
||||
llama-lookup-stats \
|
||||
llama-parallel \
|
||||
llama-passkey \
|
||||
llama-perplexity \
|
||||
llama-q8dot \
|
||||
llama-quantize \
|
||||
llama-quantize-stats \
|
||||
llama-retrieval \
|
||||
llama-save-load-state \
|
||||
llama-server \
|
||||
llama-simple \
|
||||
llama-speculative \
|
||||
llama-tokenize \
|
||||
llama-vdot \
|
||||
llama-cvector-generator \
|
||||
llama-gen-docs \
|
||||
jarvis-baby-jarvis \
|
||||
jarvis-batched \
|
||||
jarvis-batched-bench \
|
||||
jarvis-bench \
|
||||
jarvis-cli \
|
||||
jarvis-convert-jarvis2c-to-ggml \
|
||||
jarvis-embedding \
|
||||
jarvis-eval-callback \
|
||||
jarvis-export-lora \
|
||||
jarvis-gbnf-validator \
|
||||
jarvis-gguf \
|
||||
jarvis-gguf-hash \
|
||||
jarvis-gguf-split \
|
||||
jarvis-gritlm \
|
||||
jarvis-imatrix \
|
||||
jarvis-infill \
|
||||
jarvis-llava-cli \
|
||||
jarvis-minicpmv-cli\
|
||||
jarvis-lookahead \
|
||||
jarvis-lookup \
|
||||
jarvis-lookup-create \
|
||||
jarvis-lookup-merge \
|
||||
jarvis-lookup-stats \
|
||||
jarvis-parallel \
|
||||
jarvis-passkey \
|
||||
jarvis-perplexity \
|
||||
jarvis-q8dot \
|
||||
jarvis-quantize \
|
||||
jarvis-quantize-stats \
|
||||
jarvis-retrieval \
|
||||
jarvis-save-load-state \
|
||||
jarvis-server \
|
||||
jarvis-simple \
|
||||
jarvis-speculative \
|
||||
jarvis-tokenize \
|
||||
jarvis-vdot \
|
||||
jarvis-cvector-generator \
|
||||
jarvis-gen-docs \
|
||||
tests/test-c.o
|
||||
|
||||
# Binaries only useful for tests
|
||||
|
@ -52,7 +52,7 @@ TEST_TARGETS = \
|
|||
tests/test-grammar-integration \
|
||||
tests/test-grammar-parser \
|
||||
tests/test-json-schema-to-grammar \
|
||||
tests/test-llama-grammar \
|
||||
tests/test-jarvis-grammar \
|
||||
tests/test-log \
|
||||
tests/test-model-load-cancel \
|
||||
tests/test-opt \
|
||||
|
@ -65,8 +65,8 @@ TEST_TARGETS = \
|
|||
tests/test-tokenizer-1-spm
|
||||
|
||||
# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
|
||||
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
|
||||
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
|
||||
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-jarvis2c-to-ggml \
|
||||
simple batched batched-bench save-load-state server gguf gguf-split eval-callback jarvis-bench libllava.a llava-cli baby-jarvis \
|
||||
retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm
|
||||
|
||||
# Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
|
||||
|
@ -74,80 +74,80 @@ LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding
|
|||
LEGACY_TARGETS_BUILD = main quantize perplexity embedding server
|
||||
|
||||
# Deprecation aliases
|
||||
ifdef LLAMA_CUBLAS
|
||||
$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
|
||||
ifdef JARVIS_CUBLAS
|
||||
$(error JARVIS_CUBLAS is removed. Use GGML_CUDA instead.)
|
||||
endif
|
||||
|
||||
ifdef LLAMA_CUDA
|
||||
ifdef JARVIS_CUDA
|
||||
GGML_CUDA := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_KOMPUTE
|
||||
ifdef JARVIS_KOMPUTE
|
||||
GGML_KOMPUTE := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
ifdef JARVIS_METAL
|
||||
GGML_METAL := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_RPC
|
||||
ifdef JARVIS_RPC
|
||||
GGML_RPC := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SYCL
|
||||
ifdef JARVIS_SYCL
|
||||
GGML_SYCL := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SYCL_F16
|
||||
ifdef JARVIS_SYCL_F16
|
||||
GGML_SYCL_F16 := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_OPENBLAS
|
||||
ifdef JARVIS_OPENBLAS
|
||||
GGML_OPENBLAS := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_OPENBLAS64
|
||||
ifdef JARVIS_OPENBLAS64
|
||||
GGML_OPENBLAS64 := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_BLIS
|
||||
ifdef JARVIS_BLIS
|
||||
GGML_BLIS := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_NO_LLAMAFILE
|
||||
GGML_NO_LLAMAFILE := 1
|
||||
ifdef JARVIS_NO_JARVISFILE
|
||||
GGML_NO_JARVISFILE := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_NO_ACCELERATE
|
||||
ifdef JARVIS_NO_ACCELERATE
|
||||
GGML_NO_ACCELERATE := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_NO_OPENMP
|
||||
ifdef JARVIS_NO_OPENMP
|
||||
GGML_NO_OPENMP := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_NO_METAL
|
||||
ifdef JARVIS_NO_METAL
|
||||
GGML_NO_METAL := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_DISABLE_LOGS
|
||||
ifdef JARVIS_DISABLE_LOGS
|
||||
REMOVE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SERVER_VERBOSE
|
||||
ifdef JARVIS_SERVER_VERBOSE
|
||||
REMOVE_WARNING := 1
|
||||
endif
|
||||
|
||||
|
@ -211,8 +211,8 @@ test: $(TEST_TARGETS)
|
|||
@failures=0; \
|
||||
for test_target in $(TEST_TARGETS); do \
|
||||
if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-jarvis-spm.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-jarvis-bpe.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
|
||||
|
@ -257,7 +257,7 @@ MK_CFLAGS = -std=c11 -fPIC
|
|||
MK_CXXFLAGS = -std=c++11 -fPIC
|
||||
MK_NVCCFLAGS = -std=c++11
|
||||
|
||||
ifdef LLAMA_NO_CCACHE
|
||||
ifdef JARVIS_NO_CCACHE
|
||||
GGML_NO_CCACHE := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
@ -320,7 +320,7 @@ ifdef GGML_SCHED_MAX_COPIES
|
|||
MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(GGML_SCHED_MAX_COPIES)
|
||||
endif
|
||||
|
||||
ifdef LLAMA_DEBUG
|
||||
ifdef JARVIS_DEBUG
|
||||
MK_CFLAGS += -O0 -g
|
||||
MK_CXXFLAGS += -O0 -g
|
||||
MK_LDFLAGS += -g
|
||||
|
@ -336,25 +336,25 @@ else
|
|||
MK_NVCCFLAGS += -O3 -g
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SANITIZE_THREAD
|
||||
ifdef JARVIS_SANITIZE_THREAD
|
||||
MK_CFLAGS += -fsanitize=thread -g
|
||||
MK_CXXFLAGS += -fsanitize=thread -g
|
||||
MK_LDFLAGS += -fsanitize=thread -g
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SANITIZE_ADDRESS
|
||||
ifdef JARVIS_SANITIZE_ADDRESS
|
||||
MK_CFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
|
||||
MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
|
||||
MK_LDFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SANITIZE_UNDEFINED
|
||||
ifdef JARVIS_SANITIZE_UNDEFINED
|
||||
MK_CFLAGS += -fsanitize=undefined -g
|
||||
MK_CXXFLAGS += -fsanitize=undefined -g
|
||||
MK_LDFLAGS += -fsanitize=undefined -g
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SERVER_SSL
|
||||
ifdef JARVIS_SERVER_SSL
|
||||
MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
|
||||
MK_LDFLAGS += -lssl -lcrypto
|
||||
endif
|
||||
|
@ -381,7 +381,7 @@ MK_CXXFLAGS += \
|
|||
-Wmissing-declarations \
|
||||
-Wmissing-noreturn
|
||||
|
||||
ifeq ($(LLAMA_FATAL_WARNINGS),1)
|
||||
ifeq ($(JARVIS_FATAL_WARNINGS),1)
|
||||
MK_CFLAGS += -Werror
|
||||
MK_CXXFLAGS += -Werror
|
||||
endif
|
||||
|
@ -420,7 +420,7 @@ ifeq ($(_WIN32),1)
|
|||
LWINSOCK2 := -lws2_32
|
||||
endif
|
||||
|
||||
ifdef LLAMA_GPROF
|
||||
ifdef JARVIS_GPROF
|
||||
MK_CFLAGS += -pg
|
||||
MK_CXXFLAGS += -pg
|
||||
endif
|
||||
|
@ -448,7 +448,7 @@ endif
|
|||
ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
|
||||
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
|
||||
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
|
||||
# https://github.com/ggerganov/llama.cpp/issues/2922
|
||||
# https://github.com/ggerganov/jarvis.cpp/issues/2922
|
||||
MK_CFLAGS += -Xassembler -muse-unaligned-vector-move
|
||||
MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
|
||||
|
||||
|
@ -574,9 +574,9 @@ ifdef GGML_NVPL
|
|||
OBJ_GGML += ggml/src/ggml-blas.o
|
||||
endif # GGML_NVPL
|
||||
|
||||
ifndef GGML_NO_LLAMAFILE
|
||||
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
|
||||
OBJ_GGML += ggml/src/llamafile/sgemm.o
|
||||
ifndef GGML_NO_JARVISFILE
|
||||
MK_CPPFLAGS += -DGGML_USE_JARVISFILE
|
||||
OBJ_GGML += ggml/src/jarvisfile/sgemm.o
|
||||
endif
|
||||
|
||||
ifndef GGML_NO_AMX
|
||||
|
@ -627,9 +627,9 @@ ifdef GGML_CUDA
|
|||
OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
|
||||
OBJ_GGML += $(OBJ_CUDA_TMPL)
|
||||
|
||||
ifdef LLAMA_FATAL_WARNINGS
|
||||
ifdef JARVIS_FATAL_WARNINGS
|
||||
MK_NVCCFLAGS += -Werror all-warnings
|
||||
endif # LLAMA_FATAL_WARNINGS
|
||||
endif # JARVIS_FATAL_WARNINGS
|
||||
|
||||
ifndef GGML_MUSA
|
||||
ifndef JETSON_EOL_MODULE_DETECT
|
||||
|
@ -637,9 +637,9 @@ ifndef JETSON_EOL_MODULE_DETECT
|
|||
endif # JETSON_EOL_MODULE_DETECT
|
||||
endif # GGML_MUSA
|
||||
|
||||
ifdef LLAMA_DEBUG
|
||||
ifdef JARVIS_DEBUG
|
||||
MK_NVCCFLAGS += -lineinfo
|
||||
endif # LLAMA_DEBUG
|
||||
endif # JARVIS_DEBUG
|
||||
|
||||
ifdef GGML_CUDA_DEBUG
|
||||
MK_NVCCFLAGS += --device-debug
|
||||
|
@ -920,11 +920,11 @@ OBJ_GGML += \
|
|||
ggml/src/ggml-quants.o \
|
||||
ggml/src/ggml-aarch64.o
|
||||
|
||||
OBJ_LLAMA = \
|
||||
src/llama.o \
|
||||
src/llama-vocab.o \
|
||||
src/llama-grammar.o \
|
||||
src/llama-sampling.o \
|
||||
OBJ_JARVIS = \
|
||||
src/jarvis.o \
|
||||
src/jarvis-vocab.o \
|
||||
src/jarvis-grammar.o \
|
||||
src/jarvis-sampling.o \
|
||||
src/unicode.o \
|
||||
src/unicode-data.o
|
||||
|
||||
|
@ -939,19 +939,19 @@ OBJ_COMMON = \
|
|||
common/build-info.o \
|
||||
common/json-schema-to-grammar.o
|
||||
|
||||
OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)
|
||||
OBJ_ALL = $(OBJ_GGML) $(OBJ_JARVIS) $(OBJ_COMMON)
|
||||
|
||||
LIB_GGML = $(LIB_PRE)ggml$(DSO_EXT)
|
||||
LIB_GGML_S = $(LIB_PRE)ggml.a
|
||||
|
||||
LIB_LLAMA = $(LIB_PRE)llama$(DSO_EXT)
|
||||
LIB_LLAMA_S = $(LIB_PRE)llama.a
|
||||
LIB_JARVIS = $(LIB_PRE)jarvis$(DSO_EXT)
|
||||
LIB_JARVIS_S = $(LIB_PRE)jarvis.a
|
||||
|
||||
LIB_COMMON = $(LIB_PRE)common$(DSO_EXT)
|
||||
LIB_COMMON_S = $(LIB_PRE)common.a
|
||||
|
||||
LIB_ALL = $(LIB_GGML) $(LIB_LLAMA) $(LIB_COMMON)
|
||||
LIB_ALL_S = $(LIB_GGML_S) $(LIB_LLAMA_S) $(LIB_COMMON_S)
|
||||
LIB_ALL = $(LIB_GGML) $(LIB_JARVIS) $(LIB_COMMON)
|
||||
LIB_ALL_S = $(LIB_GGML_S) $(LIB_JARVIS_S) $(LIB_COMMON_S)
|
||||
|
||||
GF_CC := $(CC)
|
||||
include scripts/get-flags.mk
|
||||
|
@ -971,8 +971,8 @@ include scripts/get-flags.mk
|
|||
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
|
||||
endif
|
||||
|
||||
ifdef LLAMA_CURL
|
||||
override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
|
||||
ifdef JARVIS_CURL
|
||||
override CXXFLAGS := $(CXXFLAGS) -DJARVIS_USE_CURL
|
||||
override LDFLAGS := $(LDFLAGS) -lcurl
|
||||
endif
|
||||
|
||||
|
@ -980,7 +980,7 @@ endif
|
|||
# Print build information
|
||||
#
|
||||
|
||||
$(info I llama.cpp build info: )
|
||||
$(info I jarvis.cpp build info: )
|
||||
$(info I UNAME_S: $(UNAME_S))
|
||||
$(info I UNAME_P: $(UNAME_P))
|
||||
$(info I UNAME_M: $(UNAME_M))
|
||||
|
@ -1009,30 +1009,30 @@ $(info )
|
|||
|
||||
ifdef DEPRECATE_WARNING
|
||||
$(info !!! DEPRECATION WARNING !!!)
|
||||
$(info The following LLAMA_ options are deprecated and will be removed in the future. Use the GGML_ prefix instead)
|
||||
$(info - LLAMA_CUDA)
|
||||
$(info - LLAMA_METAL)
|
||||
$(info - LLAMA_METAL_EMBED_LIBRARY)
|
||||
$(info - LLAMA_OPENMP)
|
||||
$(info - LLAMA_RPC)
|
||||
$(info - LLAMA_SYCL)
|
||||
$(info - LLAMA_SYCL_F16)
|
||||
$(info - LLAMA_OPENBLAS)
|
||||
$(info - LLAMA_OPENBLAS64)
|
||||
$(info - LLAMA_BLIS)
|
||||
$(info - LLAMA_NO_LLAMAFILE)
|
||||
$(info - LLAMA_NO_ACCELERATE)
|
||||
$(info - LLAMA_NO_OPENMP)
|
||||
$(info - LLAMA_NO_METAL)
|
||||
$(info - LLAMA_NO_CCACHE)
|
||||
$(info The following JARVIS_ options are deprecated and will be removed in the future. Use the GGML_ prefix instead)
|
||||
$(info - JARVIS_CUDA)
|
||||
$(info - JARVIS_METAL)
|
||||
$(info - JARVIS_METAL_EMBED_LIBRARY)
|
||||
$(info - JARVIS_OPENMP)
|
||||
$(info - JARVIS_RPC)
|
||||
$(info - JARVIS_SYCL)
|
||||
$(info - JARVIS_SYCL_F16)
|
||||
$(info - JARVIS_OPENBLAS)
|
||||
$(info - JARVIS_OPENBLAS64)
|
||||
$(info - JARVIS_BLIS)
|
||||
$(info - JARVIS_NO_JARVISFILE)
|
||||
$(info - JARVIS_NO_ACCELERATE)
|
||||
$(info - JARVIS_NO_OPENMP)
|
||||
$(info - JARVIS_NO_METAL)
|
||||
$(info - JARVIS_NO_CCACHE)
|
||||
$(info )
|
||||
endif
|
||||
|
||||
ifdef REMOVE_WARNING
|
||||
$(info !!! REMOVAL WARNING !!!)
|
||||
$(info The following LLAMA_ options have been removed and are no longer supported)
|
||||
$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418))
|
||||
$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
|
||||
$(info The following JARVIS_ options have been removed and are no longer supported)
|
||||
$(info - JARVIS_DISABLE_LOGS (https://github.com/ggerganov/jarvis.cpp/pull/9418))
|
||||
$(info - JARVIS_SERVER_VERBOSE (https://github.com/ggerganov/jarvis.cpp/pull/9418))
|
||||
$(info )
|
||||
endif
|
||||
|
||||
|
@ -1079,13 +1079,13 @@ ggml/src/ggml-blas.o: \
|
|||
ggml/include/ggml-blas.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
ifndef GGML_NO_LLAMAFILE
|
||||
ggml/src/llamafile/sgemm.o: \
|
||||
ggml/src/llamafile/sgemm.cpp \
|
||||
ggml/src/llamafile/sgemm.h \
|
||||
ifndef GGML_NO_JARVISFILE
|
||||
ggml/src/jarvisfile/sgemm.o: \
|
||||
ggml/src/jarvisfile/sgemm.cpp \
|
||||
ggml/src/jarvisfile/sgemm.h \
|
||||
ggml/include/ggml.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
endif # GGML_NO_LLAMAFILE
|
||||
endif # GGML_NO_JARVISFILE
|
||||
|
||||
ifndef GGML_NO_AMX
|
||||
ggml/src/ggml-amx.o: \
|
||||
|
@ -1115,7 +1115,7 @@ $(LIB_GGML_S): \
|
|||
$(OBJ_GGML)
|
||||
ar rcs $(LIB_GGML_S) $^
|
||||
|
||||
# llama
|
||||
# jarvis
|
||||
|
||||
src/unicode.o: \
|
||||
src/unicode.cpp \
|
||||
|
@ -1127,14 +1127,14 @@ src/unicode-data.o: \
|
|||
src/unicode-data.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
src/llama.o: \
|
||||
src/llama.cpp \
|
||||
src/llama-impl.h \
|
||||
src/llama-vocab.h \
|
||||
src/llama-grammar.h \
|
||||
src/llama-sampling.h \
|
||||
src/jarvis.o: \
|
||||
src/jarvis.cpp \
|
||||
src/jarvis-impl.h \
|
||||
src/jarvis-vocab.h \
|
||||
src/jarvis-grammar.h \
|
||||
src/jarvis-sampling.h \
|
||||
src/unicode.h \
|
||||
include/llama.h \
|
||||
include/jarvis.h \
|
||||
ggml/include/ggml-cuda.h \
|
||||
ggml/include/ggml-metal.h \
|
||||
ggml/include/ggml.h \
|
||||
|
@ -1142,37 +1142,37 @@ src/llama.o: \
|
|||
ggml/include/ggml-backend.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
src/llama-vocab.o: \
|
||||
src/llama-vocab.cpp \
|
||||
src/llama-vocab.h \
|
||||
src/llama-impl.h \
|
||||
include/llama.h
|
||||
src/jarvis-vocab.o: \
|
||||
src/jarvis-vocab.cpp \
|
||||
src/jarvis-vocab.h \
|
||||
src/jarvis-impl.h \
|
||||
include/jarvis.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
src/llama-grammar.o: \
|
||||
src/llama-grammar.cpp \
|
||||
src/llama-grammar.h \
|
||||
src/llama-impl.h \
|
||||
src/llama-vocab.h \
|
||||
src/llama-sampling.h \
|
||||
include/llama.h
|
||||
src/jarvis-grammar.o: \
|
||||
src/jarvis-grammar.cpp \
|
||||
src/jarvis-grammar.h \
|
||||
src/jarvis-impl.h \
|
||||
src/jarvis-vocab.h \
|
||||
src/jarvis-sampling.h \
|
||||
include/jarvis.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
src/llama-sampling.o: \
|
||||
src/llama-sampling.cpp \
|
||||
src/llama-sampling.h \
|
||||
src/llama-impl.h \
|
||||
include/llama.h
|
||||
src/jarvis-sampling.o: \
|
||||
src/jarvis-sampling.cpp \
|
||||
src/jarvis-sampling.h \
|
||||
src/jarvis-impl.h \
|
||||
include/jarvis.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
$(LIB_LLAMA): \
|
||||
$(OBJ_LLAMA) \
|
||||
$(LIB_JARVIS): \
|
||||
$(OBJ_JARVIS) \
|
||||
$(LIB_GGML)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
$(LIB_LLAMA_S): \
|
||||
$(OBJ_LLAMA)
|
||||
ar rcs $(LIB_LLAMA_S) $^
|
||||
$(LIB_JARVIS_S): \
|
||||
$(OBJ_JARVIS)
|
||||
ar rcs $(LIB_JARVIS_S) $^
|
||||
|
||||
# common
|
||||
|
||||
|
@ -1183,7 +1183,7 @@ common/common.o: \
|
|||
common/sampling.h \
|
||||
common/json.hpp \
|
||||
common/json-schema-to-grammar.h \
|
||||
include/llama.h
|
||||
include/jarvis.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common/arg.o: \
|
||||
|
@ -1199,7 +1199,7 @@ common/log.o: \
|
|||
common/sampling.o: \
|
||||
common/sampling.cpp \
|
||||
common/sampling.h \
|
||||
include/llama.h
|
||||
include/jarvis.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common/console.o: \
|
||||
|
@ -1224,7 +1224,7 @@ common/ngram-cache.o: \
|
|||
|
||||
$(LIB_COMMON): \
|
||||
$(OBJ_COMMON) \
|
||||
$(LIB_LLAMA) \
|
||||
$(LIB_JARVIS) \
|
||||
$(LIB_GGML)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
|
@ -1246,7 +1246,7 @@ clean:
|
|||
rm -rvf ggml/*.dll
|
||||
rm -rvf ggml/*.so
|
||||
rm -vrf ggml/src/*.o
|
||||
rm -rvf ggml/src/llamafile/*.o
|
||||
rm -rvf ggml/src/jarvisfile/*.o
|
||||
rm -rvf common/build-info.cpp
|
||||
rm -vrf ggml/src/ggml-metal-embed.metal
|
||||
rm -vrf ggml/src/ggml-cuda/*.o
|
||||
|
@ -1269,75 +1269,75 @@ clean:
|
|||
# Helper function that replaces .c, .cpp, and .cu file endings with .o:
|
||||
GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
|
||||
|
||||
llama-cli: examples/main/main.cpp \
|
||||
jarvis-cli: examples/main/main.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@echo
|
||||
@echo '==== Run ./llama-cli -h for help. ===='
|
||||
@echo '==== Run ./jarvis-cli -h for help. ===='
|
||||
@echo
|
||||
|
||||
llama-infill: examples/infill/infill.cpp \
|
||||
jarvis-infill: examples/infill/infill.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-simple: examples/simple/simple.cpp \
|
||||
jarvis-simple: examples/simple/simple.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-tokenize: examples/tokenize/tokenize.cpp \
|
||||
jarvis-tokenize: examples/tokenize/tokenize.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-batched: examples/batched/batched.cpp \
|
||||
jarvis-batched: examples/batched/batched.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-batched-bench: examples/batched-bench/batched-bench.cpp \
|
||||
jarvis-batched-bench: examples/batched-bench/batched-bench.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-quantize: examples/quantize/quantize.cpp \
|
||||
jarvis-quantize: examples/quantize/quantize.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
|
||||
jarvis-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-perplexity: examples/perplexity/perplexity.cpp \
|
||||
jarvis-perplexity: examples/perplexity/perplexity.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-imatrix: examples/imatrix/imatrix.cpp \
|
||||
jarvis-imatrix: examples/imatrix/imatrix.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-embedding: examples/embedding/embedding.cpp \
|
||||
jarvis-embedding: examples/embedding/embedding.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-gritlm: examples/gritlm/gritlm.cpp \
|
||||
jarvis-gritlm: examples/gritlm/gritlm.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-save-load-state: examples/save-load-state/save-load-state.cpp \
|
||||
jarvis-save-load-state: examples/save-load-state/save-load-state.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-gguf: examples/gguf/gguf.cpp \
|
||||
jarvis-gguf: examples/gguf/gguf.cpp \
|
||||
$(OBJ_GGML)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
@ -1354,92 +1354,92 @@ examples/gguf-hash/deps/sha256/sha256.o: \
|
|||
examples/gguf-hash/deps/sha256/sha256.c
|
||||
$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
|
||||
|
||||
llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\
|
||||
jarvis-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-gguf-split: examples/gguf-split/gguf-split.cpp \
|
||||
jarvis-gguf-split: examples/gguf-split/gguf-split.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-eval-callback: examples/eval-callback/eval-callback.cpp \
|
||||
jarvis-eval-callback: examples/eval-callback/eval-callback.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
|
||||
jarvis-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
|
||||
jarvis-convert-jarvis2c-to-ggml: examples/convert-jarvis2c-to-ggml/convert-jarvis2c-to-ggml.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-bench: examples/llama-bench/llama-bench.cpp \
|
||||
jarvis-bench: examples/jarvis-bench/jarvis-bench.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-baby-llama: examples/baby-llama/baby-llama.cpp \
|
||||
jarvis-baby-jarvis: examples/baby-jarvis/baby-jarvis.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-export-lora: examples/export-lora/export-lora.cpp \
|
||||
jarvis-export-lora: examples/export-lora/export-lora.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-retrieval: examples/retrieval/retrieval.cpp \
|
||||
jarvis-retrieval: examples/retrieval/retrieval.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-speculative: examples/speculative/speculative.cpp \
|
||||
jarvis-speculative: examples/speculative/speculative.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-parallel: examples/parallel/parallel.cpp \
|
||||
jarvis-parallel: examples/parallel/parallel.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-lookahead: examples/lookahead/lookahead.cpp \
|
||||
jarvis-lookahead: examples/lookahead/lookahead.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-lookup: examples/lookup/lookup.cpp \
|
||||
jarvis-lookup: examples/lookup/lookup.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-lookup-create: examples/lookup/lookup-create.cpp \
|
||||
jarvis-lookup-create: examples/lookup/lookup-create.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-lookup-merge: examples/lookup/lookup-merge.cpp \
|
||||
jarvis-lookup-merge: examples/lookup/lookup-merge.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-lookup-stats: examples/lookup/lookup-stats.cpp \
|
||||
jarvis-lookup-stats: examples/lookup/lookup-stats.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-passkey: examples/passkey/passkey.cpp \
|
||||
jarvis-passkey: examples/passkey/passkey.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
|
||||
jarvis-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
@ -1450,7 +1450,7 @@ rpc-server: examples/rpc/rpc-server.cpp \
|
|||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
endif # GGML_RPC
|
||||
|
||||
llama-server: \
|
||||
jarvis-server: \
|
||||
examples/server/server.cpp \
|
||||
examples/server/utils.hpp \
|
||||
examples/server/httplib.h \
|
||||
|
@ -1485,7 +1485,7 @@ examples/server/%.hpp: examples/server/public/% Makefile
|
|||
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
|
||||
) > $@
|
||||
|
||||
llama-gen-docs: examples/gen-docs/gen-docs.cpp \
|
||||
jarvis-gen-docs: examples/gen-docs/gen-docs.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
@ -1499,7 +1499,7 @@ libllava.a: examples/llava/llava.cpp \
|
|||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
|
||||
|
||||
llama-llava-cli: examples/llava/llava-cli.cpp \
|
||||
jarvis-llava-cli: examples/llava/llava-cli.cpp \
|
||||
examples/llava/llava.cpp \
|
||||
examples/llava/llava.h \
|
||||
examples/llava/clip.cpp \
|
||||
|
@ -1507,7 +1507,7 @@ llama-llava-cli: examples/llava/llava-cli.cpp \
|
|||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
||||
|
||||
llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
|
||||
jarvis-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
|
||||
examples/llava/llava.cpp \
|
||||
examples/llava/llava.h \
|
||||
examples/llava/clip.cpp \
|
||||
|
@ -1542,7 +1542,7 @@ tests/test-arg-parser: tests/test-arg-parser.cpp \
|
|||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-llama-grammar: tests/test-llama-grammar.cpp \
|
||||
tests/test-jarvis-grammar: tests/test-jarvis-grammar.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
@ -1616,7 +1616,7 @@ tests/test-rope: tests/test-rope.cpp ggml/src/ggml.o \
|
|||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-c.o: tests/test-c.c include/llama.h
|
||||
tests/test-c.o: tests/test-c.c include/jarvis.h
|
||||
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
|
||||
|
||||
tests/test-backend-ops: tests/test-backend-ops.cpp \
|
||||
|
@ -1643,12 +1643,12 @@ tests/test-chat-template: tests/test-chat-template.cpp \
|
|||
# PoCs
|
||||
#
|
||||
|
||||
llama-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \
|
||||
jarvis-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \
|
||||
$(OBJ_GGML)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
|
||||
jarvis-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
|
||||
$(OBJ_GGML)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
@ -1667,17 +1667,17 @@ examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning
|
|||
# Eventually we will want to remove these target from building all the time.
|
||||
main: examples/deprecation-warning/deprecation-warning.o
|
||||
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
||||
@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
|
||||
@echo "NOTICE: The 'main' binary is deprecated. Please use 'jarvis-cli' instead."
|
||||
|
||||
server: examples/deprecation-warning/deprecation-warning.o
|
||||
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
||||
@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
|
||||
@echo "NOTICE: The 'server' binary is deprecated. Please use 'jarvis-server' instead."
|
||||
|
||||
quantize: examples/deprecation-warning/deprecation-warning.o
|
||||
ifneq (,$(wildcard quantize))
|
||||
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
||||
@echo "#########"
|
||||
@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
|
||||
@echo "WARNING: The 'quantize' binary is deprecated. Please use 'jarvis-quantize' instead."
|
||||
@echo " Remove the 'quantize' binary to remove this warning."
|
||||
@echo "#########"
|
||||
endif
|
||||
|
@ -1686,7 +1686,7 @@ perplexity: examples/deprecation-warning/deprecation-warning.o
|
|||
ifneq (,$(wildcard perplexity))
|
||||
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
||||
@echo "#########"
|
||||
@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
|
||||
@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'jarvis-perplexity' instead."
|
||||
@echo " Remove the 'perplexity' binary to remove this warning."
|
||||
@echo "#########"
|
||||
endif
|
||||
|
@ -1695,7 +1695,7 @@ embedding: examples/deprecation-warning/deprecation-warning.o
|
|||
ifneq (,$(wildcard embedding))
|
||||
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
||||
@echo "#########"
|
||||
@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
|
||||
@echo "WARNING: The 'embedding' binary is deprecated. Please use 'jarvis-embedding' instead."
|
||||
@echo " Remove the 'embedding' binary to remove this warning."
|
||||
@echo "#########"
|
||||
endif
|
||||
|
|
|
@ -3,10 +3,10 @@
|
|||
import PackageDescription
|
||||
|
||||
var sources = [
|
||||
"src/llama.cpp",
|
||||
"src/llama-vocab.cpp",
|
||||
"src/llama-grammar.cpp",
|
||||
"src/llama-sampling.cpp",
|
||||
"src/jarvis.cpp",
|
||||
"src/jarvis-vocab.cpp",
|
||||
"src/jarvis-grammar.cpp",
|
||||
"src/jarvis-sampling.cpp",
|
||||
"src/unicode.cpp",
|
||||
"src/unicode-data.cpp",
|
||||
"ggml/src/ggml.c",
|
||||
|
@ -45,7 +45,7 @@ cSettings.append(
|
|||
#endif
|
||||
|
||||
let package = Package(
|
||||
name: "llama",
|
||||
name: "jarvis",
|
||||
platforms: [
|
||||
.macOS(.v12),
|
||||
.iOS(.v14),
|
||||
|
@ -53,11 +53,11 @@ let package = Package(
|
|||
.tvOS(.v14)
|
||||
],
|
||||
products: [
|
||||
.library(name: "llama", targets: ["llama"]),
|
||||
.library(name: "jarvis", targets: ["jarvis"]),
|
||||
],
|
||||
targets: [
|
||||
.target(
|
||||
name: "llama",
|
||||
name: "jarvis",
|
||||
path: ".",
|
||||
exclude: [
|
||||
"cmake",
|
||||
|
|
README.md (170 additions, 170 deletions)
|
@ -1,30 +1,30 @@
|
|||
# llama.cpp
|
||||
# jarvis.cpp
|
||||
|
||||

|
||||

|
||||
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
|
||||
[](https://conan.io/center/llama-cpp)
|
||||
[](https://github.com/ggerganov/jarvis.cpp/actions/workflows/server.yml)
|
||||
[](https://conan.io/center/jarvis-cpp)
|
||||
|
||||
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
|
||||
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/jarvis.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/jarvis.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
|
||||
|
||||
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
|
||||
|
||||
## Recent API changes
|
||||
|
||||
- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
|
||||
- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)
|
||||
- [Changelog for `libjarvis` API](https://github.com/ggerganov/jarvis.cpp/issues/9289)
|
||||
- [Changelog for `jarvis-server` REST API](https://github.com/ggerganov/jarvis.cpp/issues/9291)
|
||||
|
||||
## Hot topics
|
||||
|
||||
- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
|
||||
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
|
||||
- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/jarvis.cpp/discussions/9669**
|
||||
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/jarvis.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
|
||||
|
||||
----
|
||||
|
||||
## Description
|
||||
|
||||
The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
|
||||
The main goal of `jarvis.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
|
||||
variety of hardware - locally and in the cloud.
|
||||
|
||||
- Plain C/C++ implementation without any dependencies
|
||||
|
@ -35,7 +35,7 @@ variety of hardware - locally and in the cloud.
|
|||
- Vulkan and SYCL backend support
|
||||
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
|
||||
|
||||
Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
|
||||
Since its [inception](https://github.com/ggerganov/jarvis.cpp/issues/33#issuecomment-1465108022), the project has
|
||||
improved significantly thanks to many contributions. It is the main playground for developing new features for the
|
||||
[ggml](https://github.com/ggerganov/ggml) library.
|
||||
|
||||
|
@ -52,22 +52,22 @@ Typically finetunes of the base models below are supported as well.
|
|||
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
|
||||
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
|
||||
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
|
||||
- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
|
||||
- [X] [BERT](https://github.com/ggerganov/jarvis.cpp/pull/5423)
|
||||
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
|
||||
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
|
||||
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
|
||||
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
|
||||
- [X] [Starcoder models](https://github.com/ggerganov/jarvis.cpp/pull/3187)
|
||||
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
|
||||
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
|
||||
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
|
||||
- [X] [MPT](https://github.com/ggerganov/jarvis.cpp/pull/3417)
|
||||
- [X] [Bloom](https://github.com/ggerganov/jarvis.cpp/pull/3553)
|
||||
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
|
||||
- [X] [StableLM models](https://huggingface.co/stabilityai)
|
||||
- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
|
||||
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
|
||||
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
|
||||
- [x] [PLaMo-13B](https://github.com/ggerganov/jarvis.cpp/pull/3557)
|
||||
- [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
|
||||
- [x] [GPT-2](https://huggingface.co/gpt2)
|
||||
- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
|
||||
- [x] [Orion 14B](https://github.com/ggerganov/jarvis.cpp/pull/5118)
|
||||
- [x] [InternLM2](https://huggingface.co/models?search=internlm2)
|
||||
- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
|
||||
- [x] [Gemma](https://ai.google.dev/gemma)
|
||||
|
@ -111,36 +111,36 @@ Typically finetunes of the base models below are supported as well.
|
|||
|
||||
**Bindings:**
|
||||
|
||||
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
|
||||
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
|
||||
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
|
||||
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
|
||||
- Python: [abetlen/jarvis-cpp-python](https://github.com/abetlen/jarvis-cpp-python)
|
||||
- Go: [go-skynet/go-jarvis.cpp](https://github.com/go-skynet/go-jarvis.cpp)
|
||||
- Node.js: [withcatai/node-jarvis-cpp](https://github.com/withcatai/node-jarvis-cpp)
|
||||
- JS/TS (jarvis.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/jarviscpp)
|
||||
- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli)
|
||||
- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
|
||||
- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
|
||||
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
|
||||
- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
|
||||
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
|
||||
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
|
||||
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
|
||||
- JavaScript/Wasm (works in browser): [tangledgroup/jarvis-cpp-wasm](https://github.com/tangledgroup/jarvis-cpp-wasm)
|
||||
- Typescript/Wasm (nicer API, available on npm): [ngxson/wjarvis](https://github.com/ngxson/wjarvis)
|
||||
- Ruby: [yoshoku/jarvis_cpp.rb](https://github.com/yoshoku/jarvis_cpp.rb)
|
||||
- Rust (more features): [edgenai/jarvis_cpp-rs](https://github.com/edgenai/jarvis_cpp-rs)
|
||||
- Rust (nicer API): [mdrokz/rust-jarvis.cpp](https://github.com/mdrokz/rust-jarvis.cpp)
|
||||
- Rust (more direct bindings): [utilityai/jarvis-cpp-rs](https://github.com/utilityai/jarvis-cpp-rs)
|
||||
- C#/.NET: [SciSharp/JarvisSharp](https://github.com/SciSharp/JarvisSharp)
|
||||
- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
|
||||
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
|
||||
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
|
||||
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
|
||||
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
|
||||
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
|
||||
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
|
||||
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
|
||||
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
|
||||
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
|
||||
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
|
||||
- Clojure: [phronmophobic/jarvis.clj](https://github.com/phronmophobic/jarvis.clj)
|
||||
- React Native: [mybigday/jarvis.rn](https://github.com/mybigday/jarvis.rn)
|
||||
- Java: [kherud/java-jarvis.cpp](https://github.com/kherud/java-jarvis.cpp)
|
||||
- Zig: [deins/jarvis.cpp.zig](https://github.com/Deins/jarvis.cpp.zig)
|
||||
- Flutter/Dart: [netdur/jarvis_cpp_dart](https://github.com/netdur/jarvis_cpp_dart)
|
||||
- PHP (API bindings and features built on top of jarvis.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/jarvis.cpp/pull/6326)
|
||||
- Guile Scheme: [guile_jarvis_cpp](https://savannah.nongnu.org/projects/guile-jarvis-cpp)
|
||||
- Swift [srgtuszy/jarvis-cpp-swift](https://github.com/srgtuszy/jarvis-cpp-swift)
|
||||
- Swift [ShenghaiWang/SwiftJarvis](https://github.com/ShenghaiWang/SwiftJarvis)
|
||||
|
||||
**UI:**
|
||||
|
||||
Unless otherwise noted these projects are open-source with permissive licensing:
|
||||
|
||||
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
|
||||
- [iohub/collama](https://github.com/iohub/coLLaMA)
|
||||
- [iohub/cojarvis](https://github.com/iohub/coLLaMA)
|
||||
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
|
||||
- [nat/openplayground](https://github.com/nat/openplayground)
|
||||
- [Faraday](https://faraday.dev/) (proprietary)
|
||||
|
@ -149,9 +149,9 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
|||
- [ramalama](https://github.com/containers/ramalama) (MIT)
|
||||
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
|
||||
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
|
||||
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
|
||||
- [Mozilla-Ocho/jarvisfile](https://github.com/Mozilla-Ocho/jarvisfile)
|
||||
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
|
||||
- [ollama/ollama](https://github.com/ollama/ollama)
|
||||
- [ojarvis/ojarvis](https://github.com/ojarvis/ojarvis)
|
||||
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
|
||||
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
|
||||
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
|
||||
|
@ -173,24 +173,24 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
|||
- [AIKit](https://github.com/sozercan/aikit) (MIT)
|
||||
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
|
||||
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
|
||||
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
|
||||
- [Jarvis Assistant](https://github.com/vietanhdev/jarvis-assistant) (GPL)
|
||||
- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
|
||||
|
||||
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
||||
*(to have a project listed here, it should clearly state that it depends on `jarvis.cpp`)*
|
||||
|
||||
**Tools:**
|
||||
|
||||
- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
|
||||
- [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp
|
||||
- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
|
||||
- [akx/ojarvis-dl](https://github.com/akx/ojarvis-dl) – download models from the Ojarvis library to be used directly with jarvis.cpp
|
||||
- [crashr/gppm](https://github.com/crashr/gppm) – launch jarvis.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
|
||||
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
|
||||
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)
|
||||
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-jarvis-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)
|
||||
|
||||
**Infrastructure:**
|
||||
|
||||
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
|
||||
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for jarvis.cpp
|
||||
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
|
||||
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
|
||||
- [jarvis_cpp_canister](https://github.com/onicai/jarvis_cpp_canister) - jarvis.cpp as a smart contract on the Internet Computer, using WebAssembly
|
||||
|
||||
**Games:**
|
||||
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
|
||||
|
@ -201,8 +201,8 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
|||
<summary>Typical run using LLaMA v2 13B on M2 Ultra</summary>
|
||||
|
||||
```
|
||||
$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
|
||||
I llama.cpp build info:
|
||||
$ make -j && ./jarvis-cli -m models/jarvis-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
|
||||
I jarvis.cpp build info:
|
||||
I UNAME_S: Darwin
|
||||
I UNAME_P: arm
|
||||
I UNAME_M: arm64
|
||||
|
@ -215,12 +215,12 @@ I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
|
|||
make: Nothing to be done for `default'.
|
||||
main: build = 1041 (cf658ad)
|
||||
main: seed = 1692823051
|
||||
llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
|
||||
llama_model_loader: - type f32: 81 tensors
|
||||
llama_model_loader: - type q4_0: 281 tensors
|
||||
llama_model_loader: - type q6_K: 1 tensors
|
||||
jarvis_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/jarvis-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
|
||||
jarvis_model_loader: - type f32: 81 tensors
|
||||
jarvis_model_loader: - type q4_0: 281 tensors
|
||||
jarvis_model_loader: - type q6_K: 1 tensors
|
||||
llm_load_print_meta: format = GGUF V1 (latest)
|
||||
llm_load_print_meta: arch = llama
|
||||
llm_load_print_meta: arch = jarvis
|
||||
llm_load_print_meta: vocab type = SPM
|
||||
llm_load_print_meta: n_vocab = 32000
|
||||
llm_load_print_meta: n_merges = 0
|
||||
|
@ -248,8 +248,8 @@ llm_load_print_meta: LF token = 13 '<0x0A>'
|
|||
llm_load_tensors: ggml ctx size = 0.11 MB
|
||||
llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state)
|
||||
...................................................................................................
|
||||
llama_new_context_with_model: kv self size = 400.00 MB
|
||||
llama_new_context_with_model: compute buffer total size = 75.41 MB
|
||||
jarvis_new_context_with_model: kv self size = 400.00 MB
|
||||
jarvis_new_context_with_model: compute buffer total size = 75.41 MB
|
||||
|
||||
system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
|
||||
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
|
||||
|
@ -271,11 +271,11 @@ How does a Website Work?
|
|||
A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
|
||||
The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
|
||||
How to
|
||||
llama_print_timings: load time = 576.45 ms
|
||||
llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second)
|
||||
llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second)
|
||||
llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second)
|
||||
llama_print_timings: total time = 25431.49 ms
|
||||
jarvis_print_timings: load time = 576.45 ms
|
||||
jarvis_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second)
|
||||
jarvis_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second)
|
||||
jarvis_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second)
|
||||
jarvis_print_timings: total time = 25431.49 ms
|
||||
```
|
||||
|
||||
</details>
|
||||
|
@ -297,14 +297,14 @@ Here are the end-to-end binary build and model conversion steps for most support
|
|||
|
||||
Firstly, you need to get the binary. There are different methods that you can follow:
|
||||
- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md)
|
||||
- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md)
|
||||
- Method 2: If you are using MacOS or Linux, you can install jarvis.cpp via [brew, flox or nix](./docs/install.md)
|
||||
- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md)
|
||||
- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases)
|
||||
- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/jarvis.cpp/releases)
|
||||
|
||||
You can run a basic completion using this command:
|
||||
|
||||
```bash
|
||||
llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
|
||||
jarvis-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
|
||||
|
||||
# Output:
|
||||
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
|
||||
|
@ -317,7 +317,7 @@ See [this page](./examples/main/README.md) for a full list of parameters.
|
|||
If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter:
|
||||
|
||||
```bash
|
||||
llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
|
||||
jarvis-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
|
||||
|
||||
# Output:
|
||||
# > hi, who are you?
|
||||
|
@ -327,26 +327,26 @@ llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
|
|||
# Easy peasy! The answer to 1+1 is... 2!
|
||||
```
|
||||
|
||||
By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
|
||||
By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/jarvis.cpp/wiki/Templates-supported-by-jarvis_chat_apply_template)
|
||||
|
||||
```bash
|
||||
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
|
||||
./jarvis-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
|
||||
```
|
||||
|
||||
You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
|
||||
|
||||
```bash
|
||||
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
|
||||
./jarvis-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
|
||||
```
|
||||
|
||||
### Web server

[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
[jarvis.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.

Example usage:

```bash
./llama-server -m your_model.gguf --port 8080
./jarvis-server -m your_model.gguf --port 8080

# Basic web UI can be accessed via browser: http://localhost:8080
# Chat completion endpoint: http://localhost:8080/v1/chat/completions
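# As a quick check of the OpenAI-compatible endpoint above, a plain curl request works.
# This is a minimal sketch; the full set of accepted fields is documented in examples/server/README.md:
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{"messages": [{"role": "user", "content": "Hello!"}]}'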
|
@ -369,16 +369,16 @@ Here is an example of a few-shot interaction, invoked with the command
./examples/chat-13B.sh

# custom arguments using a 13B model
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
./jarvis-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
```

Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `jarvis-cli` example program.



### Persistent Interaction

The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
The prompt, user inputs, and model generations can be saved and resumed across calls to `./jarvis-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
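As a minimal sketch of the cache flag described above (model path and prompt are illustrative), a prompt cache can also be used directly with `jarvis-cli`; rerunning the same command reuses the cached prompt evaluation:

```bash
# First run fills chat.prompt.bin; later runs with the same flags resume from it (illustrative paths)
./jarvis-cli -m ./models/13B/ggml-model-q4_0.gguf \
    --prompt-cache chat.prompt.bin \
    -p "You are a helpful assistant" -n 64
```
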
```bash
|
||||
# Start a new chat
|
||||
|
@ -397,10 +397,10 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
|
|||
|
||||
### Constrained output with grammars

`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
`jarvis.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:

```bash
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
./jarvis-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
```

The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
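As a small sketch of what a hand-written rule can look like, a grammar can also be passed inline with `--grammar` (the rule below is purely illustrative):

```bash
# Constrain the answer to exactly "yes" or "no" (illustrative)
./jarvis-cli -m ./models/13B/ggml-model-q4_0.gguf -n 8 \
    --grammar 'root ::= "yes" | "no"' \
    -p 'Is the sky blue on a clear day? Answer:'
```
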
|
@ -409,7 +409,7 @@ For authoring more complex JSON grammars, you can also check out https://grammar
|
|||
|
||||
## Build

Please refer to [Build llama.cpp locally](./docs/build.md)
Please refer to [Build jarvis.cpp locally](./docs/build.md)
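For orientation, a typical CPU-only CMake build looks roughly like this (a sketch; backend-specific flags such as CUDA or Metal are covered in `docs/build.md`):

```bash
cmake -B build
cmake --build build --config Release -j $(nproc)
```
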
|
||||
## Supported backends
|
||||
|
||||
|
@ -430,11 +430,11 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
|
|||
### Prepare and Quantize

> [!NOTE]
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `jarvis.cpp` main every 6 hours.

To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-jarvis-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.

Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
Note: `convert.py` has been moved to `examples/convert_legacy_jarvis.py` and shouldn't be used for anything other than `Jarvis/Jarvis2/Mistral` models and their derivatives.
It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.

To learn more about quantizing model, [read this documentation](./examples/quantize/README.md)
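As a rough end-to-end sketch (the model directory is illustrative; `convert_hf_to_gguf.py` and the `jarvis-quantize` tool are the ones referenced above and exercised in `ci/run.sh` later in this change):

```bash
# Convert Hugging Face weights to an F16 GGUF, then quantize it to Q4_0 (illustrative paths)
python3 convert_hf_to_gguf.py models/my-model/ --outfile models/my-model/ggml-model-f16.gguf
./jarvis-quantize models/my-model/ggml-model-f16.gguf models/my-model/ggml-model-q4_0.gguf q4_0
```
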
|
@ -444,17 +444,17 @@ To learn more about quantizing model, [read this documentation](./examples/quant
|
|||
You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).

To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md)
To learn more how to measure perplexity using jarvis.cpp, [read this documentation](./examples/perplexity/README.md)
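A typical invocation looks roughly like the one used by `ci/run.sh` later in this change (model and dataset paths are illustrative):

```bash
# Perplexity of a quantized model over a raw text file (illustrative paths)
./jarvis-perplexity -m models/my-model/ggml-model-q4_0.gguf -f wikitext-2-raw/wiki.test.raw
```
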
|
||||
## Contributing

- Contributors can open PRs
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
- Collaborators can push to branches in the `jarvis.cpp` repo and merge PRs into the `master` branch
- Collaborators will be invited based on contributions
- Any help with managing issues, PRs and projects is very appreciated!
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- See [good first issues](https://github.com/ggerganov/jarvis.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/jarvis.cpp/discussions/205)
- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
|
||||
|
||||
## Other documentations
|
||||
|
@ -470,13 +470,13 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
|
|||
- [Running on Docker](./docs/docker.md)
|
||||
- [Build on Android](./docs/android.md)
|
||||
- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
|
||||
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
|
||||
- [GGML tips & tricks](https://github.com/ggerganov/jarvis.cpp/wiki/GGML-Tips-&-Tricks)
|
||||
|
||||
**Seminal papers and background on the models**
|
||||
|
||||
If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
|
||||
- LLaMA:
|
||||
- [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/)
|
||||
- [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-jarvis-meta-ai/)
|
||||
- [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
|
||||
- GPT-3
|
||||
- [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# Security Policy
|
||||
|
||||
- [**Using llama.cpp securely**](#using-llamacpp-securely)
|
||||
- [**Using jarvis.cpp securely**](#using-jarviscpp-securely)
|
||||
- [Untrusted models](#untrusted-models)
|
||||
- [Untrusted inputs](#untrusted-inputs)
|
||||
- [Data privacy](#data-privacy)
|
||||
|
@ -8,7 +8,7 @@
|
|||
- [Multi-Tenant environments](#multi-tenant-environments)
|
||||
- [**Reporting a vulnerability**](#reporting-a-vulnerability)
|
||||
|
||||
## Using llama.cpp securely
|
||||
## Using jarvis.cpp securely
|
||||
|
||||
### Untrusted models
|
||||
Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources.
|
||||
|
@ -57,11 +57,11 @@ If you intend to run multiple models in parallel with shared memory, it is your
|
|||
|
||||
## Reporting a vulnerability

Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
Beware that none of the topics under [Using jarvis.cpp securely](#using-jarviscpp-securely) are considered vulnerabilities of LLaMA C++.

<!-- normal version -->
However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.

Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).
Please disclose it as a private [security advisory](https://github.com/ggerganov/jarvis.cpp/security/advisories/new).

A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
# CI

In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
In addition to [Github Actions](https://github.com/ggerganov/jarvis.cpp/actions) `jarvis.cpp` uses a custom CI framework:

https://github.com/ggml-org/ci

It monitors the `master` branch for new commits and runs the
[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
[ci/run.sh](https://github.com/ggerganov/jarvis.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
to cover various hardware architectures, including GPU and Apple Silicon instances.
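The script can also be exercised locally; a rough sketch follows, assuming its two positional arguments are an output directory and a mount/cache directory (backend toggles follow the `GG_BUILD_*` variables checked inside `ci/run.sh` itself):

```bash
# Run the CI suite locally, writing logs to ./tmp/results and caching models in ./tmp/mnt (illustrative)
mkdir -p tmp
bash ./ci/run.sh ./tmp/results ./tmp/mnt
```
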
|
||||
|
|
ci/run.sh (268 additions, 268 deletions)
|
@ -36,7 +36,7 @@ sd=`dirname $0`
|
|||
cd $sd/../
|
||||
SRC=`pwd`
|
||||
|
||||
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
|
||||
CMAKE_EXTRA="-DJARVIS_FATAL_WARNINGS=ON"
|
||||
|
||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
||||
|
@ -217,7 +217,7 @@ function gg_sum_test_scripts_release {
|
|||
function gg_get_model {
|
||||
local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
|
||||
local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
|
||||
local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
||||
local gguf_2="$MNT/models/open-jarvis/7B-v2/ggml-model-f16.gguf"
|
||||
if [[ -s $gguf_0 ]]; then
|
||||
echo -n "$gguf_0"
|
||||
elif [[ -s $gguf_1 ]]; then
|
||||
|
@ -236,7 +236,7 @@ function gg_run_ctest_with_model_debug {
|
|||
local model; model=$(gg_get_model)
|
||||
cd build-ci-debug
|
||||
set -e
|
||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(JARVISCPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
set +e
|
||||
cd ..
|
||||
}
|
||||
|
@ -247,7 +247,7 @@ function gg_run_ctest_with_model_release {
|
|||
local model; model=$(gg_get_model)
|
||||
cd build-ci-release
|
||||
set -e
|
||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(JARVISCPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
set +e
|
||||
cd ..
|
||||
}
|
||||
|
@ -272,24 +272,24 @@ function gg_sum_ctest_with_model_release {
|
|||
gg_printf '```\n'
|
||||
}
|
||||
|
||||
# open_llama_7b_v2
|
||||
# open_jarvis_7b_v2
|
||||
|
||||
function gg_run_open_llama_7b_v2 {
|
||||
function gg_run_open_jarvis_7b_v2 {
|
||||
cd ${SRC}
|
||||
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
|
||||
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/config.json
|
||||
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/resolve/main/tokenizer.model
|
||||
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/tokenizer_config.json
|
||||
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/special_tokens_map.json
|
||||
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/pytorch_model.bin.index.json
|
||||
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
|
||||
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
|
||||
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/generation_config.json
|
||||
|
||||
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||
|
||||
path_models="../models-mnt/open-llama/7B-v2"
|
||||
path_models="../models-mnt/open-jarvis/7B-v2"
|
||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||
|
||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||
|
@ -299,7 +299,7 @@ function gg_run_open_llama_7b_v2 {
|
|||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
python3 ../examples/convert_legacy_jarvis.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
|
@ -315,47 +315,47 @@ function gg_run_open_llama_7b_v2 {
|
|||
|
||||
wiki_test="${path_wiki}/wiki.test.raw"
|
||||
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/jarvis-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/jarvis-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/jarvis-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/jarvis-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/jarvis-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/jarvis-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
(time ./bin/jarvis-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
@ -387,7 +387,7 @@ function gg_run_open_llama_7b_v2 {
|
|||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_open_llama_7b_v2 {
|
||||
function gg_sum_open_jarvis_7b_v2 {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'OpenLLaMA 7B-v2:\n'
|
||||
|
@ -449,45 +449,45 @@ function gg_run_pythia_1_4b {
|
|||
|
||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/jarvis-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/jarvis-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/jarvis-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/jarvis-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/jarvis-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/jarvis-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
(time ./bin/jarvis-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
@ -580,47 +580,47 @@ function gg_run_pythia_2_8b {
|
|||
|
||||
wiki_test="${path_wiki}/wiki.test.raw"
|
||||
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/jarvis-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/jarvis-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/jarvis-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/jarvis-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/jarvis-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/jarvis-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
(time ./bin/jarvis-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
@ -704,10 +704,10 @@ function gg_run_embd_bge_small {
|
|||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/jarvis-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/jarvis-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
@ -752,7 +752,7 @@ function gg_run_rerank_tiny {
|
|||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
|
||||
# for this model, the SEP token is "</s>"
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
|
||||
(time ./bin/jarvis-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
|
||||
|
||||
# sample output
|
||||
# rerank score 0: 0.029
|
||||
|
@ -804,11 +804,11 @@ function gg_check_build_requirements {
|
|||
|
||||
## main
|
||||
|
||||
export LLAMA_LOG_PREFIX=1
|
||||
export LLAMA_LOG_TIMESTAMPS=1
|
||||
export JARVIS_LOG_PREFIX=1
|
||||
export JARVIS_LOG_TIMESTAMPS=1
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
|
||||
# Create symlink: ./jarvis.cpp/models-mnt -> $MNT/models/models-mnt
|
||||
rm -rf ${SRC}/models-mnt
|
||||
mnt_models=${MNT}/models
|
||||
mkdir -p ${mnt_models}
|
||||
|
@ -841,7 +841,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
|||
test $ret -eq 0 && gg_run pythia_1_4b
|
||||
else
|
||||
test $ret -eq 0 && gg_run pythia_2_8b
|
||||
#test $ret -eq 0 && gg_run open_llama_7b_v2
|
||||
#test $ret -eq 0 && gg_run open_jarvis_7b_v2
|
||||
fi
|
||||
test $ret -eq 0 && gg_run ctest_with_model_debug
|
||||
test $ret -eq 0 && gg_run ctest_with_model_release
|
||||
|
@@ -1,7 +1,7 @@
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(JARVIS_VERSION @JARVIS_INSTALL_VERSION@)
set(JARVIS_BUILD_COMMIT @JARVIS_BUILD_COMMIT@)
set(JARVIS_BUILD_NUMBER @JARVIS_BUILD_NUMBER@)
set(JARVIS_SHARED_LIB @BUILD_SHARED_LIBS@)

set(GGML_BLAS @GGML_BLAS@)
set(GGML_CUDA @GGML_CUDA@)
@@ -18,9 +18,9 @@ set(GGML_OPENMP @GGML_OPENMP@)

@PACKAGE_INIT@

set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
set_and_check(JARVIS_INCLUDE_DIR "@PACKAGE_JARVIS_INCLUDE_INSTALL_DIR@")
set_and_check(JARVIS_LIB_DIR "@PACKAGE_JARVIS_LIB_INSTALL_DIR@")
set_and_check(JARVIS_BIN_DIR "@PACKAGE_JARVIS_BIN_INSTALL_DIR@")

# Ensure transient dependencies satisfied

@@ -66,25 +66,25 @@ endif()

find_library(ggml_LIBRARY ggml
REQUIRED
HINTS ${LLAMA_LIB_DIR})
HINTS ${JARVIS_LIB_DIR})

find_library(llama_LIBRARY llama
find_library(jarvis_LIBRARY jarvis
REQUIRED
HINTS ${LLAMA_LIB_DIR})
HINTS ${JARVIS_LIB_DIR})

set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
set(_jarvis_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
set(_jarvis_transient_defines "@GGML_TRANSIENT_DEFINES@")

add_library(llama UNKNOWN IMPORTED)
add_library(jarvis UNKNOWN IMPORTED)

set_target_properties(llama
set_target_properties(jarvis
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
INTERFACE_INCLUDE_DIRECTORIES "${JARVIS_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "${_jarvis_link_deps}"
INTERFACE_COMPILE_DEFINITIONS "${_jarvis_transient_defines}"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${llama_LIBRARY}"
IMPORTED_LOCATION "${jarvis_LIBRARY}"
INTERFACE_COMPILE_FEATURES cxx_std_11
POSITION_INDEPENDENT_CODE ON )

check_required_components(Llama)
check_required_components(Jarvis)
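The config template above exports an imported `jarvis` target whose include directory, link dependencies, and compile definitions come from the `JARVIS_*` variables it sets. As a rough illustration, a downstream project that calls `find_package(Jarvis)` and links the `jarvis` target could compile a file like the sketch below; the header name `jarvis.h` and the `jarvis_print_system_info()` call are taken from other hunks of this diff, everything else is an assumption rather than part of this commit.

```cpp
// main.cpp -- hypothetical consumer of the imported "jarvis" CMake target.
// Assumes the renamed public header is jarvis.h and that
// jarvis_print_system_info() returns a printable C string, as its use in
// common/common.cpp later in this diff suggests.
#include <cstdio>

#include "jarvis.h"

int main() {
    // The imported target carries JARVIS_INCLUDE_DIR and the link deps,
    // so the consumer only needs: target_link_libraries(app PRIVATE jarvis)
    std::printf("%s\n", jarvis_print_system_info());
    return 0;
}
```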
@@ -3,8 +3,8 @@ exec_prefix=${prefix}
libdir=${exec_prefix}/lib
includedir=${prefix}/include

Name: llama
Name: jarvis
Description: Port of Facebook's LLaMA model in C/C++
Version: @PROJECT_VERSION@
Libs: -L${libdir} -lllama
Libs: -L${libdir} -ljarvis
Cflags: -I${includedir}
@@ -74,17 +74,17 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()

set(LLAMA_COMMON_EXTRA_LIBS build_info)
set(JARVIS_COMMON_EXTRA_LIBS build_info)

# Use curl to download model url
if (LLAMA_CURL)
if (JARVIS_CURL)
find_package(CURL REQUIRED)
add_definitions(-DLLAMA_USE_CURL)
add_definitions(-DJARVIS_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
find_library(CURL_LIBRARY curl REQUIRED)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
set(JARVIS_COMMON_EXTRA_LIBS ${JARVIS_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
endif ()

target_include_directories(${TARGET} PUBLIC .)
target_compile_features (${TARGET} PUBLIC cxx_std_11)
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
target_link_libraries (${TARGET} PRIVATE ${JARVIS_COMMON_EXTRA_LIBS} PUBLIC jarvis Threads::Threads)
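The `JARVIS_CURL` option above only adds the `-DJARVIS_USE_CURL` definition and links libcurl into the `common` target; the C++ sources then branch on that macro, as the common/common.cpp hunks further down show. A minimal sketch of the same guard pattern (the helper name is made up for illustration):

```cpp
// Sketch of how the JARVIS_USE_CURL definition added by the CMake snippet
// above is consumed in C++ code. download_backend() is a hypothetical name.
#if defined(JARVIS_USE_CURL)
#include <curl/curl.h>

static const char * download_backend() {
    return "libcurl";
}
#else
static const char * download_backend() {
    return "none (built without JARVIS_CURL)";
}
#endif
```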
common/arg.cpp (382 changes): file diff suppressed because it is too large
common/arg.h (12 changes):
@@ -11,7 +11,7 @@
//

struct common_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::set<enum jarvis_example> examples = {JARVIS_EXAMPLE_COMMON};
std::vector<const char *> args;
const char * value_hint = nullptr; // help text or example for arg value
const char * value_hint_2 = nullptr; // for second arg value
@@ -52,17 +52,17 @@ struct common_arg {
void (*handler)(common_params & params, const std::string &, const std::string &)
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}

common_arg & set_examples(std::initializer_list<enum llama_example> examples);
common_arg & set_examples(std::initializer_list<enum jarvis_example> examples);
common_arg & set_env(const char * env);
common_arg & set_sparam();
bool in_example(enum llama_example ex);
bool in_example(enum jarvis_example ex);
bool get_value_from_env(std::string & output);
bool has_value_from_env();
std::string to_string();
};

struct common_params_context {
enum llama_example ex = LLAMA_EXAMPLE_COMMON;
enum jarvis_example ex = JARVIS_EXAMPLE_COMMON;
common_params & params;
std::vector<common_arg> options;
void(*print_usage)(int, char **) = nullptr;
@@ -71,7 +71,7 @@ struct common_params_context {

// parse input arguments from CLI
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
bool common_params_parse(int argc, char ** argv, common_params & params, jarvis_example ex, void(*print_usage)(int, char **) = nullptr);

// function to be used by test-arg-parser
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
common_params_context common_params_parser_init(common_params & params, jarvis_example ex, void(*print_usage)(int, char **) = nullptr);
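The renamed declarations above keep their previous shape: `common_params_parse` takes the argv pair, a `common_params` struct, a `jarvis_example` selector, and an optional usage callback. A rough sketch of a tool entry point built on them; the header names and the `common_init()` call come from other hunks of this diff, the rest is illustrative.

```cpp
// Hypothetical entry point using common_params_parse as declared above.
#include <cstdio>

#include "arg.h"     // assumed location of common_params_parse (common/arg.h)
#include "common.h"  // assumed location of common_params / common_init

static void print_usage(int /*argc*/, char ** argv) {
    std::printf("usage: %s [options]\n", argv[0]);
}

int main(int argc, char ** argv) {
    common_params params;
    // JARVIS_EXAMPLE_COMMON mirrors the default used in common_arg above.
    if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_COMMON, print_usage)) {
        return 1;  // an invalid value already printed the per-argument usage
    }
    common_init();  // defined in common/common.cpp below: sets up logging
    return 0;
}
```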
@@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
int JARVIS_BUILD_NUMBER = @BUILD_NUMBER@;
char const *JARVIS_COMMIT = "@BUILD_COMMIT@";
char const *JARVIS_COMPILER = "@BUILD_COMPILER@";
char const *JARVIS_BUILD_TARGET = "@BUILD_TARGET@";
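build-info.cpp.in only defines the four renamed globals; they are consumed by the `LOG_INF("build: ...")` line in `common_init()` further down. A small sketch of reading them from another translation unit (the extern block mirrors the definitions; the standalone `main()` is illustrative only):

```cpp
// Hypothetical sketch: printing the build metadata defined above from
// another translation unit, after the @BUILD_*@ placeholders are substituted.
#include <cstdio>

extern int          JARVIS_BUILD_NUMBER;
extern char const * JARVIS_COMMIT;
extern char const * JARVIS_COMPILER;
extern char const * JARVIS_BUILD_TARGET;

int main() {
    std::printf("build: %d (%s) with %s for %s\n",
                JARVIS_BUILD_NUMBER, JARVIS_COMMIT, JARVIS_COMPILER, JARVIS_BUILD_TARGET);
    return 0;
}
```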
@ -8,7 +8,7 @@
|
|||
#define JSON_ASSERT GGML_ASSERT
|
||||
#include "json.hpp"
|
||||
#include "json-schema-to-grammar.h"
|
||||
#include "llama.h"
|
||||
#include "jarvis.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cinttypes>
|
||||
|
@ -48,7 +48,7 @@
|
|||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#if defined(LLAMA_USE_CURL)
|
||||
#if defined(JARVIS_USE_CURL)
|
||||
#include <curl/curl.h>
|
||||
#include <curl/easy.h>
|
||||
#include <future>
|
||||
|
@ -58,7 +58,7 @@
|
|||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#if defined(LLAMA_USE_CURL)
|
||||
#if defined(JARVIS_USE_CURL)
|
||||
#ifdef __linux__
|
||||
#include <linux/limits.h>
|
||||
#elif defined(_WIN32)
|
||||
|
@ -66,8 +66,8 @@
|
|||
#else
|
||||
#include <sys/syslimits.h>
|
||||
#endif
|
||||
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
|
||||
#endif // LLAMA_USE_CURL
|
||||
#define JARVIS_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
|
||||
#endif // JARVIS_USE_CURL
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
|
@ -364,8 +364,8 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
|
|||
}
|
||||
|
||||
void common_init() {
|
||||
llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
|
||||
if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
|
||||
jarvis_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
|
||||
if (LOG_DEFAULT_JARVIS <= common_log_verbosity_thold) {
|
||||
common_log_add(common_log_main(), level, "%s", text);
|
||||
}
|
||||
}, NULL);
|
||||
|
@ -376,7 +376,7 @@ void common_init() {
|
|||
const char * build_type = " (debug)";
|
||||
#endif
|
||||
|
||||
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
|
||||
LOG_INF("build: %d (%s) with %s for %s%s\n", JARVIS_BUILD_NUMBER, JARVIS_COMMIT, JARVIS_COMPILER, JARVIS_BUILD_TARGET, build_type);
|
||||
}
|
||||
|
||||
std::string common_params_get_system_info(const common_params & params) {
|
||||
|
@ -389,9 +389,9 @@ std::string common_params_get_system_info(const common_params & params) {
|
|||
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
|
||||
// TODO: windows + arm64 + mingw64
|
||||
DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
|
||||
os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
|
||||
os << " / " << logicalProcessorCount << " | " << jarvis_print_system_info();
|
||||
#else
|
||||
os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
|
||||
os << " / " << std::thread::hardware_concurrency() << " | " << jarvis_print_system_info();
|
||||
#endif
|
||||
|
||||
return os.str();
|
||||
|
@ -483,7 +483,7 @@ std::string string_from(const std::vector<int> & values) {
|
|||
return buf.str();
|
||||
}
|
||||
|
||||
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
|
||||
std::string string_from(const struct jarvis_context * ctx, const std::vector<jarvis_token> & tokens) {
|
||||
std::stringstream buf;
|
||||
|
||||
buf << "[ ";
|
||||
|
@ -514,7 +514,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
|
|||
return buf.str();
|
||||
}
|
||||
|
||||
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
|
||||
std::string string_from(const struct jarvis_context * ctx, const struct jarvis_batch & batch) {
|
||||
std::stringstream buf;
|
||||
|
||||
buf << "[ ";
|
||||
|
@ -586,27 +586,27 @@ void string_process_escapes(std::string & input) {
|
|||
input.resize(output_idx);
|
||||
}
|
||||
|
||||
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
|
||||
bool string_parse_kv_override(const char * data, std::vector<jarvis_model_kv_override> & overrides) {
|
||||
const char * sep = strchr(data, '=');
|
||||
if (sep == nullptr || sep - data >= 128) {
|
||||
LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
|
||||
return false;
|
||||
}
|
||||
llama_model_kv_override kvo;
|
||||
jarvis_model_kv_override kvo;
|
||||
std::strncpy(kvo.key, data, sep - data);
|
||||
kvo.key[sep - data] = 0;
|
||||
sep++;
|
||||
if (strncmp(sep, "int:", 4) == 0) {
|
||||
sep += 4;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.tag = JARVIS_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.val_i64 = std::atol(sep);
|
||||
} else if (strncmp(sep, "float:", 6) == 0) {
|
||||
sep += 6;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
|
||||
kvo.tag = JARVIS_KV_OVERRIDE_TYPE_FLOAT;
|
||||
kvo.val_f64 = std::atof(sep);
|
||||
} else if (strncmp(sep, "bool:", 5) == 0) {
|
||||
sep += 5;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
|
||||
kvo.tag = JARVIS_KV_OVERRIDE_TYPE_BOOL;
|
||||
if (std::strcmp(sep, "true") == 0) {
|
||||
kvo.val_bool = true;
|
||||
} else if (std::strcmp(sep, "false") == 0) {
|
||||
|
@ -617,7 +617,7 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
|
|||
}
|
||||
} else if (strncmp(sep, "str:", 4) == 0) {
|
||||
sep += 4;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
||||
kvo.tag = JARVIS_KV_OVERRIDE_TYPE_STR;
|
||||
if (strlen(sep) > 127) {
|
||||
LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
|
||||
return false;
|
||||
|
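As the parser above shows, each override string has the form `<key>=<type>:<value>` with `<type>` one of `int`, `float`, `bool`, or `str`; keys and `str` values are limited to 127 characters. A usage sketch follows; the keys are placeholders and the header names are assumptions.

```cpp
// Hypothetical usage of string_parse_kv_override as implemented above.
#include <cstdio>
#include <vector>

#include "common.h"  // assumed to declare string_parse_kv_override
#include "jarvis.h"  // assumed to declare jarvis_model_kv_override

int main() {
    std::vector<jarvis_model_kv_override> kv_overrides;

    const char * samples[] = {
        "example.int.key=int:42",       // -> JARVIS_KV_OVERRIDE_TYPE_INT
        "example.float.key=float:0.5",  // -> JARVIS_KV_OVERRIDE_TYPE_FLOAT
        "example.bool.key=bool:true",   // -> JARVIS_KV_OVERRIDE_TYPE_BOOL
        "example.str.key=str:hello",    // -> JARVIS_KV_OVERRIDE_TYPE_STR
    };

    for (const char * s : samples) {
        if (!string_parse_kv_override(s, kv_overrides)) {
            std::fprintf(stderr, "rejected override: %s\n", s);
        }
    }

    std::printf("accepted %zu overrides\n", kv_overrides.size());
    return 0;
}
```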
@ -788,8 +788,8 @@ std::string fs_get_cache_directory() {
|
|||
}
|
||||
return p;
|
||||
};
|
||||
if (getenv("LLAMA_CACHE")) {
|
||||
cache_directory = std::getenv("LLAMA_CACHE");
|
||||
if (getenv("JARVIS_CACHE")) {
|
||||
cache_directory = std::getenv("JARVIS_CACHE");
|
||||
} else {
|
||||
#ifdef __linux__
|
||||
if (std::getenv("XDG_CACHE_HOME")) {
|
||||
|
@ -803,7 +803,7 @@ std::string fs_get_cache_directory() {
|
|||
cache_directory = std::getenv("LOCALAPPDATA");
|
||||
#endif // __linux__
|
||||
cache_directory = ensure_trailing_slash(cache_directory);
|
||||
cache_directory += "llama.cpp";
|
||||
cache_directory += "jarvis.cpp";
|
||||
}
|
||||
return ensure_trailing_slash(cache_directory);
|
||||
}
|
||||
|
@ -824,16 +824,16 @@ std::string fs_get_cache_file(const std::string & filename) {
|
|||
//
|
||||
struct common_init_result common_init_from_params(common_params & params) {
|
||||
common_init_result iparams;
|
||||
auto mparams = common_model_params_to_llama(params);
|
||||
auto mparams = common_model_params_to_jarvis(params);
|
||||
|
||||
llama_model * model = nullptr;
|
||||
jarvis_model * model = nullptr;
|
||||
|
||||
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
|
||||
model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
||||
} else if (!params.model_url.empty()) {
|
||||
model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
||||
} else {
|
||||
model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||
model = jarvis_load_model_from_file(params.model.c_str(), mparams);
|
||||
}
|
||||
|
||||
if (model == NULL) {
|
||||
|
@ -844,58 +844,58 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
if (params.reranking) {
|
||||
bool ok = true;
|
||||
|
||||
if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
|
||||
if (jarvis_token_bos(model) == JARVIS_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
||||
if (jarvis_token_eos(model) == JARVIS_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
|
||||
if (jarvis_token_sep(model) == JARVIS_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
llama_free_model(model);
|
||||
jarvis_free_model(model);
|
||||
|
||||
return iparams;
|
||||
}
|
||||
}
|
||||
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
auto cparams = common_context_params_to_jarvis(params);
|
||||
|
||||
llama_context * lctx = llama_new_context_with_model(model, cparams);
|
||||
jarvis_context * lctx = jarvis_new_context_with_model(model, cparams);
|
||||
if (lctx == NULL) {
|
||||
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
||||
llama_free_model(model);
|
||||
jarvis_free_model(model);
|
||||
return iparams;
|
||||
}
|
||||
|
||||
if (!params.control_vectors.empty()) {
|
||||
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
|
||||
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
|
||||
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = jarvis_n_layer(model);
|
||||
|
||||
const auto cvec = common_control_vector_load(params.control_vectors);
|
||||
if (cvec.n_embd == -1) {
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
jarvis_free(lctx);
|
||||
jarvis_free_model(model);
|
||||
|
||||
return iparams;
|
||||
}
|
||||
|
||||
int err = llama_control_vector_apply(lctx,
|
||||
int err = jarvis_control_vector_apply(lctx,
|
||||
cvec.data.data(),
|
||||
cvec.data.size(),
|
||||
cvec.n_embd,
|
||||
params.control_vector_layer_start,
|
||||
params.control_vector_layer_end);
|
||||
if (err) {
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
jarvis_free(lctx);
|
||||
jarvis_free_model(model);
|
||||
|
||||
return iparams;
|
||||
}
|
||||
|
@ -906,11 +906,11 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
common_lora_adapter_container loaded_la;
|
||||
loaded_la.path = la.path;
|
||||
loaded_la.scale = la.scale;
|
||||
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
|
||||
loaded_la.adapter = jarvis_lora_adapter_init(model, la.path.c_str());
|
||||
if (loaded_la.adapter == nullptr) {
|
||||
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
jarvis_free(lctx);
|
||||
jarvis_free_model(model);
|
||||
return iparams;
|
||||
}
|
||||
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
|
||||
|
@ -919,7 +919,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
common_lora_adapters_apply(lctx, iparams.lora_adapters);
|
||||
}
|
||||
|
||||
if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
||||
if (params.sparams.ignore_eos && jarvis_token_eos(model) == JARVIS_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
||||
params.sparams.ignore_eos = false;
|
||||
}
|
||||
|
@ -927,35 +927,35 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
if (params.warmup) {
|
||||
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
||||
|
||||
std::vector<llama_token> tmp;
|
||||
llama_token bos = llama_token_bos(model);
|
||||
llama_token eos = llama_token_eos(model);
|
||||
std::vector<jarvis_token> tmp;
|
||||
jarvis_token bos = jarvis_token_bos(model);
|
||||
jarvis_token eos = jarvis_token_eos(model);
|
||||
// some models (e.g. T5) don't have a BOS token
|
||||
if (bos != LLAMA_TOKEN_NULL) {
|
||||
if (bos != JARVIS_TOKEN_NULL) {
|
||||
tmp.push_back(bos);
|
||||
}
|
||||
if (eos != LLAMA_TOKEN_NULL) {
|
||||
if (eos != JARVIS_TOKEN_NULL) {
|
||||
tmp.push_back(eos);
|
||||
}
|
||||
if (tmp.empty()) {
|
||||
tmp.push_back(0);
|
||||
}
|
||||
|
||||
if (llama_model_has_encoder(model)) {
|
||||
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
|
||||
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
||||
if (jarvis_model_has_encoder(model)) {
|
||||
jarvis_encode(lctx, jarvis_batch_get_one(tmp.data(), tmp.size()));
|
||||
jarvis_token decoder_start_token_id = jarvis_model_decoder_start_token(model);
|
||||
if (decoder_start_token_id == -1) {
|
||||
decoder_start_token_id = bos;
|
||||
}
|
||||
tmp.clear();
|
||||
tmp.push_back(decoder_start_token_id);
|
||||
}
|
||||
if (llama_model_has_decoder(model)) {
|
||||
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
|
||||
if (jarvis_model_has_decoder(model)) {
|
||||
jarvis_decode(lctx, jarvis_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
|
||||
}
|
||||
llama_kv_cache_clear(lctx);
|
||||
llama_synchronize(lctx);
|
||||
llama_perf_context_reset(lctx);
|
||||
jarvis_kv_cache_clear(lctx);
|
||||
jarvis_synchronize(lctx);
|
||||
jarvis_perf_context_reset(lctx);
|
||||
}
|
||||
|
||||
iparams.model = model;
|
||||
|
@ -964,17 +964,17 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
return iparams;
|
||||
}
|
||||
|
||||
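Taken together, `common_init_from_params` above opens the model (HF repo, URL, or local file), creates the context, applies control vectors and LoRA adapters, and optionally runs the warmup pass before returning its result. A rough caller-side sketch; only the `.model` field is visible in this diff, and the header name is an assumption.

```cpp
// Hypothetical caller of common_init_from_params as defined above.
#include <cstdio>

#include "common.h"  // assumed to declare common_params / common_init_from_params

int run(common_params & params) {
    common_init_result init = common_init_from_params(params);
    if (init.model == nullptr) {  // .model is filled in near the end of the function above
        std::fprintf(stderr, "failed to load model '%s'\n", params.model.c_str());
        return 1;
    }
    // ... generation / embedding work would go here; the result also owns
    // the jarvis_context created above, which a real caller must release.
    return 0;
}
```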
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
|
||||
llama_lora_adapter_clear(ctx);
|
||||
void common_lora_adapters_apply(struct jarvis_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
|
||||
jarvis_lora_adapter_clear(ctx);
|
||||
for (auto & la : lora_adapters) {
|
||||
if (la.scale != 0.0f) {
|
||||
llama_lora_adapter_set(ctx, la.adapter, la.scale);
|
||||
jarvis_lora_adapter_set(ctx, la.adapter, la.scale);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct llama_model_params common_model_params_to_llama(const common_params & params) {
|
||||
auto mparams = llama_model_default_params();
|
||||
struct jarvis_model_params common_model_params_to_jarvis(const common_params & params) {
|
||||
auto mparams = jarvis_model_default_params();
|
||||
|
||||
if (params.n_gpu_layers != -1) {
|
||||
mparams.n_gpu_layers = params.n_gpu_layers;
|
||||
|
@ -1025,8 +1025,8 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
|
|||
throw std::runtime_error("Unsupported cache type: " + s);
|
||||
}
|
||||
|
||||
struct llama_context_params common_context_params_to_llama(const common_params & params) {
|
||||
auto cparams = llama_context_default_params();
|
||||
struct jarvis_context_params common_context_params_to_jarvis(const common_params & params) {
|
||||
auto cparams = jarvis_context_default_params();
|
||||
|
||||
cparams.n_ctx = params.n_ctx;
|
||||
cparams.n_seq_max = params.n_parallel;
|
||||
|
@ -1056,7 +1056,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
|||
|
||||
if (params.reranking) {
|
||||
cparams.embeddings = true;
|
||||
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
|
||||
cparams.pooling_type = JARVIS_POOLING_TYPE_RANK;
|
||||
}
|
||||
|
||||
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
|
||||
|
@ -1081,7 +1081,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
|
|||
return tpp;
|
||||
}
|
||||
|
||||
#ifdef LLAMA_USE_CURL
|
||||
#ifdef JARVIS_USE_CURL
|
||||
|
||||
#define CURL_MAX_RETRY 3
|
||||
#define CURL_RETRY_DELAY_SECONDS 2
|
||||
|
@ -1279,7 +1279,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
|
|||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
|
||||
|
||||
// helper function to hide password in URL
|
||||
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
|
||||
auto jarvis_download_hide_password_in_url = [](const std::string & url) -> std::string {
|
||||
std::size_t protocol_pos = url.find("://");
|
||||
if (protocol_pos == std::string::npos) {
|
||||
return url; // Malformed URL
|
||||
|
@ -1295,7 +1295,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
|
|||
|
||||
// start the download
|
||||
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
||||
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
||||
jarvis_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
||||
if (!was_perform_successful) {
|
||||
return false;
|
||||
|
@ -1329,11 +1329,11 @@ static bool common_download_file(const std::string & url, const std::string & pa
|
|||
return true;
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_url(
|
||||
struct jarvis_model * common_load_model_from_url(
|
||||
const char * model_url,
|
||||
const char * path_model,
|
||||
const char * hf_token,
|
||||
const struct llama_model_params & params) {
|
||||
const struct jarvis_model_params & params) {
|
||||
// Basic validation of the model_url
|
||||
if (!model_url || strlen(model_url) == 0) {
|
||||
LOG_ERR("%s: invalid model_url\n", __func__);
|
||||
|
@ -1367,17 +1367,17 @@ struct llama_model * common_load_model_from_url(
|
|||
|
||||
if (n_split > 1) {
|
||||
char split_prefix[PATH_MAX] = {0};
|
||||
char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
||||
char split_url_prefix[JARVIS_CURL_MAX_URL_LENGTH] = {0};
|
||||
|
||||
// Verify the first split file format
|
||||
// and extract split URL and PATH prefixes
|
||||
{
|
||||
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
|
||||
if (!jarvis_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
|
||||
if (!jarvis_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
|
||||
return NULL;
|
||||
}
|
||||
|
@ -1388,10 +1388,10 @@ struct llama_model * common_load_model_from_url(
|
|||
for (int idx = 1; idx < n_split; idx++) {
|
||||
futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
|
||||
char split_path[PATH_MAX] = {0};
|
||||
llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
|
||||
jarvis_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
|
||||
|
||||
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
||||
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
|
||||
char split_url[JARVIS_CURL_MAX_URL_LENGTH] = {0};
|
||||
jarvis_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
|
||||
|
||||
return common_download_file(split_url, split_path, hf_token);
|
||||
}, idx));
|
||||
|
@ -1405,19 +1405,19 @@ struct llama_model * common_load_model_from_url(
|
|||
}
|
||||
}
|
||||
|
||||
return llama_load_model_from_file(path_model, params);
|
||||
return jarvis_load_model_from_file(path_model, params);
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
struct jarvis_model * common_load_model_from_hf(
|
||||
const char * repo,
|
||||
const char * model,
|
||||
const char * path_model,
|
||||
const char * hf_token,
|
||||
const struct llama_model_params & params) {
|
||||
const struct jarvis_model_params & params) {
|
||||
// construct hugging face model url:
|
||||
//
|
||||
// --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
|
||||
// https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
|
||||
// --repo ggml-org/models --file tinyjarvis-1.1b/ggml-model-f16.gguf
|
||||
// https://huggingface.co/ggml-org/models/resolve/main/tinyjarvis-1.1b/ggml-model-f16.gguf
|
||||
//
|
||||
// --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
|
||||
// https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
|
||||
|
@ -1433,42 +1433,42 @@ struct llama_model * common_load_model_from_hf(
|
|||
|
||||
#else
|
||||
|
||||
struct llama_model * common_load_model_from_url(
|
||||
struct jarvis_model * common_load_model_from_url(
|
||||
const char * /*model_url*/,
|
||||
const char * /*path_model*/,
|
||||
const char * /*hf_token*/,
|
||||
const struct llama_model_params & /*params*/) {
|
||||
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
|
||||
const struct jarvis_model_params & /*params*/) {
|
||||
LOG_WRN("%s: jarvis.cpp built without libcurl, downloading from an url not supported.\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
struct jarvis_model * common_load_model_from_hf(
|
||||
const char * /*repo*/,
|
||||
const char * /*model*/,
|
||||
const char * /*path_model*/,
|
||||
const char * /*hf_token*/,
|
||||
const struct llama_model_params & /*params*/) {
|
||||
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
|
||||
const struct jarvis_model_params & /*params*/) {
|
||||
LOG_WRN("%s: jarvis.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
#endif // LLAMA_USE_CURL
|
||||
#endif // JARVIS_USE_CURL
|
||||
|
||||
//
|
||||
// Batch utils
|
||||
//
|
||||
|
||||
void common_batch_clear(struct llama_batch & batch) {
|
||||
void common_batch_clear(struct jarvis_batch & batch) {
|
||||
batch.n_tokens = 0;
|
||||
}
|
||||
|
||||
void common_batch_add(
|
||||
struct llama_batch & batch,
|
||||
llama_token id,
|
||||
llama_pos pos,
|
||||
const std::vector<llama_seq_id> & seq_ids,
|
||||
struct jarvis_batch & batch,
|
||||
jarvis_token id,
|
||||
jarvis_pos pos,
|
||||
const std::vector<jarvis_seq_id> & seq_ids,
|
||||
bool logits) {
|
||||
GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
|
||||
GGML_ASSERT(batch.seq_id[batch.n_tokens] && "jarvis_batch size exceeded");
|
||||
|
||||
batch.token [batch.n_tokens] = id;
|
||||
batch.pos [batch.n_tokens] = pos;
|
||||
|
@ -1485,26 +1485,26 @@ void common_batch_add(
|
|||
// Vocab utils
|
||||
//
|
||||
|
||||
std::vector<llama_token> common_tokenize(
|
||||
const struct llama_context * ctx,
|
||||
std::vector<jarvis_token> common_tokenize(
|
||||
const struct jarvis_context * ctx,
|
||||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special) {
|
||||
return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
|
||||
return common_tokenize(jarvis_get_model(ctx), text, add_special, parse_special);
|
||||
}
|
||||
|
||||
std::vector<llama_token> common_tokenize(
|
||||
const struct llama_model * model,
|
||||
std::vector<jarvis_token> common_tokenize(
|
||||
const struct jarvis_model * model,
|
||||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special) {
|
||||
// upper limit for the number of tokens
|
||||
int n_tokens = text.length() + 2 * add_special;
|
||||
std::vector<llama_token> result(n_tokens);
|
||||
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
std::vector<jarvis_token> result(n_tokens);
|
||||
n_tokens = jarvis_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
int check = jarvis_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
|
@ -1512,13 +1512,13 @@ std::vector<llama_token> common_tokenize(
|
|||
return result;
|
||||
}
|
||||
|
||||
std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
|
||||
std::string common_token_to_piece(const struct jarvis_context * ctx, jarvis_token token, bool special) {
|
||||
std::string piece;
|
||||
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\0'
|
||||
const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
|
||||
const int n_chars = jarvis_token_to_piece(jarvis_get_model(ctx), token, &piece[0], piece.size(), 0, special);
|
||||
if (n_chars < 0) {
|
||||
piece.resize(-n_chars);
|
||||
int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
|
||||
int check = jarvis_token_to_piece(jarvis_get_model(ctx), token, &piece[0], piece.size(), 0, special);
|
||||
GGML_ASSERT(check == -n_chars);
|
||||
}
|
||||
else {
|
||||
|
@ -1528,13 +1528,13 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
|
|||
return piece;
|
||||
}
|
||||
|
||||
std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
|
||||
std::string common_detokenize(jarvis_context * ctx, const std::vector<jarvis_token> & tokens, bool special) {
|
||||
std::string text;
|
||||
text.resize(std::max(text.capacity(), tokens.size()));
|
||||
int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
|
||||
int32_t n_chars = jarvis_detokenize(jarvis_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
|
||||
if (n_chars < 0) {
|
||||
text.resize(-n_chars);
|
||||
n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
|
||||
n_chars = jarvis_detokenize(jarvis_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
|
||||
GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
|
||||
}
|
||||
|
||||
|
@ -1549,18 +1549,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
|
|||
//
|
||||
|
||||
bool common_chat_verify_template(const std::string & tmpl) {
|
||||
llama_chat_message chat[] = {{"user", "test"}};
|
||||
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
|
||||
jarvis_chat_message chat[] = {{"user", "test"}};
|
||||
int res = jarvis_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
|
||||
return res >= 0;
|
||||
}
|
||||
|
||||
std::string common_chat_apply_template(const struct llama_model * model,
|
||||
std::string common_chat_apply_template(const struct jarvis_model * model,
|
||||
const std::string & tmpl,
|
||||
const std::vector<common_chat_msg> & msgs,
|
||||
bool add_ass) {
|
||||
int alloc_size = 0;
|
||||
bool fallback = false; // indicates whether we must fall back to the default chatml template
|
||||
std::vector<llama_chat_message> chat;
|
||||
std::vector<jarvis_chat_message> chat;
|
||||
for (auto & msg : msgs) {
|
||||
chat.push_back({msg.role.c_str(), msg.content.c_str()});
|
||||
alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
|
||||
|
@ -1570,17 +1570,17 @@ std::string common_chat_apply_template(const struct llama_model * model,
|
|||
std::vector<char> buf(alloc_size);
|
||||
|
||||
// run the first time to get the total output length
|
||||
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
int32_t res = jarvis_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
|
||||
// error: chat template is not supported
|
||||
if (res < 0) {
|
||||
if (ptr_tmpl != nullptr) {
|
||||
// if the custom "tmpl" is not supported, we throw an error
|
||||
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
|
||||
// this is a bit redundant (for good), since we're not sure if user validated the custom template with jarvis_chat_verify_template()
|
||||
throw std::runtime_error("this custom template is not supported");
|
||||
} else {
|
||||
// If the built-in template is not supported, we default to chatml
|
||||
res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
res = jarvis_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
fallback = true;
|
||||
}
|
||||
}
|
||||
|
@ -1588,7 +1588,7 @@ std::string common_chat_apply_template(const struct llama_model * model,
|
|||
// if it turns out that our buffer is too small, we resize it
|
||||
if ((size_t) res > buf.size()) {
|
||||
buf.resize(res);
|
||||
res = llama_chat_apply_template(
|
||||
res = jarvis_chat_apply_template(
|
||||
fallback ? nullptr : model,
|
||||
fallback ? "chatml" : ptr_tmpl,
|
||||
chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
|
@ -1598,7 +1598,7 @@ std::string common_chat_apply_template(const struct llama_model * model,
|
|||
return formatted_chat;
|
||||
}
|
||||
|
||||
std::string common_chat_format_single(const struct llama_model * model,
|
||||
std::string common_chat_format_single(const struct jarvis_model * model,
|
||||
const std::string & tmpl,
|
||||
const std::vector<common_chat_msg> & past_msg,
|
||||
const common_chat_msg & new_msg,
|
||||
|
@ -1618,7 +1618,7 @@ std::string common_chat_format_single(const struct llama_model * model,
|
|||
return ss.str();
|
||||
}
|
||||
|
||||
std::string common_chat_format_example(const struct llama_model * model,
|
||||
std::string common_chat_format_example(const struct jarvis_model * model,
|
||||
const std::string & tmpl) {
|
||||
std::vector<common_chat_msg> msgs = {
|
||||
{"system", "You are a helpful assistant"},
|
||||
|
@ -1633,14 +1633,14 @@ std::string common_chat_format_example(const struct llama_model * model,
|
|||
// KV cache utils
|
||||
//
|
||||
|
||||
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
|
||||
void common_kv_cache_dump_view(const jarvis_kv_cache_view & view, int row_size) {
|
||||
static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
|
||||
|
||||
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
|
||||
view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
|
||||
|
||||
llama_kv_cache_view_cell * c_curr = view.cells;
|
||||
llama_seq_id * cs_curr = view.cells_sequences;
|
||||
jarvis_kv_cache_view_cell * c_curr = view.cells;
|
||||
jarvis_seq_id * cs_curr = view.cells_sequences;
|
||||
|
||||
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
|
||||
if (i % row_size == 0) {
|
||||
|
@ -1656,15 +1656,15 @@ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
|
|||
printf("\n=== Done dumping\n");
|
||||
}
|
||||
|
||||
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
|
||||
void common_kv_cache_dump_view_seqs(const jarvis_kv_cache_view & view, int row_size) {
|
||||
static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
|
||||
|
||||
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
|
||||
view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
|
||||
|
||||
std::unordered_map<llama_seq_id, size_t> seqs;
|
||||
llama_kv_cache_view_cell * c_curr = view.cells;
|
||||
llama_seq_id * cs_curr = view.cells_sequences;
|
||||
std::unordered_map<jarvis_seq_id, size_t> seqs;
|
||||
jarvis_kv_cache_view_cell * c_curr = view.cells;
|
||||
jarvis_seq_id * cs_curr = view.cells_sequences;
|
||||
|
||||
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
|
||||
for (int j = 0; j < view.n_seq_max; j++) {
|
||||
|
@ -1949,12 +1949,12 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
|
|||
}
|
||||
}
|
||||
|
||||
void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
|
||||
void yaml_dump_non_result_info(FILE * stream, const common_params & params, const jarvis_context * lctx,
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
||||
const auto & sparams = params.sparams;
|
||||
|
||||
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
|
||||
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
|
||||
fprintf(stream, "build_commit: %s\n", JARVIS_COMMIT);
|
||||
fprintf(stream, "build_number: %d\n", JARVIS_BUILD_NUMBER);
|
||||
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
|
||||
|
@ -1985,7 +1985,7 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
|
|||
#endif // NDEBUG
|
||||
|
||||
fprintf(stream, "model_desc: %s\n", model_desc);
|
||||
fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
|
||||
fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", jarvis_n_vocab(jarvis_get_model(lctx)));
|
||||
|
||||
#ifdef __OPTIMIZE__
|
||||
fprintf(stream, "optimize: true\n");
|
||||
|
@ -2087,7 +2087,7 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
|
|||
fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
|
||||
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
|
||||
|
||||
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
|
||||
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + jarvis_max_devices());
|
||||
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
|
||||
|
||||
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
|
||||
|
|
138 common/common.h
|
@ -2,7 +2,7 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
#include "jarvis.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
@ -18,8 +18,8 @@
|
|||
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
|
||||
|
||||
#define print_build_info() do { \
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, JARVIS_BUILD_NUMBER, JARVIS_COMMIT); \
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, JARVIS_COMPILER, JARVIS_BUILD_TARGET); \
|
||||
} while(0)
|
||||
|
||||
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
|
||||
|
@ -30,14 +30,14 @@ struct common_lora_adapter_info {
|
|||
};
|
||||
|
||||
struct common_lora_adapter_container : common_lora_adapter_info {
|
||||
struct llama_lora_adapter * adapter;
|
||||
struct jarvis_lora_adapter * adapter;
|
||||
};
|
||||
|
||||
// build info
|
||||
extern int LLAMA_BUILD_NUMBER;
|
||||
extern char const * LLAMA_COMMIT;
|
||||
extern char const * LLAMA_COMPILER;
|
||||
extern char const * LLAMA_BUILD_TARGET;
|
||||
extern int JARVIS_BUILD_NUMBER;
|
||||
extern char const * JARVIS_COMMIT;
|
||||
extern char const * JARVIS_COMPILER;
|
||||
extern char const * JARVIS_BUILD_TARGET;
|
||||
|
||||
struct common_control_vector_load_info;
|
||||
|
||||
|
@ -61,25 +61,25 @@ int32_t cpu_get_num_math();
|
|||
// Common params
|
||||
//
|
||||
|
||||
enum llama_example {
|
||||
LLAMA_EXAMPLE_COMMON,
|
||||
LLAMA_EXAMPLE_SPECULATIVE,
|
||||
LLAMA_EXAMPLE_MAIN,
|
||||
LLAMA_EXAMPLE_INFILL,
|
||||
LLAMA_EXAMPLE_EMBEDDING,
|
||||
LLAMA_EXAMPLE_PERPLEXITY,
|
||||
LLAMA_EXAMPLE_RETRIEVAL,
|
||||
LLAMA_EXAMPLE_PASSKEY,
|
||||
LLAMA_EXAMPLE_IMATRIX,
|
||||
LLAMA_EXAMPLE_BENCH,
|
||||
LLAMA_EXAMPLE_SERVER,
|
||||
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
|
||||
LLAMA_EXAMPLE_EXPORT_LORA,
|
||||
LLAMA_EXAMPLE_LLAVA,
|
||||
LLAMA_EXAMPLE_LOOKUP,
|
||||
LLAMA_EXAMPLE_PARALLEL,
|
||||
enum jarvis_example {
|
||||
JARVIS_EXAMPLE_COMMON,
|
||||
JARVIS_EXAMPLE_SPECULATIVE,
|
||||
JARVIS_EXAMPLE_MAIN,
|
||||
JARVIS_EXAMPLE_INFILL,
|
||||
JARVIS_EXAMPLE_EMBEDDING,
|
||||
JARVIS_EXAMPLE_PERPLEXITY,
|
||||
JARVIS_EXAMPLE_RETRIEVAL,
|
||||
JARVIS_EXAMPLE_PASSKEY,
|
||||
JARVIS_EXAMPLE_IMATRIX,
|
||||
JARVIS_EXAMPLE_BENCH,
|
||||
JARVIS_EXAMPLE_SERVER,
|
||||
JARVIS_EXAMPLE_CVECTOR_GENERATOR,
|
||||
JARVIS_EXAMPLE_EXPORT_LORA,
|
||||
JARVIS_EXAMPLE_LLAVA,
|
||||
JARVIS_EXAMPLE_LOOKUP,
|
||||
JARVIS_EXAMPLE_PARALLEL,
|
||||
|
||||
LLAMA_EXAMPLE_COUNT,
|
||||
JARVIS_EXAMPLE_COUNT,
|
||||
};
|
||||
|
||||
enum common_sampler_type {
|
||||
|
@ -103,7 +103,7 @@ enum dimre_method {
|
|||
|
||||
// sampler parameters
|
||||
struct common_sampler_params {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
||||
uint32_t seed = JARVIS_DEFAULT_SEED; // the seed used to initialize jarvis_sampler
|
||||
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
|
@ -149,7 +149,7 @@ struct common_sampler_params {
|
|||
|
||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||
|
||||
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
||||
std::vector<jarvis_logit_bias> logit_bias; // logit biases to apply
|
||||
|
||||
// print the parameters into a string
|
||||
std::string print() const;
|
||||
|
@ -192,10 +192,10 @@ struct common_params {
|
|||
|
||||
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
|
||||
|
||||
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
||||
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
||||
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
|
||||
enum jarvis_split_mode split_mode = JARVIS_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||
enum jarvis_rope_scaling_type rope_scaling_type = JARVIS_ROPE_SCALING_TYPE_UNSPECIFIED;
|
||||
enum jarvis_pooling_type pooling_type = JARVIS_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
||||
enum jarvis_attention_type attention_type = JARVIS_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
|
||||
|
||||
struct common_sampler_params sparams;
|
||||
|
||||
|
@ -219,9 +219,9 @@ struct common_params {
|
|||
|
||||
std::vector<std::string> in_files; // all input files
|
||||
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
||||
std::vector<llama_model_kv_override> kv_overrides;
|
||||
std::vector<jarvis_model_kv_override> kv_overrides;
|
||||
|
||||
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
|
||||
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using jarvis_lora_adapter_apply)
|
||||
std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
|
||||
|
||||
std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
|
||||
|
@ -377,15 +377,15 @@ bool set_process_priority(enum ggml_sched_priority prio);
|
|||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
#define JARVIS_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
#else
|
||||
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
#define JARVIS_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
#endif
|
||||
#else
|
||||
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
|
||||
#define JARVIS_COMMON_ATTRIBUTE_FORMAT(...)
|
||||
#endif
|
||||
|
||||
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
|
||||
JARVIS_COMMON_ATTRIBUTE_FORMAT(1, 2)
|
||||
std::string string_format(const char * fmt, ...);
|
||||
|
||||
std::string string_strip(const std::string & str);
|
||||
|
@ -424,13 +424,13 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
|
|||
return parts;
|
||||
}
|
||||
|
||||
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
||||
bool string_parse_kv_override(const char * data, std::vector<jarvis_model_kv_override> & overrides);
|
||||
void string_process_escapes(std::string & input);
|
||||
|
||||
std::string string_from(bool value);
|
||||
std::string string_from(const std::vector<int> & values);
|
||||
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
|
||||
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
|
||||
std::string string_from(const struct jarvis_context * ctx, const std::vector<jarvis_token> & tokens);
|
||||
std::string string_from(const struct jarvis_context * ctx, const struct jarvis_batch & batch);
|
||||
|
||||
//
|
||||
// Filesystem utils
|
||||
|
@ -447,32 +447,32 @@ std::string fs_get_cache_file(const std::string & filename);
|
|||
//
|
||||
|
||||
struct common_init_result {
|
||||
struct llama_model * model = nullptr;
|
||||
struct llama_context * context = nullptr;
|
||||
struct jarvis_model * model = nullptr;
|
||||
struct jarvis_context * context = nullptr;
|
||||
std::vector<common_lora_adapter_container> lora_adapters;
|
||||
};
|
||||
|
||||
struct common_init_result common_init_from_params(common_params & params);
|
||||
|
||||
struct llama_model_params common_model_params_to_llama (const common_params & params);
|
||||
struct llama_context_params common_context_params_to_llama(const common_params & params);
|
||||
struct jarvis_model_params common_model_params_to_jarvis (const common_params & params);
|
||||
struct jarvis_context_params common_context_params_to_jarvis(const common_params & params);
|
||||
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
|
||||
|
||||
struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
||||
struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
||||
struct jarvis_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct jarvis_model_params & params);
|
||||
struct jarvis_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct jarvis_model_params & params);
|
||||
|
||||
// clear LoRA adapters from context, then apply new list of adapters
|
||||
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
|
||||
void common_lora_adapters_apply(struct jarvis_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
|
||||
|
||||
// Batch utils
|
||||
|
||||
void common_batch_clear(struct llama_batch & batch);
|
||||
void common_batch_clear(struct jarvis_batch & batch);
|
||||
|
||||
void common_batch_add(
|
||||
struct llama_batch & batch,
|
||||
llama_token id,
|
||||
llama_pos pos,
|
||||
const std::vector<llama_seq_id> & seq_ids,
|
||||
struct jarvis_batch & batch,
|
||||
jarvis_token id,
|
||||
jarvis_pos pos,
|
||||
const std::vector<jarvis_seq_id> & seq_ids,
|
||||
bool logits);
|
||||
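// A hedged sketch of how the two batch helpers above are typically combined:
// clear the batch, then append a prompt token by token, requesting logits only
// for the last position. The jarvis_batch itself is assumed to be created by the
// core jarvis batch API (the renamed counterpart of upstream llama_batch_init);
// the function name is illustrative and not part of this header.
static void example_fill_batch(struct jarvis_batch & batch, const std::vector<jarvis_token> & prompt) {
    common_batch_clear(batch);
    for (size_t i = 0; i < prompt.size(); ++i) {
        const bool need_logits = (i == prompt.size() - 1); // only the last token needs logits
        common_batch_add(batch, prompt[i], (jarvis_pos) i, { 0 }, need_logits);
    }
}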
|
||||
//
|
||||
|
@ -481,14 +481,14 @@ void common_batch_add(
|
|||
|
||||
// tokenizes a string into a vector of tokens
|
||||
// should work similarly to Python's `tokenizer.encode`
|
||||
std::vector<llama_token> common_tokenize(
|
||||
const struct llama_context * ctx,
|
||||
std::vector<jarvis_token> common_tokenize(
|
||||
const struct jarvis_context * ctx,
|
||||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special = false);
|
||||
|
||||
std::vector<llama_token> common_tokenize(
|
||||
const struct llama_model * model,
|
||||
std::vector<jarvis_token> common_tokenize(
|
||||
const struct jarvis_model * model,
|
||||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special = false);
|
||||
|
@ -496,23 +496,23 @@ std::vector<llama_token> common_tokenize(
|
|||
// converts a single token into a text piece, optionally rendering special/control tokens
|
||||
// should work similarly to Python's `tokenizer.id_to_piece`
|
||||
std::string common_token_to_piece(
|
||||
const struct llama_context * ctx,
|
||||
llama_token token,
|
||||
const struct jarvis_context * ctx,
|
||||
jarvis_token token,
|
||||
bool special = true);
|
||||
|
||||
// detokenizes a vector of tokens into a string
|
||||
// should work similarly to Python's `tokenizer.decode`
|
||||
// optionally renders special/control tokens
|
||||
std::string common_detokenize(
|
||||
llama_context * ctx,
|
||||
const std::vector<llama_token> & tokens,
|
||||
jarvis_context * ctx,
|
||||
const std::vector<jarvis_token> & tokens,
|
||||
bool special = true);
|
||||
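// A brief usage sketch for the helpers above (ctx is assumed to be an
// already-initialized jarvis_context; this is an illustrative fragment, not a
// complete program):
std::vector<jarvis_token> toks = common_tokenize(ctx, "Hello world", /*add_special*/ true);
for (jarvis_token t : toks) {
    printf("%d -> '%s'\n", t, common_token_to_piece(ctx, t).c_str());
}
std::string round_trip = common_detokenize(ctx, toks, /*special*/ false);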
|
||||
//
|
||||
// Chat template utils
|
||||
//
|
||||
|
||||
// same with llama_chat_message, but uses std::string
|
||||
// same with jarvis_chat_message, but uses std::string
|
||||
struct common_chat_msg {
|
||||
std::string role;
|
||||
std::string content;
|
||||
|
@ -521,23 +521,23 @@ struct common_chat_msg {
|
|||
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
||||
bool common_chat_verify_template(const std::string & tmpl);
|
||||
|
||||
// CPP wrapper for llama_chat_apply_template
|
||||
// CPP wrapper for jarvis_chat_apply_template
|
||||
// If the built-in template is not supported, we default to chatml
|
||||
// If the custom "tmpl" is not supported, we throw an error
|
||||
std::string common_chat_apply_template(const struct llama_model * model,
|
||||
std::string common_chat_apply_template(const struct jarvis_model * model,
|
||||
const std::string & tmpl,
|
||||
const std::vector<common_chat_msg> & chat,
|
||||
bool add_ass);
|
||||
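// A hedged usage sketch for the wrapper above (model is assumed to be a loaded
// jarvis_model; as in upstream usage, an empty tmpl is assumed to select the
// model's built-in template, with the chatml fallback described above):
std::vector<common_chat_msg> msgs = {
    {"system", "You are a helpful assistant"},
    {"user",   "Hello"},
};
std::string prompt = common_chat_apply_template(model, /*tmpl*/ "", msgs, /*add_ass*/ true);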
|
||||
// Format single message, while taking into account the position of that message in chat history
|
||||
std::string common_chat_format_single(const struct llama_model * model,
|
||||
std::string common_chat_format_single(const struct jarvis_model * model,
|
||||
const std::string & tmpl,
|
||||
const std::vector<common_chat_msg> & past_msg,
|
||||
const common_chat_msg & new_msg,
|
||||
bool add_ass);
|
||||
|
||||
// Returns an example of formatted chat
|
||||
std::string common_chat_format_example(const struct llama_model * model,
|
||||
std::string common_chat_format_example(const struct jarvis_model * model,
|
||||
const std::string & tmpl);
|
||||
|
||||
//
|
||||
|
@ -545,10 +545,10 @@ std::string common_chat_format_example(const struct llama_model * model,
|
|||
//
|
||||
|
||||
// Dump the KV cache view with the number of sequences per cell.
|
||||
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
|
||||
void common_kv_cache_dump_view(const jarvis_kv_cache_view & view, int row_size = 80);
|
||||
|
||||
// Dump the KV cache view showing individual sequences in each cell (long output).
|
||||
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
|
||||
void common_kv_cache_dump_view_seqs(const jarvis_kv_cache_view & view, int row_size = 40);
|
||||
|
||||
//
|
||||
// Embedding utils
|
||||
|
@ -596,5 +596,5 @@ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std
|
|||
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
|
||||
|
||||
void yaml_dump_non_result_info(
|
||||
FILE * stream, const common_params & params, const llama_context * lctx,
|
||||
FILE * stream, const common_params & params, const jarvis_context * lctx,
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
|
||||
|
|
|
@ -435,7 +435,7 @@ namespace console {
|
|||
fputc('\n', out);
|
||||
has_more = !has_more;
|
||||
} else {
|
||||
// llama will just eat the single space, it won't act as a space
|
||||
// jarvis will just eat the single space, it won't act as a space
|
||||
if (line.length() == 1 && line.back() == ' ') {
|
||||
line.clear();
|
||||
pop_cursor();
|
||||
|
|
|
@ -5336,7 +5336,7 @@ template<typename IteratorType> class iteration_proxy
|
|||
};
|
||||
|
||||
// Structured Bindings Support
|
||||
// For further reference see https://blog.tartanllama.xyz/structured-bindings/
|
||||
// For further reference see https://blog.tartanjarvis.xyz/structured-bindings/
|
||||
// And see https://github.com/nlohmann/json/pull/1391
|
||||
template<std::size_t N, typename IteratorType, enable_if_t<N == 0, int> = 0>
|
||||
auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decltype(i.key())
|
||||
|
@ -5344,7 +5344,7 @@ auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decl
|
|||
return i.key();
|
||||
}
|
||||
// Structured Bindings Support
|
||||
// For further reference see https://blog.tartanllama.xyz/structured-bindings/
|
||||
// For further reference see https://blog.tartanjarvis.xyz/structured-bindings/
|
||||
// And see https://github.com/nlohmann/json/pull/1391
|
||||
template<std::size_t N, typename IteratorType, enable_if_t<N == 1, int> = 0>
|
||||
auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decltype(i.value())
|
||||
|
@ -5357,7 +5357,7 @@ NLOHMANN_JSON_NAMESPACE_END
|
|||
|
||||
// The Addition to the STD Namespace is required to add
|
||||
// Structured Bindings Support to the iteration_proxy_value class
|
||||
// For further reference see https://blog.tartanllama.xyz/structured-bindings/
|
||||
// For further reference see https://blog.tartanjarvis.xyz/structured-bindings/
|
||||
// And see https://github.com/nlohmann/json/pull/1391
|
||||
namespace std
|
||||
{
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
|
||||
int common_log_verbosity_thold = LOG_DEFAULT_JARVIS;
|
||||
|
||||
void common_log_set_verbosity_thold(int verbosity) {
|
||||
common_log_verbosity_thold = verbosity;
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
#endif
|
||||
|
||||
#define LOG_DEFAULT_DEBUG 1
|
||||
#define LOG_DEFAULT_LLAMA 0
|
||||
#define LOG_DEFAULT_JARVIS 0
|
||||
|
||||
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
|
||||
// set via common_log_set_verbosity()
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
#include <thread>
|
||||
|
||||
void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
|
||||
std::vector<llama_token> & inp, int nnew, bool print_progress) {
|
||||
std::vector<jarvis_token> & inp, int nnew, bool print_progress) {
|
||||
const int64_t t_start_ms = ggml_time_ms();
|
||||
const int64_t inp_size = inp.size();
|
||||
|
||||
|
@ -21,7 +21,7 @@ void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min,
|
|||
for (int64_t i = i_start; i < inp_size; ++i) {
|
||||
const int64_t ngram_start = i - ngram_size;
|
||||
common_ngram ngram(&inp[ngram_start], ngram_size);
|
||||
const llama_token token = inp[i];
|
||||
const jarvis_token token = inp[i];
|
||||
|
||||
common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
|
||||
if (part_it == ngram_cache.end()) {
|
||||
|
@ -51,18 +51,18 @@ void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min,
|
|||
}
|
||||
|
||||
// Helper function to get a token from the combined, speculative sequence of inp and draft.
|
||||
static llama_token get_token(const std::vector<llama_token> & inp, const std::vector<llama_token> & draft, const size_t i) {
|
||||
static jarvis_token get_token(const std::vector<jarvis_token> & inp, const std::vector<jarvis_token> & draft, const size_t i) {
|
||||
return i < inp.size() ? inp[i] : draft[1 + i - inp.size()];
|
||||
}
|
||||
|
||||
// If sample size or percentage are below these thresholds the draft is aborted early:
|
||||
constexpr int draft_min_sample_size_lax[LLAMA_NGRAM_MAX] = { 2, 2, 1, 1};
|
||||
constexpr int draft_min_percent_lax[LLAMA_NGRAM_MAX] = {66, 50, 50, 50};
|
||||
constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2};
|
||||
constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
|
||||
constexpr int draft_min_sample_size_lax[JARVIS_NGRAM_MAX] = { 2, 2, 1, 1};
|
||||
constexpr int draft_min_percent_lax[JARVIS_NGRAM_MAX] = {66, 50, 50, 50};
|
||||
constexpr int draft_min_sample_size_strict[JARVIS_NGRAM_MAX] = { 4, 3, 2, 2};
|
||||
constexpr int draft_min_percent_strict[JARVIS_NGRAM_MAX] = {75, 66, 66, 66};
|
||||
|
||||
// Helper function that tries to draft a token from only the static ngram cache:
|
||||
static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
|
||||
static jarvis_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
|
||||
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
|
||||
if (part_static_it == nc_static.end()) {
|
||||
return -1;
|
||||
|
@ -71,10 +71,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
|
|||
|
||||
int max_count_static = 0;
|
||||
int sum_count_static = 0;
|
||||
llama_token max_token = -1;
|
||||
jarvis_token max_token = -1;
|
||||
|
||||
for (std::pair<llama_token, int> token_count_static : part_static) {
|
||||
const llama_token token = token_count_static.first;
|
||||
for (std::pair<jarvis_token, int> token_count_static : part_static) {
|
||||
const jarvis_token token = token_count_static.first;
|
||||
const int32_t count_static = token_count_static.second;
|
||||
|
||||
if (count_static > max_count_static) {
|
||||
|
@ -84,21 +84,21 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
|
|||
sum_count_static += count_static;
|
||||
}
|
||||
|
||||
if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
|
||||
if (sum_count_static < draft_min_sample_size_lax[JARVIS_NGRAM_STATIC-1]) {
|
||||
return -1;
|
||||
}
|
||||
if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
|
||||
if (100*max_count_static < draft_min_percent_lax[JARVIS_NGRAM_STATIC-1]*sum_count_static) {
|
||||
return -1;
|
||||
}
|
||||
return max_token;
|
||||
}
|
||||
|
||||
// Try to draft a token from primary cache (context/dynamic), validate with static cache:
|
||||
static llama_token try_draft(
|
||||
static jarvis_token try_draft(
|
||||
common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
|
||||
const int * min_sample_size, const int * min_percent) {
|
||||
|
||||
llama_token drafted_token = -1;
|
||||
jarvis_token drafted_token = -1;
|
||||
|
||||
for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
|
||||
const common_ngram ngram_primary = ngrams_primary[i];
|
||||
|
@ -112,10 +112,10 @@ static llama_token try_draft(
|
|||
int max_count_primary = 0;
|
||||
int max_count_static = 0;
|
||||
int sum_count_primary = 0;
|
||||
llama_token max_token = -1;
|
||||
jarvis_token max_token = -1;
|
||||
|
||||
for (std::pair<llama_token, int> token_count_primary : part_primary) {
|
||||
const llama_token token = token_count_primary.first;
|
||||
for (std::pair<jarvis_token, int> token_count_primary : part_primary) {
|
||||
const jarvis_token token = token_count_primary.first;
|
||||
|
||||
common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
|
||||
|
||||
|
@ -143,22 +143,22 @@ static llama_token try_draft(
|
|||
}
|
||||
|
||||
void common_ngram_cache_draft(
|
||||
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
|
||||
std::vector<jarvis_token> & inp, std::vector<jarvis_token> & draft, int n_draft, int ngram_min, int ngram_max,
|
||||
common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
|
||||
) {
|
||||
GGML_ASSERT(draft.size() == 1);
|
||||
const int inp_size = inp.size();
|
||||
|
||||
if (inp_size < LLAMA_NGRAM_STATIC) {
|
||||
if (inp_size < JARVIS_NGRAM_STATIC) {
|
||||
return;
|
||||
}
|
||||
|
||||
while ((int) draft.size()-1 < n_draft) {
|
||||
llama_token drafted_token = -1;
|
||||
jarvis_token drafted_token = -1;
|
||||
|
||||
const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
|
||||
const int ngram_start_static = inp_size-JARVIS_NGRAM_STATIC + draft.size()-1;
|
||||
common_ngram ngram_static;
|
||||
for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
|
||||
for (int j = ngram_start_static; j < ngram_start_static + JARVIS_NGRAM_STATIC; ++j) {
|
||||
ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
|
||||
}
|
||||
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
|
||||
|
@ -207,12 +207,12 @@ void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & fil
|
|||
|
||||
file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(common_ngram));
|
||||
file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
|
||||
for (std::pair<llama_token, int32_t> item2 : token_counts) {
|
||||
const llama_token token = item2.first;
|
||||
for (std::pair<jarvis_token, int32_t> item2 : token_counts) {
|
||||
const jarvis_token token = item2.first;
|
||||
const int32_t count = item2.second;
|
||||
GGML_ASSERT(count > 0);
|
||||
|
||||
file_out.write(reinterpret_cast<const char *>(&token), sizeof(llama_token));
|
||||
file_out.write(reinterpret_cast<const char *>(&token), sizeof(jarvis_token));
|
||||
file_out.write(reinterpret_cast<const char *>(&count), sizeof(int32_t));
|
||||
}
|
||||
}
|
||||
|
@ -228,7 +228,7 @@ common_ngram_cache common_ngram_cache_load(std::string & filename) {
|
|||
|
||||
common_ngram ngram;
|
||||
int32_t ntokens;
|
||||
llama_token token;
|
||||
jarvis_token token;
|
||||
int32_t count;
|
||||
|
||||
char * ngramc = reinterpret_cast<char*>(&ngram);
|
||||
|
@ -243,7 +243,7 @@ common_ngram_cache common_ngram_cache_load(std::string & filename) {
|
|||
|
||||
for (int i = 0; i < ntokens; ++i) {
|
||||
GGML_ASSERT(!hashmap_file.eof());
|
||||
GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token)));
|
||||
GGML_ASSERT(hashmap_file.read(tokenc, sizeof(jarvis_token)));
|
||||
GGML_ASSERT(!hashmap_file.eof());
|
||||
GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t)));
|
||||
GGML_ASSERT(count > 0);
|
||||
|
@ -268,8 +268,8 @@ void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ng
|
|||
continue;
|
||||
}
|
||||
|
||||
for (std::pair<llama_token, int32_t> token_count : part) {
|
||||
const llama_token token = token_count.first;
|
||||
for (std::pair<jarvis_token, int32_t> token_count : part) {
|
||||
const jarvis_token token = token_count.first;
|
||||
const int32_t count = token_count.second;
|
||||
GGML_ASSERT(count > 0);
|
||||
|
||||
|
|
|
@ -1,34 +1,34 @@
|
|||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
#include "jarvis.h"
|
||||
|
||||
#include <unordered_map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#define LLAMA_NGRAM_MIN 1
|
||||
#define LLAMA_NGRAM_MAX 4
|
||||
#define LLAMA_NGRAM_STATIC 2
|
||||
#define JARVIS_NGRAM_MIN 1
|
||||
#define JARVIS_NGRAM_MAX 4
|
||||
#define JARVIS_NGRAM_STATIC 2
|
||||
|
||||
// Data structures to map n-grams to empirical token probabilities:
|
||||
|
||||
struct common_ngram {
|
||||
llama_token tokens[LLAMA_NGRAM_MAX];
|
||||
jarvis_token tokens[JARVIS_NGRAM_MAX];
|
||||
|
||||
common_ngram() {
|
||||
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
||||
for (int i = 0; i < JARVIS_NGRAM_MAX; ++i) {
|
||||
tokens[i] = -1;
|
||||
}
|
||||
}
|
||||
|
||||
common_ngram(const llama_token * input, const int ngram_size) {
|
||||
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
||||
common_ngram(const jarvis_token * input, const int ngram_size) {
|
||||
for (int i = 0; i < JARVIS_NGRAM_MAX; ++i) {
|
||||
tokens[i] = i < ngram_size ? input[i] : -1;
|
||||
}
|
||||
}
|
||||
|
||||
bool operator==(const common_ngram & other) const {
|
||||
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
||||
for (int i = 0; i < JARVIS_NGRAM_MAX; ++i) {
|
||||
if (tokens[i] != other.tokens[i]) {
|
||||
return false;
|
||||
}
|
||||
|
@ -38,7 +38,7 @@ struct common_ngram {
|
|||
};
|
||||
|
||||
struct common_token_hash_function {
|
||||
size_t operator()(const llama_token token) const {
|
||||
size_t operator()(const jarvis_token token) const {
|
||||
// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
|
||||
return token * 11400714819323198485llu;
|
||||
}
|
||||
|
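// Note on the multiplier used above: 11400714819323198485 is the 64-bit
// Fibonacci hashing constant, i.e. roughly 2^64 / phi (the golden ratio,
// ~1.6180339887), which spreads consecutive token ids evenly across the hash
// space; see the linked article for the derivation.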
@ -47,7 +47,7 @@ struct common_token_hash_function {
|
|||
struct common_ngram_hash_function {
|
||||
size_t operator()(const common_ngram & ngram) const {
|
||||
size_t hash = common_token_hash_function{}(ngram.tokens[0]);
|
||||
for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
|
||||
for (int i = 1; i < JARVIS_NGRAM_MAX; ++i) {
|
||||
hash ^= common_token_hash_function{}(ngram.tokens[i]);
|
||||
}
|
||||
return hash;
|
||||
|
@ -55,7 +55,7 @@ struct common_ngram_hash_function {
|
|||
};
|
||||
|
||||
// token -> number of times token has been seen
|
||||
typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;
|
||||
typedef std::unordered_map<jarvis_token, int32_t> common_ngram_cache_part;
|
||||
|
||||
// n-gram -> empirical distribution of following tokens
|
||||
typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;
|
||||
|
@ -71,7 +71,7 @@ typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_h
|
|||
// In order to get correct results inp_data can ONLY BE APPENDED TO.
|
||||
// Changes in the middle need a complete rebuild.
|
||||
void common_ngram_cache_update(
|
||||
common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
|
||||
common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<jarvis_token> & inp_data, int nnew, bool print_progress);
|
||||
|
||||
// Try to draft tokens from ngram caches.
|
||||
// inp: the tokens generated so far.
|
||||
|
@ -82,7 +82,7 @@ void common_ngram_cache_update(
|
|||
// nc_dynamic: ngram cache based on previous user generations.
|
||||
// nc_static: ngram cache generated from a large text corpus, used for validation.
|
||||
void common_ngram_cache_draft(
|
||||
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
|
||||
std::vector<jarvis_token> & inp, std::vector<jarvis_token> & draft, int n_draft, int ngram_min, int ngram_max,
|
||||
common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);
|
||||
|
||||
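// A hedged end-to-end sketch of the two entry points above; the function name is
// illustrative and not part of this header. nc_dynamic/nc_static are assumed to
// be caches obtained elsewhere (e.g. via common_ngram_cache_load), and draft is
// seeded with exactly one token, matching the GGML_ASSERT in
// common_ngram_cache_draft.
static void example_ngram_draft(std::vector<jarvis_token> & inp,
                                common_ngram_cache & nc_dynamic,
                                common_ngram_cache & nc_static) {
    common_ngram_cache nc_context;
    common_ngram_cache_update(nc_context, JARVIS_NGRAM_MIN, JARVIS_NGRAM_MAX, inp, (int) inp.size(), /*print_progress*/ false);

    std::vector<jarvis_token> draft = { inp.back() };
    common_ngram_cache_draft(inp, draft, /*n_draft*/ 8, JARVIS_NGRAM_MIN, JARVIS_NGRAM_MAX, nc_context, nc_dynamic, nc_static);
}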
// Save an ngram cache to a file.
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
#include <unordered_map>
|
||||
|
||||
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
||||
// TODO: deduplicate with llama-impl.h
|
||||
// TODO: deduplicate with jarvis-impl.h
|
||||
template<typename T>
|
||||
struct ring_buffer {
|
||||
ring_buffer(size_t cap) : capacity(cap), data(cap) {}
|
||||
|
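// A brief usage sketch, assuming the interface exercised elsewhere in this file
// (push_back appends, rat(0) returns the most recently pushed element); token_id
// is a placeholder for some previously sampled token:
ring_buffer<jarvis_token> recent(64); // keep the last 64 tokens
recent.push_back(token_id);
jarvis_token last = recent.rat(0);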
@ -101,24 +101,24 @@ struct ring_buffer {
|
|||
struct common_sampler {
|
||||
common_sampler_params params;
|
||||
|
||||
struct llama_sampler * grmr;
|
||||
struct llama_sampler * chain;
|
||||
struct jarvis_sampler * grmr;
|
||||
struct jarvis_sampler * chain;
|
||||
|
||||
ring_buffer<llama_token> prev;
|
||||
ring_buffer<jarvis_token> prev;
|
||||
|
||||
std::vector<llama_token_data> cur;
|
||||
std::vector<jarvis_token_data> cur;
|
||||
|
||||
llama_token_data_array cur_p;
|
||||
jarvis_token_data_array cur_p;
|
||||
|
||||
void set_logits(struct llama_context * ctx, int idx) {
|
||||
const auto * logits = llama_get_logits_ith(ctx, idx);
|
||||
void set_logits(struct jarvis_context * ctx, int idx) {
|
||||
const auto * logits = jarvis_get_logits_ith(ctx, idx);
|
||||
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
const int n_vocab = jarvis_n_vocab(jarvis_get_model(ctx));
|
||||
|
||||
cur.resize(n_vocab);
|
||||
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
|
||||
for (jarvis_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
cur[token_id] = jarvis_token_data{token_id, logits[token_id], 0.0f};
|
||||
}
|
||||
|
||||
cur_p = { cur.data(), cur.size(), -1, false };
|
||||
|
@ -141,31 +141,31 @@ std::string common_sampler_params::print() const {
|
|||
return std::string(result);
|
||||
}
|
||||
|
||||
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
|
||||
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
|
||||
struct common_sampler * common_sampler_init(const struct jarvis_model * model, const struct common_sampler_params & params) {
|
||||
jarvis_sampler_chain_params lparams = jarvis_sampler_chain_default_params();
|
||||
|
||||
lparams.no_perf = params.no_perf;
|
||||
|
||||
auto * result = new common_sampler {
|
||||
/* .params = */ params,
|
||||
/* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
|
||||
/* .chain = */ llama_sampler_chain_init(lparams),
|
||||
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
|
||||
/* .grmr = */ jarvis_sampler_init_grammar(model, params.grammar.c_str(), "root"),
|
||||
/* .chain = */ jarvis_sampler_chain_init(lparams),
|
||||
/* .prev = */ ring_buffer<jarvis_token>(std::max(32, params.n_prev)),
|
||||
/* .cur = */ {},
|
||||
/* .cur_p = */ {},
|
||||
};
|
||||
|
||||
llama_sampler_chain_add(result->chain,
|
||||
llama_sampler_init_logit_bias(
|
||||
llama_n_vocab(model),
|
||||
jarvis_sampler_chain_add(result->chain,
|
||||
jarvis_sampler_init_logit_bias(
|
||||
jarvis_n_vocab(model),
|
||||
params.logit_bias.size(),
|
||||
params.logit_bias.data()));
|
||||
|
||||
llama_sampler_chain_add(result->chain,
|
||||
llama_sampler_init_penalties(
|
||||
llama_n_vocab (model),
|
||||
llama_token_eos(model),
|
||||
llama_token_nl (model),
|
||||
jarvis_sampler_chain_add(result->chain,
|
||||
jarvis_sampler_init_penalties(
|
||||
jarvis_n_vocab (model),
|
||||
jarvis_token_eos(model),
|
||||
jarvis_token_nl (model),
|
||||
params.penalty_last_n,
|
||||
params.penalty_repeat,
|
||||
params.penalty_freq,
|
||||
|
@ -184,44 +184,44 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
|||
c_breakers.push_back(str.c_str());
|
||||
}
|
||||
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
}
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_K:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_top_k (params.top_k));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_top_p (params.top_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_MIN_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_XTC:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TFS_Z:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_tail_free(params.tfs_z, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_typical (params.typ_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_INFILL:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_infill (model));
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown sampler type");
|
||||
}
|
||||
}
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_dist(params.seed));
|
||||
} else if (params.mirostat == 1) {
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_temp(params.temp));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_mirostat(jarvis_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
|
||||
} else if (params.mirostat == 2) {
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_temp(params.temp));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
|
||||
} else {
|
||||
GGML_ASSERT(false && "unknown mirostat version");
|
||||
}
|
||||
|
@ -231,53 +231,53 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
|||
|
||||
void common_sampler_free(struct common_sampler * gsmpl) {
|
||||
if (gsmpl) {
|
||||
llama_sampler_free(gsmpl->grmr);
|
||||
jarvis_sampler_free(gsmpl->grmr);
|
||||
|
||||
llama_sampler_free(gsmpl->chain);
|
||||
jarvis_sampler_free(gsmpl->chain);
|
||||
|
||||
delete gsmpl;
|
||||
}
|
||||
}
|
||||
|
||||
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
|
||||
void common_sampler_accept(struct common_sampler * gsmpl, jarvis_token token, bool accept_grammar) {
|
||||
if (accept_grammar) {
|
||||
llama_sampler_accept(gsmpl->grmr, token);
|
||||
jarvis_sampler_accept(gsmpl->grmr, token);
|
||||
}
|
||||
|
||||
llama_sampler_accept(gsmpl->chain, token);
|
||||
jarvis_sampler_accept(gsmpl->chain, token);
|
||||
|
||||
gsmpl->prev.push_back(token);
|
||||
}
|
||||
|
||||
void common_sampler_reset(struct common_sampler * gsmpl) {
|
||||
llama_sampler_reset(gsmpl->grmr);
|
||||
jarvis_sampler_reset(gsmpl->grmr);
|
||||
|
||||
llama_sampler_reset(gsmpl->chain);
|
||||
jarvis_sampler_reset(gsmpl->chain);
|
||||
}
|
||||
|
||||
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
|
||||
return new common_sampler {
|
||||
/* .params = */ gsmpl->params,
|
||||
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
|
||||
/* .chain = */ llama_sampler_clone(gsmpl->chain),
|
||||
/* .grmr = */ jarvis_sampler_clone(gsmpl->grmr),
|
||||
/* .chain = */ jarvis_sampler_clone(gsmpl->chain),
|
||||
/* .prev = */ gsmpl->prev,
|
||||
/* .cur = */ gsmpl->cur,
|
||||
/* .cur_p = */ gsmpl->cur_p,
|
||||
};
|
||||
}
|
||||
|
||||
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
|
||||
void common_perf_print(const struct jarvis_context * ctx, const struct common_sampler * gsmpl) {
|
||||
// TODO: measure grammar performance
|
||||
|
||||
if (gsmpl) {
|
||||
llama_perf_sampler_print(gsmpl->chain);
|
||||
jarvis_perf_sampler_print(gsmpl->chain);
|
||||
}
|
||||
if (ctx) {
|
||||
llama_perf_context_print(ctx);
|
||||
jarvis_perf_context_print(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
|
||||
jarvis_token common_sampler_sample(struct common_sampler * gsmpl, struct jarvis_context * ctx, int idx, bool grammar_first) {
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
auto & grmr = gsmpl->grmr;
|
||||
|
@ -285,14 +285,14 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
|||
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
|
||||
|
||||
if (grammar_first) {
|
||||
llama_sampler_apply(grmr, &cur_p);
|
||||
jarvis_sampler_apply(grmr, &cur_p);
|
||||
}
|
||||
|
||||
llama_sampler_apply(chain, &cur_p);
|
||||
jarvis_sampler_apply(chain, &cur_p);
|
||||
|
||||
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
|
||||
|
||||
const llama_token id = cur_p.data[cur_p.selected].id;
|
||||
const jarvis_token id = cur_p.data[cur_p.selected].id;
|
||||
|
||||
if (grammar_first) {
|
||||
return id;
|
||||
|
@ -300,10 +300,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
|||
|
||||
// check if the sampled token fits the grammar
|
||||
{
|
||||
llama_token_data single_token_data = { id, 1.0f, 0.0f };
|
||||
llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
|
||||
jarvis_token_data single_token_data = { id, 1.0f, 0.0f };
|
||||
jarvis_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
|
||||
|
||||
llama_sampler_apply(grmr, &single_token_data_array);
|
||||
jarvis_sampler_apply(grmr, &single_token_data_array);
|
||||
|
||||
const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
|
||||
if (is_valid) {
|
||||
|
@ -315,8 +315,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
|||
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
llama_sampler_apply(grmr, &cur_p);
|
||||
llama_sampler_apply(chain, &cur_p);
|
||||
jarvis_sampler_apply(grmr, &cur_p);
|
||||
jarvis_sampler_apply(chain, &cur_p);
|
||||
|
||||
GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
|
||||
|
||||
|
@ -324,31 +324,31 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
|||
}
|
||||
|
||||
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
|
||||
return llama_sampler_get_seed(gsmpl->chain);
|
||||
return jarvis_sampler_get_seed(gsmpl->chain);
|
||||
}
|
||||
|
||||
// helpers
|
||||
|
||||
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
|
||||
jarvis_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
|
||||
return &gsmpl->cur_p;
|
||||
}
|
||||
|
||||
llama_token common_sampler_last(const struct common_sampler * gsmpl) {
|
||||
jarvis_token common_sampler_last(const struct common_sampler * gsmpl) {
|
||||
return gsmpl->prev.rat(0);
|
||||
}
|
||||
|
||||
std::string common_sampler_print(const struct common_sampler * gsmpl) {
|
||||
std::string result = "logits ";
|
||||
|
||||
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
|
||||
const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
|
||||
result += std::string("-> ") + llama_sampler_name(smpl) + " ";
|
||||
for (int i = 0; i < jarvis_sampler_chain_n(gsmpl->chain); i++) {
|
||||
const auto * smpl = jarvis_sampler_chain_get(gsmpl->chain, i);
|
||||
result += std::string("-> ") + jarvis_sampler_name(smpl) + " ";
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
|
||||
std::string common_sampler_prev_str(common_sampler * gsmpl, jarvis_context * ctx_main, int n) {
|
||||
n = std::min(n, (int) gsmpl->prev.size());
|
||||
|
||||
if (n <= 0) {
|
||||
|
@ -359,9 +359,9 @@ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_
|
|||
result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab
|
||||
|
||||
for (int i = n - 1; i >= 0; i--) {
|
||||
const llama_token id = gsmpl->prev.rat(i);
|
||||
const jarvis_token id = gsmpl->prev.rat(i);
|
||||
|
||||
GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
|
||||
GGML_ASSERT(id != JARVIS_TOKEN_NULL && "null token in the sampling history - should not happen");
|
||||
|
||||
result += common_token_to_piece(ctx_main, id);
|
||||
}
|
||||
|
|
|
@ -1,13 +1,13 @@
#pragma once

#include "llama.h"
#include "jarvis.h"

#include "common.h"

#include <string>
#include <vector>

// common_sampler extends llama_sampler with additional functionality:
// common_sampler extends jarvis_sampler with additional functionality:
//
// - grammar support
// - custom sampler logic based on the parameters

@ -24,7 +24,7 @@
// grammar constraints are applied to the full vocabulary and the token is resampled.
//
// The common_sampler also maintains a container with the last accepted tokens. In the future, this can
// be moved into the core llama library.
// be moved into the core jarvis library.
//
// For convenience, the common_sampler also maintains a container with the current candidate tokens.
// This can be used to access the probabilities of the rest of the non-sampled tokens.

@ -34,19 +34,19 @@

struct common_sampler;

// llama_sampler API overloads
// jarvis_sampler API overloads

struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
struct common_sampler * common_sampler_init(const struct jarvis_model * model, const struct common_sampler_params & params);

void common_sampler_free(struct common_sampler * gsmpl);

// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
void common_sampler_accept(struct common_sampler * gsmpl, jarvis_token token, bool accept_grammar);
void common_sampler_reset (struct common_sampler * gsmpl);
struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);

// arguments can be nullptr to skip printing
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
void common_perf_print(const struct jarvis_context * ctx, const struct common_sampler * gsmpl);

// extended sampling implementation:
//

@ -58,23 +58,23 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
jarvis_token common_sampler_sample(struct common_sampler * gsmpl, struct jarvis_context * ctx, int idx, bool grammar_first = false);

uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);

// helpers

// access the internal list of current candidate tokens
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
jarvis_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);

// get the last accepted token
llama_token common_sampler_last(const struct common_sampler * gsmpl);
jarvis_token common_sampler_last(const struct common_sampler * gsmpl);

// print the sampler chain into a string
std::string common_sampler_print(const struct common_sampler * gsmpl);

// get a string representation of the last accepted tokens
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
std::string common_sampler_prev_str(common_sampler * gsmpl, jarvis_context * ctx, int n);

char common_sampler_type_to_chr(enum common_sampler_type cnstr);
std::string common_sampler_type_to_str(enum common_sampler_type cnstr);

@ -34,7 +34,7 @@ struct train_state * init_train_state() {
|
|||
state->opt = new struct ggml_opt_context;
|
||||
state->opt->ctx = NULL;
|
||||
state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
|
||||
state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
|
||||
state->opt->params.graph_size = JARVIS_TRAIN_MAX_NODES;
|
||||
state->opt->loss_after = 0.0f;
|
||||
|
||||
return state;
|
||||
|
@ -213,7 +213,7 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
|
|||
}
|
||||
|
||||
int64_t get_example_targets_batch(
|
||||
struct llama_context * lctx,
|
||||
struct jarvis_context * lctx,
|
||||
struct ggml_tensor * tokens_input,
|
||||
struct ggml_tensor * target_probs,
|
||||
int64_t example_id,
|
||||
|
@ -221,7 +221,7 @@ int64_t get_example_targets_batch(
|
|||
const size_t * samples_begin,
|
||||
const size_t * samples_size,
|
||||
size_t samples_count,
|
||||
const llama_token * train_data,
|
||||
const jarvis_token * train_data,
|
||||
size_t n_train_data,
|
||||
bool separate_with_eos,
|
||||
bool separate_with_bos,
|
||||
|
@ -241,8 +241,8 @@ int64_t get_example_targets_batch(
|
|||
int64_t used_samples = 0;
|
||||
|
||||
ggml_set_f32(target_probs, 0.0f);
|
||||
llama_token bos = llama_token_bos(llama_get_model(lctx));
|
||||
llama_token eos = llama_token_eos(llama_get_model(lctx));
|
||||
jarvis_token bos = jarvis_token_bos(jarvis_get_model(lctx));
|
||||
jarvis_token eos = jarvis_token_eos(jarvis_get_model(lctx));
|
||||
// printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
|
||||
for (int k=0; k<n_batch; ++k) {
|
||||
// printf("%s: batch %d\n", __func__, k);
|
||||
|
@ -259,7 +259,7 @@ int64_t get_example_targets_batch(
|
|||
bool sample_separation_eos = !separate_with_eos;
|
||||
bool sample_separation_bos = !separate_with_bos;
|
||||
for (int64_t i=0; i<n_tokens; ++i) {
|
||||
llama_token token = eos;
|
||||
jarvis_token token = eos;
|
||||
if (sample_offs >= sample_size && fill_with_next_samples) {
|
||||
if (!sample_separation_eos) {
|
||||
// insert eos token to separate samples
|
||||
|
@ -281,7 +281,7 @@ int64_t get_example_targets_batch(
|
|||
}
|
||||
// note: no else-if here
|
||||
if (sample_offs < sample_size) {
|
||||
token = clamp(train_data[sample_begin+sample_offs], 0, (llama_token) (n_vocab - 1));
|
||||
token = clamp(train_data[sample_begin+sample_offs], 0, (jarvis_token) (n_vocab - 1));
|
||||
++sample_offs;
|
||||
}
|
||||
ggml_set_f32_nd(target_probs, token, (int) i, (int) k, 0, +1.0f);
|
||||
|
@ -712,12 +712,12 @@ void save_train_state_gguf(struct gguf_context * fctx, struct train_state * trai
|
|||
}
|
||||
|
||||
|
||||
struct llama_file {
|
||||
struct jarvis_file {
|
||||
// use FILE * so we don't have to re-open the file to mmap
|
||||
FILE * fp;
|
||||
size_t size;
|
||||
|
||||
llama_file(const char * fname, const char * mode) {
|
||||
jarvis_file(const char * fname, const char * mode) {
|
||||
fp = std::fopen(fname, mode);
|
||||
if (fp == NULL) {
|
||||
size = 0;
|
||||
|
@ -788,7 +788,7 @@ struct llama_file {
|
|||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
~llama_file() {
|
||||
~jarvis_file() {
|
||||
if (fp) {
|
||||
std::fclose(fp);
|
||||
}
|
||||
|
@ -823,16 +823,16 @@ static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nu
|
|||
}
|
||||
|
||||
size_t tokenize_file(
|
||||
struct llama_context * lctx,
|
||||
struct jarvis_context * lctx,
|
||||
const char * filename,
|
||||
const std::string & sample_start,
|
||||
bool include_sample_start,
|
||||
bool overlapping_samples,
|
||||
unsigned context_length,
|
||||
std::vector<llama_token> & out_tokens,
|
||||
std::vector<jarvis_token> & out_tokens,
|
||||
std::vector<size_t> & out_samples_begin,
|
||||
std::vector<size_t> & out_samples_size) {
|
||||
struct llama_file f(filename, "rb");
|
||||
struct jarvis_file f(filename, "rb");
|
||||
|
||||
if (f.size == 0) {
|
||||
out_tokens.clear();
|
||||
|
@ -844,7 +844,7 @@ size_t tokenize_file(
|
|||
}
|
||||
|
||||
// account for possible leading whitespace that will be added by tokenizer
|
||||
// e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
|
||||
// e.g. '\t' will be tokenized by jarvis spm tokenizer to [29871, 12]
|
||||
const int n_max_tokens_overhead = 1;
|
||||
|
||||
std::vector<char> buf;
|
||||
|
@ -862,8 +862,8 @@ size_t tokenize_file(
|
|||
// tokenize all data at once
|
||||
out_tokens.resize(buf.size() + n_max_tokens_overhead);
|
||||
|
||||
int n_tokens = llama_tokenize(
|
||||
llama_get_model(lctx),
|
||||
int n_tokens = jarvis_tokenize(
|
||||
jarvis_get_model(lctx),
|
||||
buf.data(),
|
||||
(int) buf.size(),
|
||||
out_tokens.data(),
|
||||
|
@ -871,8 +871,8 @@ size_t tokenize_file(
|
|||
false, false);
|
||||
if (n_tokens < 0) {
|
||||
out_tokens.resize(-n_tokens);
|
||||
n_tokens = llama_tokenize(
|
||||
llama_get_model(lctx),
|
||||
n_tokens = jarvis_tokenize(
|
||||
jarvis_get_model(lctx),
|
||||
buf.data(),
|
||||
(int) buf.size(),
|
||||
out_tokens.data(),
|
||||
|
@ -915,7 +915,7 @@ size_t tokenize_file(
|
|||
out_samples_size.resize(out_samples_begin.size(), 0);
|
||||
|
||||
std::vector<char> buf_sample;
|
||||
std::vector<llama_token> tok_sample;
|
||||
std::vector<jarvis_token> tok_sample;
|
||||
|
||||
const size_t sample_begin_offset = (include_sample_start ? 0 : sample_start.size());
|
||||
size_t found_too_big_sample = 0;
|
||||
|
@ -925,11 +925,11 @@ size_t tokenize_file(
|
|||
size_t found_max_sample_size = 0;
|
||||
|
||||
size_t max_token_text_size = 0;
|
||||
int n_vocab = llama_n_vocab(llama_get_model(lctx));
|
||||
for (llama_token token=0; token < n_vocab; ++token) {
|
||||
int n_vocab = jarvis_n_vocab(jarvis_get_model(lctx));
|
||||
for (jarvis_token token=0; token < n_vocab; ++token) {
|
||||
max_token_text_size = std::max(
|
||||
max_token_text_size,
|
||||
strlen(llama_token_get_text(llama_get_model(lctx), token)));
|
||||
strlen(jarvis_token_get_text(jarvis_get_model(lctx), token)));
|
||||
}
|
||||
|
||||
// upper bound of context byte length.
|
||||
|
@ -957,7 +957,7 @@ size_t tokenize_file(
|
|||
}
|
||||
|
||||
if (sample_size > 0) {
|
||||
// llama_tokenize expects zero terminated string,
|
||||
// jarvis_tokenize expects zero terminated string,
|
||||
// copy sample into buffer and zero terminate it.
|
||||
buf_sample.resize(sample_size);
|
||||
memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size);
|
||||
|
@ -966,7 +966,7 @@ size_t tokenize_file(
|
|||
|
||||
// tokenize the sample
|
||||
tok_sample.resize(buf_sample.size() + n_max_tokens_overhead);
|
||||
int n_tokens = llama_tokenize(llama_get_model(lctx),
|
||||
int n_tokens = jarvis_tokenize(jarvis_get_model(lctx),
|
||||
buf_sample.data(),
|
||||
(int) buf_sample.size(),
|
||||
tok_sample.data(),
|
||||
|
@ -974,7 +974,7 @@ size_t tokenize_file(
|
|||
false, false);
|
||||
if (n_tokens < 0) {
|
||||
tok_sample.resize(-n_tokens);
|
||||
n_tokens = llama_tokenize(llama_get_model(lctx),
|
||||
n_tokens = jarvis_tokenize(jarvis_get_model(lctx),
|
||||
buf_sample.data(),
|
||||
(int) buf_sample.size(),
|
||||
tok_sample.data(),
|
||||
|
@ -1365,7 +1365,7 @@ bool consume_common_train_arg(
|
|||
*invalid_param = true;
|
||||
return true;
|
||||
}
|
||||
if (llama_supports_gpu_offload()) {
|
||||
if (jarvis_supports_gpu_offload()) {
|
||||
params->n_gpu_layers = std::stoi(argv[i]);
|
||||
} else {
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
||||
|
|
|
@ -7,9 +7,9 @@
|
|||
#include <vector>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "jarvis.h"
|
||||
|
||||
#define LLAMA_TRAIN_MAX_NODES 16384
|
||||
#define JARVIS_TRAIN_MAX_NODES 16384
|
||||
|
||||
typedef std::string mt19937_state;
|
||||
|
||||
|
@ -92,9 +92,9 @@ struct train_opt_callback_data {
|
|||
struct train_state * train;
|
||||
save_train_files_callback save_cb;
|
||||
void * save_data;
|
||||
struct llama_context * lctx;
|
||||
struct jarvis_context * lctx;
|
||||
int last_save_iter;
|
||||
llama_token * tokens_data;
|
||||
jarvis_token * tokens_data;
|
||||
size_t tokens_size;
|
||||
size_t * samples_begin;
|
||||
size_t * samples_size;
|
||||
|
@ -146,18 +146,18 @@ void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
|
|||
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
|
||||
|
||||
size_t tokenize_file(
|
||||
struct llama_context * lctx,
|
||||
struct jarvis_context * lctx,
|
||||
const char * filename,
|
||||
const std::string & sample_start,
|
||||
bool include_sample_start,
|
||||
bool overlapping_samples,
|
||||
unsigned context_length,
|
||||
std::vector<llama_token> & out_tokens,
|
||||
std::vector<jarvis_token> & out_tokens,
|
||||
std::vector<size_t> & out_samples_begin,
|
||||
std::vector<size_t> & out_samples_size);
|
||||
|
||||
int64_t get_example_targets_batch(
|
||||
struct llama_context * lctx,
|
||||
struct jarvis_context * lctx,
|
||||
struct ggml_tensor * tokens_input,
|
||||
struct ggml_tensor * target_probs,
|
||||
int64_t example_id,
|
||||
|
@ -165,7 +165,7 @@ int64_t get_example_targets_batch(
|
|||
const size_t * samples_begin,
|
||||
const size_t * samples_size,
|
||||
size_t samples_count,
|
||||
const llama_token * train_data,
|
||||
const jarvis_token * train_data,
|
||||
size_t n_train_data,
|
||||
bool separate_with_eos,
|
||||
bool separate_with_bos,
|
||||
|
|
|
@ -49,7 +49,7 @@ class Model:
|
|||
_model_classes: dict[str, type[Model]] = {}
|
||||
|
||||
dir_model: Path
|
||||
ftype: gguf.LlamaFileType
|
||||
ftype: gguf.JarvisFileType
|
||||
fname_out: Path
|
||||
is_big_endian: bool
|
||||
endianess: gguf.GGUFEndian
|
||||
|
@ -69,7 +69,7 @@ class Model:
|
|||
# subclasses should define this!
|
||||
model_arch: gguf.MODEL_ARCH
|
||||
|
||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
|
||||
def __init__(self, dir_model: Path, ftype: gguf.JarvisFileType, fname_out: Path, is_big_endian: bool = False,
|
||||
use_temp_file: bool = False, eager: bool = False,
|
||||
metadata_override: Path | None = None, model_name: str | None = None,
|
||||
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
|
||||
|
@ -96,15 +96,15 @@ class Model:
|
|||
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
|
||||
|
||||
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
|
||||
if self.ftype == gguf.LlamaFileType.GUESSED:
|
||||
if self.ftype == gguf.JarvisFileType.GUESSED:
|
||||
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
|
||||
_, first_tensor = next(self.get_tensors())
|
||||
if first_tensor.dtype == torch.float16:
|
||||
logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
|
||||
self.ftype = gguf.LlamaFileType.MOSTLY_F16
|
||||
self.ftype = gguf.JarvisFileType.MOSTLY_F16
|
||||
else:
|
||||
logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
|
||||
self.ftype = gguf.LlamaFileType.MOSTLY_BF16
|
||||
self.ftype = gguf.JarvisFileType.MOSTLY_BF16
|
||||
|
||||
# Configure GGUF Writer
|
||||
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
|
||||
|
@ -308,7 +308,7 @@ class Model:
|
|||
if n_dims <= 1 or new_name.endswith("_norm.weight"):
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
|
||||
# Conditions should closely match those in llama_model_quantize_internal in llama.cpp
|
||||
# Conditions should closely match those in jarvis_model_quantize_internal in jarvis.cpp
|
||||
# Some tensor types are always in float32
|
||||
if data_qtype is False and (
|
||||
any(
|
||||
|
@ -337,25 +337,25 @@ class Model:
|
|||
)
|
||||
):
|
||||
if self.ftype in (
|
||||
gguf.LlamaFileType.MOSTLY_TQ1_0,
|
||||
gguf.LlamaFileType.MOSTLY_TQ2_0,
|
||||
gguf.JarvisFileType.MOSTLY_TQ1_0,
|
||||
gguf.JarvisFileType.MOSTLY_TQ2_0,
|
||||
):
|
||||
# TODO: use Q4_K and Q6_K
|
||||
data_qtype = gguf.GGMLQuantizationType.F16
|
||||
|
||||
# No override (data_qtype is False), or wants to be quantized (data_qtype is True)
|
||||
if isinstance(data_qtype, bool):
|
||||
if self.ftype == gguf.LlamaFileType.ALL_F32:
|
||||
if self.ftype == gguf.JarvisFileType.ALL_F32:
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
|
||||
elif self.ftype == gguf.JarvisFileType.MOSTLY_F16:
|
||||
data_qtype = gguf.GGMLQuantizationType.F16
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
|
||||
elif self.ftype == gguf.JarvisFileType.MOSTLY_BF16:
|
||||
data_qtype = gguf.GGMLQuantizationType.BF16
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
|
||||
elif self.ftype == gguf.JarvisFileType.MOSTLY_Q8_0:
|
||||
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
|
||||
elif self.ftype == gguf.JarvisFileType.MOSTLY_TQ1_0:
|
||||
data_qtype = gguf.GGMLQuantizationType.TQ1_0
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
|
||||
elif self.ftype == gguf.JarvisFileType.MOSTLY_TQ2_0:
|
||||
data_qtype = gguf.GGMLQuantizationType.TQ2_0
|
||||
else:
|
||||
raise ValueError(f"Unknown file type: {self.ftype.name}")
|
||||
|
@ -394,7 +394,7 @@ class Model:
|
|||
if self.metadata.size_label is None and total_params > 0:
|
||||
self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
|
||||
|
||||
# Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
|
||||
# Extract the encoding scheme from the file type name. e.g. 'gguf.JarvisFileType.MOSTLY_Q8_0' --> 'Q8_0'
|
||||
output_type: str = self.ftype.name.partition("_")[2]
|
||||
|
||||
# Filename Output
|
||||
|
@ -537,13 +537,13 @@ class Model:
|
|||
|
||||
# NOTE: this function is generated by convert_hf_to_gguf_update.py
|
||||
# do not modify it manually!
|
||||
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
|
||||
# ref: https://github.com/ggerganov/jarvis.cpp/pull/6920
|
||||
# Marker: Start get_vocab_base_pre
|
||||
def get_vocab_base_pre(self, tokenizer) -> str:
|
||||
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
||||
# is specific for the BPE pre-tokenizer used by the model
|
||||
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
|
||||
# use in llama.cpp to implement the same pre-tokenizer
|
||||
# use in jarvis.cpp to implement the same pre-tokenizer
|
||||
|
||||
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
||||
|
||||
|
@ -559,8 +559,8 @@ class Model:
|
|||
# or pull the latest version of the model from Huggingface
|
||||
# don't edit the hashes manually!
|
||||
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
|
||||
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||
res = "llama-bpe"
|
||||
# ref: https://huggingface.co/meta-jarvis/Meta-Jarvis-3-8B
|
||||
res = "jarvis-bpe"
|
||||
if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
|
||||
# ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
|
||||
res = "deepseek-llm"
|
||||
|
@ -616,7 +616,7 @@ class Model:
|
|||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
|
||||
res = "jina-v2-de"
|
||||
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
|
||||
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
|
||||
# ref: https://huggingface.co/abacusai/Smaug-Jarvis-3-70B-Instruct
|
||||
res = "smaug-bpe"
|
||||
if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
|
||||
# ref: https://huggingface.co/LumiOpen/Poro-34B-chat
|
||||
|
@ -666,7 +666,7 @@ class Model:
|
|||
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
|
||||
logger.warning("** - the pre-tokenization config has changed upstream")
|
||||
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
|
||||
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
|
||||
logger.warning("** ref: https://github.com/ggerganov/jarvis.cpp/pull/6920")
|
||||
logger.warning("**")
|
||||
logger.warning(f"** chkhsh: {chkhsh}")
|
||||
logger.warning("**************************************************************************************")
|
||||
|
@ -746,7 +746,7 @@ class Model:
|
|||
def _set_vocab_sentencepiece(self, add_to_gguf=True):
|
||||
tokens, scores, toktypes = self._create_vocab_sentencepiece()
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
|
@ -835,8 +835,8 @@ class Model:
|
|||
|
||||
return tokens, scores, toktypes
|
||||
|
||||
def _set_vocab_llama_hf(self):
|
||||
vocab = gguf.LlamaHfVocab(self.dir_model)
|
||||
def _set_vocab_jarvis_hf(self):
|
||||
vocab = gguf.JarvisHfVocab(self.dir_model)
|
||||
tokens = []
|
||||
scores = []
|
||||
toktypes = []
|
||||
|
@ -848,7 +848,7 @@ class Model:
|
|||
|
||||
assert len(tokens) == vocab.vocab_size
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
|
@ -857,7 +857,7 @@ class Model:
|
|||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
|
||||
def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "jarvis-spm"], vocab_size: int):
|
||||
tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
|
||||
logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
|
||||
vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
|
||||
|
@ -875,7 +875,7 @@ class Model:
|
|||
assert field # token list
|
||||
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
|
||||
|
||||
if model_name == "llama-spm":
|
||||
if model_name == "jarvis-spm":
|
||||
field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
|
||||
assert field # token scores
|
||||
self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
|
||||
|
@ -884,7 +884,7 @@ class Model:
|
|||
assert field # token types
|
||||
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
|
||||
|
||||
if model_name != "llama-spm":
|
||||
if model_name != "jarvis-spm":
|
||||
field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
|
||||
assert field # token merges
|
||||
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
|
||||
|
@ -1226,7 +1226,7 @@ class XverseModel(Model):
|
|||
tokens.append(token_text)
|
||||
toktypes.append(toktype)
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
@ -1515,21 +1515,21 @@ class StableLMModel(Model):
|
|||
raise ValueError(f"Unprocessed norms: {norms}")
|
||||
|
||||
|
||||
@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
|
||||
class LlamaModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.LLAMA
|
||||
@Model.register("LLaMAForCausalLM", "JarvisForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
|
||||
class JarvisModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.JARVIS
|
||||
|
||||
def set_vocab(self):
|
||||
try:
|
||||
self._set_vocab_sentencepiece()
|
||||
except FileNotFoundError:
|
||||
try:
|
||||
self._set_vocab_llama_hf()
|
||||
self._set_vocab_jarvis_hf()
|
||||
except (FileNotFoundError, TypeError):
|
||||
# Llama 3
|
||||
# Jarvis 3
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
# Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
|
||||
# Apply to CodeJarvis only (and ignore for Jarvis 3 with a vocab size of 128256)
|
||||
if self.hparams.get("vocab_size", 32000) == 32016:
|
||||
special_vocab = gguf.SpecialVocab(
|
||||
self.dir_model, load_merges=False,
|
||||
|
@ -1583,9 +1583,9 @@ class LlamaModel(Model):
|
|||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
|
||||
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||
data_torch = JarvisModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
||||
data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head)
|
||||
|
||||
# process the experts separately
|
||||
if name.find("block_sparse_moe.experts") != -1:
|
||||
|
@ -1625,7 +1625,7 @@ class LlamaModel(Model):
|
|||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
||||
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
||||
if rope_scaling.get("rope_type", '').lower() == "jarvis3":
|
||||
base = self.hparams.get("rope_theta", 10000.0)
|
||||
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
||||
|
@ -1793,7 +1793,7 @@ class DbrxModel(Model):
|
|||
|
||||
# Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
|
||||
# original implementation expects (n_expert, n_ff, n_embd) for all experts weights
|
||||
# But llama.cpp moe graph works differently
|
||||
# But jarvis.cpp moe graph works differently
|
||||
# AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
|
||||
# so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
|
||||
exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
|
||||
|
@ -1842,7 +1842,7 @@ class MiniCPMModel(Model):
|
|||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_llama_hf()
|
||||
self._set_vocab_jarvis_hf()
|
||||
|
||||
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
|
||||
if n_kv_head is not None and n_head != n_kv_head:
|
||||
|
@ -2188,7 +2188,7 @@ class Phi3MiniModel(Model):
|
|||
if foken_data.get("special"):
|
||||
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
|
@ -2456,7 +2456,7 @@ class InternLM2Model(Model):
|
|||
if foken_data.get("special"):
|
||||
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
|
@ -2468,7 +2468,7 @@ class InternLM2Model(Model):
|
|||
if chat_eos_token_id is not None:
|
||||
# For the chat model, we replace the eos with '<|im_end|>'.
|
||||
# TODO: this is a hack, should be fixed
|
||||
# https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
|
||||
# https://github.com/ggerganov/jarvis.cpp/pull/6745#issuecomment-2067687048
|
||||
special_vocab.special_token_ids["eos"] = chat_eos_token_id
|
||||
logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
|
||||
" in chat mode so that the conversation can end normally.")
|
||||
|
@ -2505,8 +2505,8 @@ class InternLM2Model(Model):
|
|||
q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
|
||||
|
||||
# The model weights of q and k equire additional reshape.
|
||||
q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
|
||||
k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
|
||||
q = JarvisModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
|
||||
k = JarvisModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
|
||||
v = v.reshape((-1, v.shape[-1]))
|
||||
|
||||
return [
|
||||
|
@ -2769,7 +2769,7 @@ class GemmaModel(Model):
|
|||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
# lm_head is not used in llama.cpp, while autoawq will include this tensor in model
|
||||
# lm_head is not used in jarvis.cpp, while autoawq will include this tensor in model
|
||||
# To prevent errors, skip loading lm_head.weight.
|
||||
if name == "lm_head.weight":
|
||||
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
|
||||
|
@ -2816,7 +2816,7 @@ class Gemma2Model(Model):
|
|||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
# lm_head is not used in llama.cpp, while autoawq will include this tensor in model
|
||||
# lm_head is not used in jarvis.cpp, while autoawq will include this tensor in model
|
||||
# To prevent errors, skip loading lm_head.weight.
|
||||
if name == "lm_head.weight":
|
||||
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
|
||||
|
@ -2894,7 +2894,7 @@ class Rwkv6Model(Model):
|
|||
self.gguf_writer.add_feed_forward_length(intermediate_size)
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
# required by llama.cpp, unused
|
||||
# required by jarvis.cpp, unused
|
||||
self.gguf_writer.add_head_count(0)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
|
@ -3024,7 +3024,7 @@ class OlmoModel(Model):
|
|||
self.gguf_writer.add_clamp_kqv(clip_qkv)
|
||||
|
||||
# Same as super class, but permuting q_proj, k_proj
|
||||
# Copied from: LlamaModel
|
||||
# Copied from: JarvisModel
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
|
@ -3032,9 +3032,9 @@ class OlmoModel(Model):
|
|||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
|
||||
if name.endswith("q_proj.weight"):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||
data_torch = JarvisModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith("k_proj.weight"):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
||||
data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head)
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
@ -3174,12 +3174,12 @@ class OpenELMModel(Model):
|
|||
assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
|
||||
assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
|
||||
|
||||
# Uses the tokenizer from meta-llama/Llama-2-7b-hf
|
||||
# Uses the tokenizer from meta-jarvis/Jarvis-2-7b-hf
|
||||
def set_vocab(self):
|
||||
try:
|
||||
self._set_vocab_sentencepiece()
|
||||
except FileNotFoundError:
|
||||
self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"])
|
||||
self._set_vocab_builtin("jarvis-spm", self.hparams["vocab_size"])
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
n_embd = self._n_embd
|
||||
|
@ -3300,7 +3300,7 @@ class ArcticModel(Model):
|
|||
toktypes[token_id] = token_type
|
||||
scores[token_id] = token_score
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
|
@ -3322,9 +3322,9 @@ class ArcticModel(Model):
|
|||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
|
||||
if name.endswith("q_proj.weight"):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||
data_torch = JarvisModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith("k_proj.weight"):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
||||
data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head)
|
||||
|
||||
# process the experts separately
|
||||
if name.find("block_sparse_moe.experts") != -1:
|
||||
|
@ -3882,7 +3882,7 @@ class ChatGLMModel(Model):
|
|||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||
# glm3 needs prefix and suffix formatted as:
|
||||
# prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
|
||||
self.gguf_writer.add_tokenizer_pre("chatglm-spm")
|
||||
|
@ -4087,7 +4087,7 @@ class ExaoneModel(Model):
|
|||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
||||
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
||||
if rope_scaling.get("rope_type", '').lower() == "jarvis3":
|
||||
base = self.hparams.get("rope_theta", 10000.0)
|
||||
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
||||
|
@ -4116,12 +4116,12 @@ class ExaoneModel(Model):
|
|||
|
||||
|
||||
@Model.register("GraniteForCausalLM")
|
||||
class GraniteModel(LlamaModel):
|
||||
class GraniteModel(JarvisModel):
|
||||
"""Conversion for IBM's GraniteForCausalLM"""
|
||||
model_arch = gguf.MODEL_ARCH.GRANITE
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
"""Granite uses standard llama parameters with the following differences:
|
||||
"""Granite uses standard jarvis parameters with the following differences:
|
||||
|
||||
- No head_dim support
|
||||
- New multiplier params:
|
||||
|
@ -4196,9 +4196,9 @@ class ChameleonModel(Model):
|
|||
hidden_dim = self.hparams.get("hidden_size")
|
||||
|
||||
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||
data_torch = JarvisModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
||||
data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head)
|
||||
if name.endswith(("q_norm.weight", "q_norm.bias")):
|
||||
data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
|
||||
if name.endswith(("k_norm.weight", "k_norm.bias")):
|
||||
|
@ -4379,14 +4379,14 @@ def main() -> None:
|
|||
logger.error(f'Error: {args.model} is not a directory')
|
||||
sys.exit(1)
|
||||
|
||||
ftype_map: dict[str, gguf.LlamaFileType] = {
|
||||
"f32": gguf.LlamaFileType.ALL_F32,
|
||||
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
||||
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
||||
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
||||
"tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
|
||||
"tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
|
||||
"auto": gguf.LlamaFileType.GUESSED,
|
||||
ftype_map: dict[str, gguf.JarvisFileType] = {
|
||||
"f32": gguf.JarvisFileType.ALL_F32,
|
||||
"f16": gguf.JarvisFileType.MOSTLY_F16,
|
||||
"bf16": gguf.JarvisFileType.MOSTLY_BF16,
|
||||
"q8_0": gguf.JarvisFileType.MOSTLY_Q8_0,
|
||||
"tq1_0": gguf.JarvisFileType.MOSTLY_TQ1_0,
|
||||
"tq2_0": gguf.JarvisFileType.MOSTLY_TQ2_0,
|
||||
"auto": gguf.JarvisFileType.GUESSED,
|
||||
}
|
||||
|
||||
is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
|
||||
|
|
|
@ -5,10 +5,10 @@
|
|||
# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
|
||||
#
|
||||
# This is necessary in order to analyze the type of pre-tokenizer used by the model and
|
||||
# provide the necessary information to llama.cpp via the GGUF header in order to implement
|
||||
# provide the necessary information to jarvis.cpp via the GGUF header in order to implement
|
||||
# the same pre-tokenizer.
|
||||
#
|
||||
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
|
||||
# ref: https://github.com/ggerganov/jarvis.cpp/pull/6920
|
||||
#
|
||||
# Instructions:
|
||||
#
|
||||
|
@ -18,9 +18,9 @@
|
|||
# python3 convert_hf_to_gguf_update.py <huggingface_token>
|
||||
#
|
||||
# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
|
||||
# - Update llama.cpp with the new pre-tokenizer if necessary
|
||||
# - Update jarvis.cpp with the new pre-tokenizer if necessary
|
||||
#
|
||||
# TODO: generate tokenizer tests for llama.cpp
|
||||
# TODO: generate tokenizer tests for jarvis.cpp
|
||||
#
|
||||
|
||||
import logging
|
||||
|
@ -65,8 +65,8 @@ else:
|
|||
|
||||
# TODO: add models here, base models preferred
|
||||
models = [
|
||||
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
|
||||
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
|
||||
{"name": "jarvis-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-jarvis/Jarvis-2-7b-hf", },
|
||||
{"name": "jarvis-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-jarvis/Meta-Jarvis-3-8B", },
|
||||
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
|
||||
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
|
||||
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
|
||||
|
@ -86,7 +86,7 @@ models = [
|
|||
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
||||
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
|
||||
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Jarvis-3-70B-Instruct", },
|
||||
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
|
||||
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
|
||||
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
|
||||
|
@ -215,7 +215,7 @@ src_func = f"""
|
|||
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
||||
# is specific for the BPE pre-tokenizer used by the model
|
||||
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
|
||||
# use in llama.cpp to implement the same pre-tokenizer
|
||||
# use in jarvis.cpp to implement the same pre-tokenizer
|
||||
|
||||
chktxt = {repr(CHK_TXT)}
|
||||
|
||||
|
@ -239,7 +239,7 @@ src_func = f"""
|
|||
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
|
||||
logger.warning("** - the pre-tokenization config has changed upstream")
|
||||
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
|
||||
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
|
||||
logger.warning("** ref: https://github.com/ggerganov/jarvis.cpp/pull/6920")
|
||||
logger.warning("**")
|
||||
logger.warning(f"** chkhsh: {{chkhsh}}")
|
||||
logger.warning("**************************************************************************************")
|
||||
|
@ -311,7 +311,7 @@ tests = [
|
|||
"3333333",
|
||||
"33333333",
|
||||
"333333333",
|
||||
"Cửa Việt", # llama-bpe fails on this
|
||||
"Cửa Việt", # jarvis-bpe fails on this
|
||||
" discards",
|
||||
CHK_TXT,
|
||||
]
|
||||
|
|
|
@ -223,13 +223,13 @@ class GGMLToGGUF:
|
|||
assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
|
||||
logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
|
||||
self.n_kv_head = n_kv_head
|
||||
self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
|
||||
self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.JARVIS, ggml_model.hyperparameters.n_layer)
|
||||
|
||||
def save(self):
|
||||
logger.info('* Preparing to save GGUF file')
|
||||
gguf_writer = gguf.GGUFWriter(
|
||||
self.cfg.output,
|
||||
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
|
||||
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.JARVIS],
|
||||
use_temp_file = False)
|
||||
self.add_params(gguf_writer)
|
||||
self.add_vocab(gguf_writer)
|
||||
|
@ -286,7 +286,7 @@ class GGMLToGGUF:
|
|||
|
||||
def add_vocab(self, gguf_writer):
|
||||
hp = self.model.hyperparameters
|
||||
gguf_writer.add_tokenizer_model('llama')
|
||||
gguf_writer.add_tokenizer_model('jarvis')
|
||||
gguf_writer.add_tokenizer_pre('default')
|
||||
tokens = []
|
||||
scores = []
|
||||
|
@ -358,7 +358,7 @@ class GGMLToGGUF:
|
|||
|
||||
|
||||
def handle_metadata(cfg, hp):
|
||||
import examples.convert_legacy_llama as convert
|
||||
import examples.convert_legacy_jarvis as convert
|
||||
|
||||
assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
|
||||
hf_config_path = cfg.model_metadata_dir / "config.json"
|
||||
|
|
|
@ -271,12 +271,12 @@ if __name__ == '__main__':
|
|||
args = parse_args()
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
|
||||
ftype_map: dict[str, gguf.LlamaFileType] = {
|
||||
"f32": gguf.LlamaFileType.ALL_F32,
|
||||
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
||||
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
||||
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
||||
"auto": gguf.LlamaFileType.GUESSED,
|
||||
ftype_map: dict[str, gguf.JarvisFileType] = {
|
||||
"f32": gguf.JarvisFileType.ALL_F32,
|
||||
"f16": gguf.JarvisFileType.MOSTLY_F16,
|
||||
"bf16": gguf.JarvisFileType.MOSTLY_BF16,
|
||||
"q8_0": gguf.JarvisFileType.MOSTLY_Q8_0,
|
||||
"auto": gguf.JarvisFileType.GUESSED,
|
||||
}
|
||||
|
||||
ftype = ftype_map[args.outtype]
|
||||
|
@ -372,9 +372,9 @@ if __name__ == '__main__':
|
|||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
dest = list(super().modify_tensors(data_torch, name, bid))
|
||||
# some archs may have the same tensor for lm_head and output (tie word embeddings)
|
||||
# in this case, adapters targeting lm_head will fail when using llama-export-lora
|
||||
# in this case, adapters targeting lm_head will fail when using jarvis-export-lora
|
||||
# therefore, we ignore them for now
|
||||
# see: https://github.com/ggerganov/llama.cpp/issues/9065
|
||||
# see: https://github.com/ggerganov/jarvis.cpp/issues/9065
|
||||
if name == "lm_head.weight" and len(dest) == 0:
|
||||
raise ValueError("lm_head is present in adapter, but is ignored in base model")
|
||||
for dest_name, dest_data in dest:
|
||||
|
|
|
@ -5,14 +5,14 @@

[Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid.

With Termux, you can install and run `llama.cpp` as if the environment were Linux. Once in the Termux shell:
With Termux, you can install and run `jarvis.cpp` as if the environment were Linux. Once in the Termux shell:

```
$ apt update && apt upgrade -y
$ apt install git cmake
```

Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake.
Then, follow the [build instructions](https://github.com/ggerganov/jarvis.cpp/blob/master/docs/build.md), specifically for CMake.

Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance:

@ -20,22 +20,22 @@ Once the binaries are built, download your model of choice (e.g., from Hugging F
$ curl -L {model-url} -o ~/{model}.gguf
```

Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:
Then, if you are not already in the repo directory, `cd` into `jarvis.cpp` and:

```
$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
$ ./build/bin/jarvis-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
```

Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
Here, we show `jarvis-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.

To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:

https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4

## Cross-compile using Android NDK
It's possible to build `llama.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.)
It's possible to build `jarvis.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.)

Once you're ready and have cloned `llama.cpp`, invoke the following in the project directory:
Once you're ready and have cloned `jarvis.cpp`, invoke the following in the project directory:

```
$ cmake \

@ -45,15 +45,15 @@ $ cmake \
-DCMAKE_C_FLAGS="-march=armv8.7a" \
-DCMAKE_CXX_FLAGS="-march=armv8.7a" \
-DGGML_OPENMP=OFF \
-DGGML_LLAMAFILE=OFF \
-DGGML_JARVISFILE=OFF \
-B build-android
```

Notes:
- While later versions of Android NDK ship with OpenMP, it must still be installed by CMake as a dependency, which is not supported at this time
- `llamafile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/llamafile/issues/325)
- `jarvisfile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/jarvisfile/issues/325)

The above command should configure `llama.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `llama.cpp` includes runtime checks for available CPU features it can use.
The above command should configure `jarvis.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `jarvis.cpp` includes runtime checks for available CPU features it can use.

Feel free to adjust the Android ABI for your target. Once the project is configured:

@ -65,17 +65,17 @@ $ cmake --install build-android --prefix {install-dir} --config Release
After installing, go ahead and download the model of your choice to your host system. Then:

```
$ adb shell "mkdir /data/local/tmp/llama.cpp"
$ adb push {install-dir} /data/local/tmp/llama.cpp/
$ adb push {model}.gguf /data/local/tmp/llama.cpp/
$ adb shell "mkdir /data/local/tmp/jarvis.cpp"
$ adb push {install-dir} /data/local/tmp/jarvis.cpp/
$ adb push {model}.gguf /data/local/tmp/jarvis.cpp/
$ adb shell
```

In the `adb shell`:

```
$ cd /data/local/tmp/llama.cpp
$ LD_LIBRARY_PATH=lib ./bin/llama-simple -m {model}.gguf -c {context-size} -p "{your-prompt}"
$ cd /data/local/tmp/jarvis.cpp
$ LD_LIBRARY_PATH=lib ./bin/jarvis-simple -m {model}.gguf -c {context-size} -p "{your-prompt}"
```

That's it!

@ -25,13 +25,13 @@ sudo make install

We recommend using openmp since it's easier to modify the cores being used.

### llama.cpp compilation
### jarvis.cpp compilation

Makefile:

```bash
make GGML_BLIS=1 -j
# make GGML_BLIS=1 llama-benchmark-matmult
# make GGML_BLIS=1 jarvis-benchmark-matmult
```

CMake:

@ -43,7 +43,7 @@ cmake -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME ..
make -j
```

### llama.cpp execution
### jarvis.cpp execution

According to the BLIS documentation, we could set the following
environment variables to modify the behavior of openmp:

@ -1,4 +1,4 @@
|
|||
# llama.cpp for CANN
|
||||
# jarvis.cpp for CANN
|
||||
|
||||
- [Background](#background)
|
||||
- [News](#news)
|
||||
|
@ -17,9 +17,9 @@
|
|||
|
||||
**CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform.
|
||||
|
||||
**Llama.cpp + CANN**
|
||||
**Jarvis.cpp + CANN**
|
||||
|
||||
The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are intergrated to CANN Toolkit and kernels to using Ascend NPU directly.
|
||||
The jarvis.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are intergrated to CANN Toolkit and kernels to using Ascend NPU directly.
|
||||
|
||||
## News
|
||||
|
||||
|
@ -78,11 +78,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
|||
| GritLM-7B | √ | √ | √ |
|
||||
| internlm2_5-7b-chat | √ | √ | √ |
|
||||
| koala-7B-HF | √ | √ | √ |
|
||||
| Llama-2-7b-chat-hf | √ | √ | √ |
|
||||
| Llama-3-Smaug-8B | √ | √ | √ |
|
||||
| Llama2-Chinese-7b-Chat | √ | √ | √ |
|
||||
| Llama3-8B | √ | √ | √ |
|
||||
| Llama3-8b-chinese | √ | √ | √ |
|
||||
| Jarvis-2-7b-chat-hf | √ | √ | √ |
|
||||
| Jarvis-3-Smaug-8B | √ | √ | √ |
|
||||
| Jarvis2-Chinese-7b-Chat | √ | √ | √ |
|
||||
| Jarvis3-8B | √ | √ | √ |
|
||||
| Jarvis3-8b-chinese | √ | √ | √ |
|
||||
| mamba-130m-hf | √ | √ | √ |
|
||||
| Mistral-7B-Instruct-v0.2 | √ | √ | √ |
|
||||
| Mixtral-8x7B-Instruct-v0.1 | x | √ | √ |
|
||||
|
@ -120,9 +120,9 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
|||
## Docker
|
||||
|
||||
### Build Images
|
||||
You can get a image with llama.cpp in one command.
|
||||
You can get a image with jarvis.cpp in one command.
|
||||
```sh
|
||||
docker build -t llama-cpp-cann -f .devops/llama-cli-cann.Dockerfile .
|
||||
docker build -t jarvis-cpp-cann -f .devops/jarvis-cli-cann.Dockerfile .
|
||||
```
|
||||
|
||||
### Run container
|
||||
|
@ -133,7 +133,7 @@ npu-smi info
|
|||
|
||||
# Select the cards that you want to use, make sure these cards are not used by someone.
|
||||
# Following using cards of device0.
|
||||
docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
|
||||
docker run --name jarviscpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it jarvis-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
|
||||
```
|
||||
|
||||
*Notes:*
|
||||
|
@ -208,7 +208,7 @@ docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager
|
|||
|
||||
Upon a successful installation, CANN is enabled for the available Ascend devices.
|
||||
|
||||
### II. Build llama.cpp
|
||||
### II. Build jarvis.cpp
|
||||
|
||||
```sh
|
||||
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
|
||||
|
@ -242,13 +242,13 @@ cmake --build build --config release
|
|||
- Use device 0:
|
||||
|
||||
```sh
|
||||
./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||
./build/bin/jarvis-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||
```
|
||||
|
||||
- Use multiple devices:
|
||||
|
||||
```sh
|
||||
./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||
./build/bin/jarvis-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||
```
|
||||
|
||||
### **GitHub contribution**:
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# llama.cpp for SYCL
|
||||
# jarvis.cpp for SYCL
|
||||
|
||||
- [Background](#background)
|
||||
- [Recommended Release](#recommended-release)
|
||||
|
@ -24,9 +24,9 @@
|
|||
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
|
||||
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
|
||||
|
||||
### Llama.cpp + SYCL
|
||||
### Jarvis.cpp + SYCL
|
||||
|
||||
The llama.cpp SYCL backend is designed to support **Intel GPU** first and foremost. Thanks to the cross-platform nature of SYCL, it also supports GPUs from other vendors: Nvidia and AMD.
|
||||
The jarvis.cpp SYCL backend is designed to support **Intel GPU** first and foremost. Thanks to the cross-platform nature of SYCL, it also supports GPUs from other vendors: Nvidia and AMD.
|
||||
|
||||
## Recommended Release
|
||||
|
||||
|
@ -36,7 +36,7 @@ The following release is verified with good quality:
|
|||
|
||||
|Commit ID|Tag|Release|Verified Platform|
|
||||
|-|-|-|-|
|
||||
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
|
||||
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[jarvis-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/jarvis.cpp/releases/download/b3038/jarvis-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
|
||||
|
||||
|
||||
## News
|
||||
|
@ -46,7 +46,7 @@ The following release is verified with good quality:
|
|||
- Use oneDNN as the default GEMM library, improving compatibility with new Intel GPUs.
|
||||
|
||||
- 2024.5
|
||||
- Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
|
||||
- Performance is increased: 34 -> 37 tokens/s of jarvis-2-7b.Q4_0 on Arc770.
|
||||
- Arch Linux is verified successfully.
|
||||
|
||||
- 2024.4
|
||||
|
@ -54,8 +54,8 @@ The following release is verified with good quality:
|
|||
|
||||
- 2024.3
|
||||
- Release binary files for Windows.
|
||||
- A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
|
||||
- New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
|
||||
- A blog is published: **Run LLM on all Intel GPUs Using jarvis.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-jarvis-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-jarvis-cpp-fd2e2dcbd9bd).
|
||||
- New base line is ready: [tag b2437](https://github.com/ggerganov/jarvis.cpp/tree/b2437).
|
||||
- Support multiple cards: **--split-mode**: [none|layer]; [row] is not supported yet and is still under development.
|
||||
- Support assigning the main GPU with **--main-gpu**, replacing $GGML_SYCL_DEVICE.
|
||||
- Support detecting all level-zero GPUs that share the same top **Max compute units**.
|
||||
|
@ -100,9 +100,9 @@ SYCL backend supports Intel GPU Family:
|
|||
*Notes:*
|
||||
|
||||
- **Memory**
|
||||
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
|
||||
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/jarvis-cli`.
|
||||
|
||||
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For example, *llama-2-7b.Q4_0* requires at least 8.0GB for an integrated GPU and 4.0GB for a discrete GPU.
|
||||
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For example, *jarvis-2-7b.Q4_0* requires at least 8.0GB for an integrated GPU and 4.0GB for a discrete GPU.
|
||||
|
||||
- **Execution Unit (EU)**
|
||||
- If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use.
|
||||
|
@ -130,14 +130,14 @@ The docker build option is currently limited to *intel GPU* targets.
|
|||
### Build image
|
||||
```sh
|
||||
# Using FP16
|
||||
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
|
||||
docker build -t jarvis-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/jarvis-cli-intel.Dockerfile .
|
||||
```
|
||||
|
||||
*Notes*:
|
||||
|
||||
To build with the default FP32 precision *(slower than the FP16 alternative)*, remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command, as sketched below.
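A minimal sketch of that FP32 build, assuming nothing else changes in the command:

```sh
# Default FP32 build: same as above, just without the GGML_SYCL_F16 build argument
docker build -t jarvis-cpp-sycl -f .devops/jarvis-cli-intel.Dockerfile .
```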
|
||||
|
||||
You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
|
||||
You can also use the `.devops/jarvis-server-intel.Dockerfile`, which builds the *"server"* alternative.
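For example, a sketch of the corresponding build command (the `jarvis-server-sycl` tag is illustrative; the FP16 build argument is assumed to behave the same way for this Dockerfile):

```sh
# Build the server variant of the SYCL image
docker build -t jarvis-server-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/jarvis-server-intel.Dockerfile .
```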
|
||||
|
||||
### Run container
|
||||
|
||||
|
@ -145,7 +145,7 @@ You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *
|
|||
# First, find all the DRI cards
|
||||
ls -la /dev/dri
|
||||
# Then, pick the card that you want to use (here, for example, /dev/dri/card1).
|
||||
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 jarvis-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
```
|
||||
|
||||
*Notes:*
|
||||
|
@ -276,7 +276,7 @@ For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:
|
|||
[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9]
|
||||
```
|
||||
|
||||
### II. Build llama.cpp
|
||||
### II. Build jarvis.cpp
|
||||
|
||||
#### Intel GPU
|
||||
|
||||
|
@ -309,7 +309,7 @@ export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
|
|||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
|
||||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
|
||||
|
||||
# Build LLAMA with Nvidia BLAS acceleration through SYCL
|
||||
# Build JARVIS with Nvidia BLAS acceleration through SYCL
|
||||
|
||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
@ -329,7 +329,7 @@ export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
|
|||
export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
|
||||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
|
||||
|
||||
# Build LLAMA with rocBLAS acceleration through SYCL
|
||||
# Build JARVIS with rocBLAS acceleration through SYCL
|
||||
|
||||
## AMD
|
||||
# Use FP32, FP16 is not supported
|
||||
|
@ -344,7 +344,7 @@ cmake --build build --config Release -j -v
|
|||
|
||||
#### Retrieve and prepare model
|
||||
|
||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as an example.
|
||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [jarvis-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Jarvis-2-7B-GGUF/blob/main/jarvis-2-7b.Q4_0.gguf) model as an example.
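For instance, a hedged sketch of the download from the command line (the URL mirrors the link above, using Hugging Face's `resolve` path for a direct file download; adjust the destination folder as needed):

```sh
# Illustrative: fetch the example model into ./models
wget -P models https://huggingface.co/TheBloke/Jarvis-2-7B-GGUF/resolve/main/jarvis-2-7b.Q4_0.gguf
```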
|
||||
|
||||
##### Check device
|
||||
|
||||
|
@ -359,7 +359,7 @@ source /opt/intel/oneapi/setvars.sh
|
|||
Similar to the native `sycl-ls`, available SYCL devices can be queried as follows:
|
||||
|
||||
```sh
|
||||
./build/bin/llama-ls-sycl-device
|
||||
./build/bin/jarvis-ls-sycl-device
|
||||
```
|
||||
|
||||
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *Intel GPUs* it would look like the following:
|
||||
|
@ -390,12 +390,12 @@ Choose one of following methods to run.
|
|||
- Use device 0:
|
||||
|
||||
```sh
|
||||
./examples/sycl/run-llama2.sh 0
|
||||
./examples/sycl/run-jarvis2.sh 0
|
||||
```
|
||||
- Use multiple devices:
|
||||
|
||||
```sh
|
||||
./examples/sycl/run-llama2.sh
|
||||
./examples/sycl/run-jarvis2.sh
|
||||
```
|
||||
|
||||
2. Command line
|
||||
|
@ -418,13 +418,13 @@ Examples:
|
|||
- Use device 0:
|
||||
|
||||
```sh
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/jarvis-cli -m models/jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||
```
|
||||
|
||||
- Use multiple devices:
|
||||
|
||||
```sh
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/jarvis-cli -m models/jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||
```
|
||||
|
||||
*Notes:*
|
||||
|
@ -492,7 +492,7 @@ a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can
|
|||
b. The new Visual Studio will install Ninja by default. (If not, please install it manually: https://ninja-build.org/)
|
||||
|
||||
|
||||
### II. Build llama.cpp
|
||||
### II. Build jarvis.cpp
|
||||
|
||||
You can download the release package for Windows directly; it includes the binary files and the required oneAPI DLL files.
|
||||
|
||||
|
@ -506,7 +506,7 @@ Choose one of following methods to build from source code.
|
|||
|
||||
2. CMake
|
||||
|
||||
On the oneAPI command line window, step into the llama.cpp main directory and run the following:
|
||||
On the oneAPI command line window, step into the jarvis.cpp main directory and run the following:
|
||||
|
||||
```
|
||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
|
@ -524,34 +524,34 @@ Or, use CMake presets to build:
|
|||
|
||||
```sh
|
||||
cmake --preset x64-windows-sycl-release
|
||||
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
||||
cmake --build build-x64-windows-sycl-release -j --target jarvis-cli
|
||||
|
||||
cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
|
||||
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
||||
cmake --build build-x64-windows-sycl-release -j --target jarvis-cli
|
||||
|
||||
cmake --preset x64-windows-sycl-debug
|
||||
cmake --build build-x64-windows-sycl-debug -j --target llama-cli
|
||||
cmake --build build-x64-windows-sycl-debug -j --target jarvis-cli
|
||||
```
|
||||
|
||||
3. Visual Studio
|
||||
|
||||
You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
|
||||
You can use Visual Studio to open jarvis.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
|
||||
|
||||
*Notes:*
|
||||
|
||||
- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.
|
||||
- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target jarvis-cli`.
|
||||
|
||||
### III. Run the inference
|
||||
|
||||
#### Retrieve and prepare model
|
||||
|
||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as an example.
|
||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [jarvis-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Jarvis-2-7B-GGUF/blob/main/jarvis-2-7b.Q4_0.gguf) model as an example.
|
||||
|
||||
##### Check device
|
||||
|
||||
1. Enable oneAPI running environment
|
||||
|
||||
On the oneAPI command line window, run the following and step into the llama.cpp directory:
|
||||
On the oneAPI command line window, run the following and step into the jarvis.cpp directory:
|
||||
```
|
||||
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
|
||||
```
|
||||
|
@ -561,7 +561,7 @@ On the oneAPI command line window, run the following and step into the llama.cpp
|
|||
Similar to the native `sycl-ls`, available SYCL devices can be queried as follows:
|
||||
|
||||
```
|
||||
build\bin\llama-ls-sycl-device.exe
|
||||
build\bin\jarvis-ls-sycl-device.exe
|
||||
```
|
||||
|
||||
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *Intel GPUs* it would look like the following:
|
||||
|
@ -589,7 +589,7 @@ Choose one of following methods to run.
|
|||
1. Script
|
||||
|
||||
```
|
||||
examples\sycl\win-run-llama2.bat
|
||||
examples\sycl\win-run-jarvis2.bat
|
||||
```
|
||||
|
||||
2. Command line
|
||||
|
@ -613,13 +613,13 @@ Examples:
|
|||
- Use device 0:
|
||||
|
||||
```
|
||||
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
|
||||
build\bin\jarvis-cli.exe -m models\jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
|
||||
```
|
||||
|
||||
- Use multiple devices:
|
||||
|
||||
```
|
||||
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
|
||||
build\bin\jarvis-cli.exe -m models\jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
|
||||
```
|
||||
|
||||
|
||||
|
@ -682,13 +682,13 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
|||
```
|
||||
Otherwise, please double-check the GPU driver installation steps.
|
||||
|
||||
- Can I report an Ollama issue on Intel GPU to the llama.cpp SYCL backend?
|
||||
- Can I report an Ojarvis issue on Intel GPU to the jarvis.cpp SYCL backend?
|
||||
|
||||
No. We can't support Ollama issues directly, because we aren't familiar with Ollama.
|
||||
No. We can't support Ojarvis issues directly, because we aren't familiar with Ojarvis.
|
||||
|
||||
We suggest reproducing the problem on llama.cpp and reporting a similar issue to llama.cpp. We will support it.
|
||||
We suggest reproducing the problem on jarvis.cpp and reporting a similar issue to jarvis.cpp. We will support it.
|
||||
|
||||
The same applies to other projects, including the llama.cpp SYCL backend.
|
||||
The same applies to other projects, including the jarvis.cpp SYCL backend.
|
||||
|
||||
- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`
|
||||
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
# Build llama.cpp locally
|
||||
# Build jarvis.cpp locally
|
||||
|
||||
**To get the Code:**
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
git clone https://github.com/ggerganov/jarvis.cpp
|
||||
cd jarvis.cpp
|
||||
```
|
||||
|
||||
In order to build llama.cpp you have four different options.
|
||||
In order to build jarvis.cpp you have four different options.
|
||||
|
||||
- Using `make`:
|
||||
- On Linux or MacOS:
|
||||
|
@ -21,17 +21,17 @@ In order to build llama.cpp you have four different options.
|
|||
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
|
||||
2. Extract `w64devkit` on your pc.
|
||||
3. Run `w64devkit.exe`.
|
||||
4. Use the `cd` command to reach the `llama.cpp` folder.
|
||||
4. Use the `cd` command to reach the `jarvis.cpp` folder.
|
||||
5. From here you can run:
|
||||
```bash
|
||||
make
|
||||
```
|
||||
|
||||
- Notes:
|
||||
- For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
|
||||
- For `Q4_0_4_4` quantization type build, add the `GGML_NO_JARVISFILE=1` flag. For example, use `make GGML_NO_JARVISFILE=1`.
|
||||
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
|
||||
- For faster repeated compilation, install [ccache](https://ccache.dev/).
|
||||
- For debug builds, run `make LLAMA_DEBUG=1`
|
||||
- For debug builds, run `make JARVIS_DEBUG=1`
|
||||
|
||||
- Using `CMake`:
|
||||
|
||||
|
@ -42,7 +42,7 @@ In order to build llama.cpp you have four different options.
|
|||
|
||||
**Notes**:
|
||||
|
||||
- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
|
||||
- For `Q4_0_4_4` quantization type build, add the `-DGGML_JARVISFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_JARVISFILE=OFF`.
|
||||
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
|
||||
- For faster repeated compilation, install [ccache](https://ccache.dev/).
|
||||
- For debug builds, there are two cases:
|
||||
|
@ -118,7 +118,7 @@ This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS i
|
|||
4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
|
||||
5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
|
||||
6. Run `w64devkit.exe`.
|
||||
7. Use the `cd` command to reach the `llama.cpp` folder.
|
||||
7. Use the `cd` command to reach the `jarvis.cpp` folder.
|
||||
8. From here you can run:
|
||||
|
||||
```bash
|
||||
|
@ -140,13 +140,13 @@ Check [BLIS.md](./backend/BLIS.md) for more information.
|
|||
|
||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
|
||||
|
||||
llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
|
||||
jarvis.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
|
||||
|
||||
For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
|
||||
For detailed info, please refer to [jarvis.cpp for SYCL](./backend/SYCL.md).
|
||||
|
||||
### Intel oneMKL
|
||||
|
||||
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
|
||||
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [jarvis.cpp for SYCL](./backend/SYCL.md).
|
||||
|
||||
- Using manual oneAPI installation:
|
||||
By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you have already sourced the Intel environment script and pass `-DGGML_BLAS=ON` to cmake, the MKL version of BLAS will automatically be selected. Otherwise, please install oneAPI and follow the steps below:
|
||||
|
@ -159,7 +159,7 @@ Building through oneAPI compilers will make avx_vnni instruction set available f
|
|||
- Using oneAPI docker image:
|
||||
If you do not want to source the environment variables and install oneAPI manually, you can also build the code using the Intel docker container [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit), as sketched below. Then, you can use the commands given above.
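A rough sketch of that workflow, assuming `cmake` is available inside the container and the repository is mounted at `/app` (image tag and paths are illustrative):

```sh
# Illustrative: configure and build with BLAS enabled inside the oneAPI base-kit container
docker run -it --rm -v "$(pwd):/app" -w /app intel/oneapi-basekit:latest \
    bash -c "cmake -B build -DGGML_BLAS=ON && cmake --build build --config Release"
```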
|
||||
|
||||
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
|
||||
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-jarvis2-on-intel-cpu.html) for more information.
|
||||
|
||||
### CUDA
|
||||
|
||||
|
@ -300,7 +300,7 @@ Libs: -lvulkan-1
|
|||
EOF
|
||||
|
||||
```
|
||||
Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
|
||||
Switch into the `jarvis.cpp` directory and run `make GGML_VULKAN=1`.
|
||||
|
||||
#### MSYS2
|
||||
Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
|
||||
|
@ -311,7 +311,7 @@ Install [MSYS2](https://www.msys2.org/) and then run the following commands in a
|
|||
mingw-w64-ucrt-x86_64-vulkan-devel \
|
||||
mingw-w64-ucrt-x86_64-shaderc
|
||||
```
|
||||
Switch into `llama.cpp` directory and build using CMake.
|
||||
Switch into `jarvis.cpp` directory and build using CMake.
|
||||
```sh
|
||||
cmake -B build -DGGML_VULKAN=ON
|
||||
cmake --build build --config Release
|
||||
|
@ -323,10 +323,10 @@ You don't need to install Vulkan SDK. It will be installed inside the container.
|
|||
|
||||
```sh
|
||||
# Build the image
|
||||
docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
|
||||
docker build -t jarvis-cpp-vulkan -f .devops/jarvis-cli-vulkan.Dockerfile .
|
||||
|
||||
# Then, use it:
|
||||
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 jarvis-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
```
|
||||
|
||||
**Without docker**:
|
||||
|
@ -348,13 +348,13 @@ Alternatively your package manager might be able to provide the appropriate libr
|
|||
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
|
||||
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
|
||||
|
||||
Then, build llama.cpp using the cmake command below:
|
||||
Then, build jarvis.cpp using the cmake command below:
|
||||
|
||||
```bash
|
||||
cmake -B build -DGGML_VULKAN=1
|
||||
cmake --build build --config Release
|
||||
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
|
||||
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
||||
./bin/jarvis-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
||||
|
||||
# You should see in the output, ggml_vulkan detected your GPU. For example:
|
||||
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
|
||||
|
@ -367,7 +367,7 @@ For more information about Ascend NPU in [Ascend Community](https://www.hiascend
|
|||
|
||||
Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
|
||||
|
||||
Go to `llama.cpp` directory and build using CMake.
|
||||
Go to `jarvis.cpp` directory and build using CMake.
|
||||
```bash
|
||||
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
|
||||
cmake --build build --config release
|
||||
|
@ -375,15 +375,15 @@ cmake --build build --config release
|
|||
|
||||
You can test with:
|
||||
|
||||
`./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
|
||||
`./build/jarvis-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
|
||||
|
||||
If the following info is output on screen, you are using the `llama.cpp` CANN backend:
|
||||
If the following info is output on screen, you are using the `jarvis.cpp` CANN backend:
|
||||
```bash
|
||||
llm_load_tensors: CANN buffer size = 13313.00 MiB
|
||||
llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
|
||||
jarvis_new_context_with_model: CANN compute buffer size = 1260.81 MiB
|
||||
```
|
||||
|
||||
For detailed info, such as supported models/devices and CANN installation, please refer to [llama.cpp for CANN](./backend/CANN.md).
|
||||
For detailed info, such as supported models/devices and CANN installation, please refer to [jarvis.cpp for CANN](./backend/CANN.md).
|
||||
|
||||
### Android
|
||||
|
||||
|
@ -391,6 +391,6 @@ To read documentation for how to build on Android, [click here](./android.md)
|
|||
|
||||
### Arm CPU optimized mulmat kernels
|
||||
|
||||
Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
|
||||
Jarvis.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
|
||||
|
||||
To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
|
||||
To support `Q4_0_4_4`, you must build with `GGML_NO_JARVISFILE=1` (`make`) or `-DGGML_JARVISFILE=OFF` (`cmake`).
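Putting those two notes together, a sketch of such a build (flags taken verbatim from the text above; adjust the `-march` string to match your CPU):

```bash
# Arm-optimized build with the Q4_0_4_4 path enabled
cmake -B build -DGGML_JARVISFILE=OFF -DCMAKE_C_FLAGS="-march=armv8.2a+i8mm+sve"
cmake --build build --config Release
```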
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
# Add a new model architecture to `llama.cpp`
|
||||
# Add a new model architecture to `jarvis.cpp`
|
||||
|
||||
Adding a model requires a few steps:
|
||||
|
||||
1. Convert the model to GGUF
|
||||
2. Define the model architecture in `llama.cpp`
|
||||
2. Define the model architecture in `jarvis.cpp`
|
||||
3. Build the GGML graph implementation
|
||||
|
||||
After following these steps, you can open a PR.
|
||||
|
@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
|
|||
### 1. Convert the model to GGUF
|
||||
|
||||
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
|
||||
Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
|
||||
Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_jarvis.py](/examples/convert_legacy_jarvis.py) (for `jarvis/jarvis2` models in `.pth` format).
|
||||
|
||||
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
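As a rough illustration only (script flags can differ between versions), converting a Hugging Face model directory might look like:

```sh
# Hypothetical invocation: convert an HF model directory to a GGUF file
python convert_hf_to_gguf.py path/to/hf-model --outfile path/to/model.gguf
```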
|
||||
|
||||
|
@ -81,26 +81,26 @@ Depending on the model configuration, tokenizer, code and tensors layout, you wi
|
|||
|
||||
NOTE: Tensor names must end with the `.weight` suffix; that is the convention, and several tools like `quantize` expect this to precede the weights.
|
||||
|
||||
### 2. Define the model architecture in `llama.cpp`
|
||||
### 2. Define the model architecture in `jarvis.cpp`
|
||||
|
||||
The model params and tensors layout must be defined in `llama.cpp`:
|
||||
The model params and tensors layout must be defined in `jarvis.cpp`:
|
||||
1. Define a new `llm_arch`
|
||||
2. Define the tensors layout in `LLM_TENSOR_NAMES`
|
||||
3. Add any non standard metadata in `llm_load_hparams`
|
||||
4. Create the tensors for inference in `llm_load_tensors`
|
||||
5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
|
||||
5. If the model has a RoPE operation, add the rope type in `jarvis_rope_type`
|
||||
|
||||
NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
|
||||
|
||||
### 3. Build the GGML graph implementation
|
||||
|
||||
This is the most fun part: you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
|
||||
This is the most fun part: you have to provide the inference graph implementation of the new model architecture in `jarvis_build_graph`.
|
||||
|
||||
Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
|
||||
Have a look at existing implementations like `build_jarvis`, `build_dbrx` or `build_bert`.
|
||||
|
||||
When implementing a new graph, please note that the underlying `ggml` backends might not support all of its operations; support for missing backend operations can be added in another PR.
|
||||
|
||||
Note: to debug the inference graph, you can use [llama-eval-callback](/examples/eval-callback/).
|
||||
Note: to debug the inference graph, you can use [jarvis-eval-callback](/examples/eval-callback/).
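A minimal sketch, assuming the tool accepts the common `-m`/`-p`/`-n` options used by the other examples:

```sh
# Assumed common CLI options; prints intermediate tensor data while evaluating the prompt
./build/bin/jarvis-eval-callback -m path_to_model -p "Hello" -n 16
```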
|
||||
|
||||
## GGUF specification
|
||||
|
||||
|
@ -108,12 +108,12 @@ https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
|
|||
|
||||
## Resources
|
||||
|
||||
- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
|
||||
- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009
|
||||
- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
|
||||
- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
|
||||
- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
|
||||
- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
|
||||
- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
|
||||
- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
|
||||
- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948
|
||||
- YaRN RoPE scaling https://github.com/ggerganov/jarvis.cpp/pull/2268
|
||||
- support Baichuan serial models https://github.com/ggerganov/jarvis.cpp/pull/3009
|
||||
- support attention bias https://github.com/ggerganov/jarvis.cpp/pull/4283
|
||||
- Mixtral support https://github.com/ggerganov/jarvis.cpp/pull/4406
|
||||
- BERT embeddings https://github.com/ggerganov/jarvis.cpp/pull/5423
|
||||
- Grok-1 support https://github.com/ggerganov/jarvis.cpp/pull/6204
|
||||
- Command R Plus support https://github.com/ggerganov/jarvis.cpp/pull/6491
|
||||
- support arch DBRX https://github.com/ggerganov/jarvis.cpp/pull/6515
|
||||
- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/jarvis.cpp/discussions/2948
|
||||
|
|
|
@ -51,7 +51,7 @@ rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
|
|||
Set up and trigger a build in debug mode. You may adapt the arguments as needed, but these are sane defaults.
|
||||
|
||||
```bash
|
||||
cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
|
||||
cmake -DCMAKE_BUILD_TYPE=Debug -DJARVIS_CUDA=1 -DJARVIS_FATAL_WARNINGS=ON ..
|
||||
make -j
|
||||
```
|
||||
|
||||
|
@ -71,12 +71,12 @@ This may return output similar to below (focusing on key lines to pay attention
|
|||
|
||||
```bash
|
||||
...
|
||||
1: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
|
||||
1: Test command: ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf"
|
||||
1: Working Directory: .
|
||||
Labels: main
|
||||
Test #1: test-tokenizer-0-llama-spm
|
||||
Test #1: test-tokenizer-0-jarvis-spm
|
||||
...
|
||||
4: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-falcon.gguf"
|
||||
4: Test command: ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-falcon.gguf"
|
||||
4: Working Directory: .
|
||||
Labels: main
|
||||
Test #4: test-tokenizer-0-falcon
|
||||
|
@ -86,8 +86,8 @@ Labels: main
|
|||
#### Step 4: Identify Test Command for Debugging
|
||||
|
||||
So for test #1 above, we can identify these two pieces of relevant information:
|
||||
* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
|
||||
* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`
|
||||
* Test Binary: `~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0`
|
||||
* Test GGUF Model: `~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf`
|
||||
|
||||
#### Step 5: Run GDB on test command
|
||||
|
||||
|
@ -100,5 +100,5 @@ gdb --args ${Test Binary} ${Test GGUF Model}
|
|||
Example:
|
||||
|
||||
```bash
|
||||
gdb --args ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
|
||||
gdb --args ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf"
|
||||
```
|
||||
|
|
|
@ -1,23 +1,23 @@
|
|||
# Token generation performance troubleshooting
|
||||
|
||||
## Verifying that the model is running on the GPU with CUDA
|
||||
Make sure you compiled llama with the correct env variables according to [this guide](/docs/build.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
|
||||
Make sure you compiled jarvis with the correct env variables according to [this guide](/docs/build.md#cuda), so that jarvis accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running jarvis, you may configure `N` to be very large, and jarvis will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
|
||||
```shell
|
||||
./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
|
||||
./jarvis-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
|
||||
```
|
||||
|
||||
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
|
||||
When running jarvis, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
|
||||
```shell
|
||||
llama_model_load_internal: [cublas] offloading 60 layers to GPU
|
||||
llama_model_load_internal: [cublas] offloading output layer to GPU
|
||||
llama_model_load_internal: [cublas] total VRAM used: 17223 MB
|
||||
jarvis_model_load_internal: [cublas] offloading 60 layers to GPU
|
||||
jarvis_model_load_internal: [cublas] offloading output layer to GPU
|
||||
jarvis_model_load_internal: [cublas] total VRAM used: 17223 MB
|
||||
... rest of inference
|
||||
```
|
||||
|
||||
If you see these lines, then the GPU is being used.
|
||||
|
||||
## Verifying that the CPU is not oversaturated
|
||||
llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
|
||||
jarvis accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
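For example, a quick way to probe this (model path and thread counts are illustrative):

```shell
# Compare tokens/s at increasing thread counts; stop increasing once it no longer helps
./jarvis-cli -m "path/to/model.gguf" -p "Hello" -n 64 -t 1
./jarvis-cli -m "path/to/model.gguf" -p "Hello" -n 64 -t 2
./jarvis-cli -m "path/to/model.gguf" -p "Hello" -n 64 -t 4
```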
|
||||
|
||||
# Example of runtime flags effect on inference speed benchmark
|
||||
These runs were tested on the following machine:
|
||||
|
@ -27,7 +27,7 @@ RAM: 32GB
|
|||
|
||||
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
|
||||
|
||||
Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
|
||||
Run command: `./jarvis-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
|
||||
|
||||
Result:
|
||||
|
||||
|
|
|
@ -2,26 +2,26 @@
|
|||
|
||||
## Prerequisites
|
||||
* Docker must be installed and running on your system.
|
||||
* Create a folder to store big models & intermediate files (e.g. /llama/models)
|
||||
* Create a folder to store big models & intermediate files (e.g. /jarvis/models)
|
||||
|
||||
## Images
|
||||
We have three Docker images available for this project:
|
||||
|
||||
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
1. `ghcr.io/ggerganov/jarvis.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
2. `ghcr.io/ggerganov/jarvis.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
3. `ghcr.io/ggerganov/jarvis.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
|
||||
Additionally, there are the following images, similar to the above:
|
||||
|
||||
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/jarvis.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/jarvis.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/jarvis.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/jarvis.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/jarvis.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/jarvis.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/jarvis.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/jarvis.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/jarvis.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
|
||||
The GPU-enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library), you'll need to build the images locally for now.
|
||||
|
||||
|
@ -32,25 +32,25 @@ The easiest way to download the models, convert them to ggml and optimize them i
|
|||
Replace `/path/to/models` below with the actual path where you downloaded the models.
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/jarvis.cpp:full --all-in-one "/models/" 7B
|
||||
```
|
||||
|
||||
On completion, you are ready to play!
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/jarvis.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
```
|
||||
|
||||
or with a light image:
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/jarvis.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
```
|
||||
|
||||
or with a server image:
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
|
||||
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/jarvis.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
|
||||
```
|
||||
|
||||
## Docker With CUDA
|
||||
|
@ -60,9 +60,9 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
|
|||
## Building Docker locally
|
||||
|
||||
```bash
|
||||
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
|
||||
docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
|
||||
docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
|
||||
docker build -t local/jarvis.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
|
||||
docker build -t local/jarvis.cpp:light-cuda -f .devops/jarvis-cli-cuda.Dockerfile .
|
||||
docker build -t local/jarvis.cpp:server-cuda -f .devops/jarvis-server-cuda.Dockerfile .
|
||||
```
|
||||
|
||||
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
|
||||
|
@ -74,18 +74,18 @@ The defaults are:
|
|||
|
||||
The resulting images are essentially the same as the non-CUDA images:
|
||||
|
||||
1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
|
||||
2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
|
||||
3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
|
||||
1. `local/jarvis.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
|
||||
2. `local/jarvis.cpp:light-cuda`: This image only includes the main executable file.
|
||||
3. `local/jarvis.cpp:server-cuda`: This image only includes the server executable file.
|
||||
|
||||
## Usage
|
||||
|
||||
After building locally, usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
|
||||
|
||||
```bash
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/jarvis.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/jarvis.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/jarvis.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
||||
```
|
||||
|
||||
## Docker With MUSA
|
||||
|
@ -95,9 +95,9 @@ Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/
|
|||
## Building Docker locally
|
||||
|
||||
```bash
|
||||
docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile .
|
||||
docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile .
|
||||
docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile .
|
||||
docker build -t local/jarvis.cpp:full-musa -f .devops/full-musa.Dockerfile .
|
||||
docker build -t local/jarvis.cpp:light-musa -f .devops/jarvis-cli-musa.Dockerfile .
|
||||
docker build -t local/jarvis.cpp:server-musa -f .devops/jarvis-server-musa.Dockerfile .
|
||||
```
|
||||
|
||||
You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture.
|
||||
|
@ -108,16 +108,16 @@ The defaults are:
|
|||
|
||||
The resulting images are essentially the same as the non-MUSA images:
|
||||
|
||||
1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
|
||||
2. `local/llama.cpp:light-musa`: This image only includes the main executable file.
|
||||
3. `local/llama.cpp:server-musa`: This image only includes the server executable file.
|
||||
1. `local/jarvis.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
|
||||
2. `local/jarvis.cpp:light-musa`: This image only includes the main executable file.
|
||||
3. `local/jarvis.cpp:server-musa`: This image only includes the server executable file.
|
||||
|
||||
## Usage
|
||||
|
||||
After building locally, usage is similar to the non-MUSA examples, but you'll need to set `mthreads` as the default Docker runtime. This can be done by executing `(cd /usr/bin/musa && sudo ./docker setup $PWD)` and verifying the change with `docker info | grep mthreads` on the host machine. You will also want to use the `--n-gpu-layers` flag.
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
||||
docker run -v /path/to/models:/models local/jarvis.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run -v /path/to/models:/models local/jarvis.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run -v /path/to/models:/models local/jarvis.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
||||
```
|
||||
|
|
|
@ -1,39 +1,39 @@
|
|||
# Install pre-built version of llama.cpp
|
||||
# Install pre-built version of jarvis.cpp
|
||||
|
||||
## Homebrew
|
||||
|
||||
On Mac and Linux, the homebrew package manager can be used via
|
||||
|
||||
```sh
|
||||
brew install llama.cpp
|
||||
brew install jarvis.cpp
|
||||
```
|
||||
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
|
||||
The formula is automatically updated with new `jarvis.cpp` releases. More info: https://github.com/ggerganov/jarvis.cpp/discussions/7668
|
||||
|
||||
## Nix
|
||||
|
||||
On Mac and Linux, the Nix package manager can be used via
|
||||
|
||||
```sh
|
||||
nix profile install nixpkgs#llama-cpp
|
||||
nix profile install nixpkgs#jarvis-cpp
|
||||
```
|
||||
For flake enabled installs.
|
||||
|
||||
Or
|
||||
|
||||
```sh
|
||||
nix-env --file '<nixpkgs>' --install --attr llama-cpp
|
||||
nix-env --file '<nixpkgs>' --install --attr jarvis-cpp
|
||||
```
|
||||
|
||||
For non-flake enabled installs.
|
||||
|
||||
This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
|
||||
This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/jarvis-cpp/package.nix#L164).
|
||||
|
||||
## Flox
|
||||
|
||||
On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
|
||||
On Mac and Linux, Flox can be used to install jarvis.cpp within a Flox environment via
|
||||
|
||||
```sh
|
||||
flox install llama-cpp
|
||||
flox install jarvis-cpp
|
||||
```
|
||||
|
||||
Flox follows the nixpkgs build of llama.cpp.
|
||||
Flox follows the nixpkgs build of jarvis.cpp.
|
||||
|
|
|
@ -13,10 +13,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
|||
if (EMSCRIPTEN)
|
||||
else()
|
||||
add_subdirectory(cvector-generator)
|
||||
add_subdirectory(baby-llama)
|
||||
add_subdirectory(baby-jarvis)
|
||||
add_subdirectory(batched-bench)
|
||||
add_subdirectory(batched)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(convert-jarvis2c-to-ggml)
|
||||
add_subdirectory(embedding)
|
||||
add_subdirectory(eval-callback)
|
||||
add_subdirectory(export-lora)
|
||||
|
@ -27,7 +27,7 @@ else()
|
|||
add_subdirectory(gritlm)
|
||||
add_subdirectory(imatrix)
|
||||
add_subdirectory(infill)
|
||||
add_subdirectory(llama-bench)
|
||||
add_subdirectory(jarvis-bench)
|
||||
add_subdirectory(llava)
|
||||
add_subdirectory(lookahead)
|
||||
add_subdirectory(lookup)
|
||||
|
@ -41,7 +41,7 @@ else()
|
|||
if (GGML_RPC)
|
||||
add_subdirectory(rpc)
|
||||
endif()
|
||||
if (LLAMA_BUILD_SERVER)
|
||||
if (JARVIS_BUILD_SERVER)
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
if (GGML_SYCL)
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
set -e
|
||||
|
||||
AI_NAME="${AI_NAME:-Miku}"
|
||||
MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
|
||||
MODEL="${MODEL:-./models/jarvis-2-7b-chat.ggmlv3.q4_K_M.bin}"
|
||||
USER_NAME="${USER_NAME:-Anon}"
|
||||
|
||||
# Uncomment and adjust to the number of CPU cores you want to use.
|
||||
|
@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
|
|||
GEN_OPTIONS+=(--threads "$N_THREAD")
|
||||
fi
|
||||
|
||||
./llama-cli "${GEN_OPTIONS[@]}" \
|
||||
./jarvis-cli "${GEN_OPTIONS[@]}" \
|
||||
--model "$MODEL" \
|
||||
--in-prefix " " \
|
||||
--in-suffix "${AI_NAME}:" \
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
set(TARGET llama-baby-llama)
|
||||
add_executable(${TARGET} baby-llama.cpp)
|
||||
set(TARGET jarvis-baby-jarvis)
|
||||
add_executable(${TARGET} baby-jarvis.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
|
|
@ -11,8 +11,8 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#ifdef LLAMA_DEFAULT_RMS_EPS
constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
#ifdef JARVIS_DEFAULT_RMS_EPS
constexpr float rms_norm_eps = JARVIS_DEFAULT_RMS_EPS;
#else
constexpr float rms_norm_eps = 5e-6f;
#endif

@ -71,7 +71,7 @@ static struct ggml_tensor * randomize_tensor(
return tensor;
}

struct llama_hparams {
struct jarvis_hparams {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input?
uint32_t n_embd = 4096;

@ -80,17 +80,17 @@ struct llama_hparams {
uint32_t n_layer = 32;
uint32_t n_rot = 64;

bool operator!=(const llama_hparams & other) const {
return memcmp(this, &other, sizeof(llama_hparams));
bool operator!=(const jarvis_hparams & other) const {
return memcmp(this, &other, sizeof(jarvis_hparams));
}
};

static uint32_t get_n_ff(const struct llama_hparams* hparams) {
static uint32_t get_n_ff(const struct jarvis_hparams* hparams) {
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
return n_ff;
}

struct llama_hparams_lora {
struct jarvis_hparams_lora {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input?
uint32_t n_embd = 4096;

@ -100,12 +100,12 @@ struct llama_hparams_lora {
uint32_t n_rot = 64;
uint32_t n_lora = 64;

bool operator!=(const llama_hparams_lora & other) const {
return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0;
bool operator!=(const jarvis_hparams_lora & other) const {
return memcmp(this, &other, sizeof(jarvis_hparams_lora)) != 0;
}
};

struct llama_layer {
struct jarvis_layer {
// normalization
struct ggml_tensor * attention_norm;

@ -124,7 +124,7 @@ struct llama_layer {
struct ggml_tensor * w3;
};

struct llama_layer_lora {
struct jarvis_layer_lora {
// normalization
struct ggml_tensor * attention_norm;

@ -148,34 +148,34 @@ struct llama_layer_lora {
};


struct llama_kv_cache {
struct jarvis_kv_cache {
struct ggml_context * ctx = NULL;

struct ggml_tensor * k;
struct ggml_tensor * v;

// llama_ctx_buffer buf;
// jarvis_ctx_buffer buf;

int n; // number of tokens currently in the cache
};

struct llama_model {
struct jarvis_model {
struct ggml_context * ctx = NULL;

llama_hparams hparams;
jarvis_hparams hparams;

struct ggml_tensor * tok_embeddings;

struct ggml_tensor * norm;
struct ggml_tensor * output;

std::vector<llama_layer> layers;
std::vector<jarvis_layer> layers;
};

struct llama_model_lora {
struct jarvis_model_lora {
struct ggml_context * ctx = NULL;

llama_hparams_lora hparams;
jarvis_hparams_lora hparams;

struct ggml_tensor * tok_embeddings;

@ -183,10 +183,10 @@ struct llama_model_lora {
struct ggml_tensor * outputa;
struct ggml_tensor * outputb;

std::vector<llama_layer_lora> layers;
std::vector<jarvis_layer_lora> layers;
};

static void init_model(struct llama_model * model) {
static void init_model(struct jarvis_model * model) {
const auto & hparams = model->hparams;

const uint32_t n_embd = hparams.n_embd;

@ -223,7 +223,7 @@ static void init_model(struct llama_model * model) {
}


static void init_model_lora(struct llama_model_lora * model) {
static void init_model_lora(struct jarvis_model_lora * model) {
const auto & hparams = model->hparams;

const uint32_t n_embd = hparams.n_embd;

@ -266,7 +266,7 @@ static void init_model_lora(struct llama_model_lora * model) {
}
}

static void set_param_model(struct llama_model * model) {
static void set_param_model(struct jarvis_model * model) {
const auto& hparams = model->hparams;

const uint32_t n_layer = hparams.n_layer;

@ -292,7 +292,7 @@ static void set_param_model(struct llama_model * model) {
}
}

static void set_param_model_lora(struct llama_model_lora * model) {
static void set_param_model_lora(struct jarvis_model_lora * model) {
const auto& hparams = model->hparams;

const uint32_t n_layer = hparams.n_layer;

@ -323,7 +323,7 @@ static void set_param_model_lora(struct llama_model_lora * model) {
}
}

static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
static void randomize_model(struct jarvis_model * model, int seed, float mean, float std, float min, float max) {
const auto & hparams = model->hparams;

const uint32_t n_layer = hparams.n_layer;

@ -355,7 +355,7 @@ static void randomize_model(struct llama_model * model, int seed, float mean, fl


static void randomize_model_lora(
struct llama_model_lora * model, int seed, float mean, float std, float min, float max
struct jarvis_model_lora * model, int seed, float mean, float std, float min, float max
) {
const auto & hparams = model->hparams;

@ -391,7 +391,7 @@ static void randomize_model_lora(
free_random_normal_distribution(rnd);
}

static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
static void init_kv_cache(struct jarvis_kv_cache* cache, struct jarvis_model * model, int n_batch) {
const auto & hparams = model->hparams;

const uint32_t n_ctx = hparams.n_ctx;

@ -425,7 +425,7 @@ static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * mod
cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
}

static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
static bool init_kv_cache_lora(struct jarvis_kv_cache* cache, struct jarvis_model_lora * model, int n_batch) {
const auto & hparams = model->hparams;

const uint32_t n_ctx = hparams.n_ctx;

@ -462,8 +462,8 @@ static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_
}

static struct ggml_tensor * forward(
struct llama_model * model,
struct llama_kv_cache * cache,
struct jarvis_model * model,
struct jarvis_kv_cache * cache,
struct ggml_context * ctx0,
struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input,

@ -472,7 +472,7 @@ static struct ggml_tensor * forward(
) {
const int N = n_tokens;

struct llama_kv_cache& kv_self = *cache;
struct jarvis_kv_cache& kv_self = *cache;
const auto & hparams = model->hparams;
const int n_ctx = hparams.n_ctx;
const int n_embd = hparams.n_embd;

@ -692,8 +692,8 @@ static struct ggml_tensor * forward(
}

static struct ggml_tensor * forward_batch(
struct llama_model * model,
struct llama_kv_cache * cache,
struct jarvis_model * model,
struct jarvis_kv_cache * cache,
struct ggml_context * ctx0,
struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input,

@ -703,7 +703,7 @@ static struct ggml_tensor * forward_batch(
) {
const int N = n_tokens;

struct llama_kv_cache& kv_self = *cache;
struct jarvis_kv_cache& kv_self = *cache;
const auto & hparams = model->hparams;
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;

@ -989,8 +989,8 @@ static struct ggml_tensor * forward_batch(
}

static struct ggml_tensor * forward_lora(
struct llama_model_lora * model,
struct llama_kv_cache * cache,
struct jarvis_model_lora * model,
struct jarvis_kv_cache * cache,
struct ggml_context * ctx0,
struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input,

@ -999,7 +999,7 @@ static struct ggml_tensor * forward_lora(
) {
const int N = n_tokens;

struct llama_kv_cache& kv_self = *cache;
struct jarvis_kv_cache& kv_self = *cache;
const auto & hparams = model->hparams;

const int n_ctx = hparams.n_ctx;

@ -1444,7 +1444,7 @@ int main(int argc, char ** argv) {
lcparams.mem_buffer = NULL;
lcparams.no_alloc = false;

struct llama_model model;
struct jarvis_model model;
model.hparams.n_vocab = 8;
model.hparams.n_ctx = 8;
model.hparams.n_embd = 32;

@ -1467,7 +1467,7 @@ int main(int argc, char ** argv) {
randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f);

/*
struct llama_model_lora model_lora;
struct jarvis_model_lora model_lora;
// model.hparams.n_vocab = 6;
// model.hparams.n_ctx = 64;
// model.hparams.n_embd = 128;

@ -1501,7 +1501,7 @@ int main(int argc, char ** argv) {
*/
int n_batch = 8;
// key + value cache for the self attention
struct llama_kv_cache kv_self;
struct jarvis_kv_cache kv_self;
printf("init_kv_cache\n");
kv_self.ctx = model.ctx;
init_kv_cache(&kv_self, &model, n_batch);

@ -1533,7 +1533,7 @@ int main(int argc, char ** argv) {
int n_past = 0;

struct ggml_cgraph * gf = NULL;
gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
gf = ggml_new_graph_custom(ctx0, JARVIS_TRAIN_MAX_NODES, true);

get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);

@ -1601,7 +1601,7 @@ int main(int argc, char ** argv) {
struct ggml_context * ctx0 = ggml_init(params);

struct ggml_cgraph * gf = NULL;
gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
gf = ggml_new_graph_custom(ctx0, JARVIS_TRAIN_MAX_NODES, true);

int n_past = 0;
struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);

@ -5,7 +5,7 @@
#
# Usage:
#
# cd llama.cpp
# cd jarvis.cpp
# make -j
#
# ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]

@ -21,7 +21,7 @@ if [ $# -gt 2 ]; then
eargs="${@:3}"
fi

ftmp="__llama.cpp_example_tmp__.txt"
ftmp="__jarvis.cpp_example_tmp__.txt"
trap "rm -f $ftmp" EXIT

echo "Translate from English to French:

@ -58,4 +58,4 @@ echo "$2
model=$1

# generate the most likely continuation until the string "===" is found
./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
./jarvis-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs

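For clarity, a hypothetical invocation of the translation helper shown in the hunk above, following its own usage comment; the model path and prompt text are made-up values, not part of this diff.

```sh
cd jarvis.cpp
make -j
./examples/base-translate.sh ./models/model-base.gguf "How are you today?"   # hypothetical model path
```
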
@ -1,5 +1,5 @@
set(TARGET llama-batched-bench)
set(TARGET jarvis-batched-bench)
add_executable(${TARGET} batched-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

@ -1,6 +1,6 @@
# llama.cpp/example/batched-bench
# jarvis.cpp/example/batched-bench

Benchmark the batched decoding performance of `llama.cpp`
Benchmark the batched decoding performance of `jarvis.cpp`

## Usage

@ -10,16 +10,16 @@ There are 2 modes of operation:
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)

```bash
./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
./jarvis-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]

# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99

# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps

# custom set of batches
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
```

## Sample results

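As a quick check of the shared-prompt bound `N_KV = PP + B*TG` quoted in the README hunk above, a small worked example; the PP/B/TG values are illustrative and not taken from this diff.

```sh
PP=512; B=32; TG=128              # shared prompt of 512 tokens, 32 parallel sequences, 128 generated tokens each
echo "N_KV = $(( PP + B * TG ))"  # prints N_KV = 4608; this total must fit within the -c context size
```
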
@ -1,7 +1,7 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "jarvis.h"

#include <algorithm>
#include <cstdio>

@ -17,7 +17,7 @@ static void print_usage(int, char ** argv) {
int main(int argc, char ** argv) {
common_params params;

if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_BENCH, print_usage)) {
return 1;
}

@ -31,42 +31,42 @@ int main(int argc, char ** argv) {

// init LLM

llama_backend_init();
llama_numa_init(params.numa);
jarvis_backend_init();
jarvis_numa_init(params.numa);

// initialize the model

llama_model_params model_params = common_model_params_to_llama(params);
jarvis_model_params model_params = common_model_params_to_jarvis(params);

llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
jarvis_model * model = jarvis_load_model_from_file(params.model.c_str(), model_params);

if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1;
}

llama_context_params ctx_params = common_context_params_to_llama(params);
jarvis_context_params ctx_params = common_context_params_to_jarvis(params);

// ensure enough sequences are available
ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());

llama_context * ctx = llama_new_context_with_model(model, ctx_params);
jarvis_context * ctx = jarvis_new_context_with_model(model, ctx_params);

if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
fprintf(stderr , "%s: error: failed to create the jarvis_context\n" , __func__);
return 1;
}

const int32_t n_kv_max = llama_n_ctx(ctx);
const int32_t n_kv_max = jarvis_n_ctx(ctx);

llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
jarvis_batch batch = jarvis_batch_init(n_kv_max, 0, 1);

// decode in batches of ctx_params.n_batch tokens
auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
auto decode_helper = [](jarvis_context * ctx, jarvis_batch & batch, int32_t n_batch) {
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

llama_batch batch_view = {
jarvis_batch batch_view = {
n_tokens,
batch.token + i,
nullptr,

@ -76,13 +76,13 @@ int main(int argc, char ** argv) {
batch.logits + i,
};

const int ret = llama_decode(ctx, batch_view);
const int ret = jarvis_decode(ctx, batch_view);
if (ret != 0) {
LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false;
}

llama_synchronize(ctx);
jarvis_synchronize(ctx);
}

return true;

@ -95,7 +95,7 @@ int main(int argc, char ** argv) {
}

if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: jarvis_decode() failed\n", __func__);
return 1;
}
}

@ -132,16 +132,16 @@ int main(int argc, char ** argv) {

const auto t_pp_start = ggml_time_us();

llama_kv_cache_clear(ctx);
jarvis_kv_cache_clear(ctx);

if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: jarvis_decode() failed\n", __func__);
return 1;
}

if (is_pp_shared) {
for (int32_t i = 1; i < pl; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
jarvis_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
}

@ -157,7 +157,7 @@ int main(int argc, char ** argv) {
}

if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: jarvis_decode() failed\n", __func__);
return 1;
}
}

@ -189,14 +189,14 @@ int main(int argc, char ** argv) {
}

LOG("\n");
llama_perf_context_print(ctx);
jarvis_perf_context_print(ctx);

llama_batch_free(batch);
jarvis_batch_free(batch);

llama_free(ctx);
llama_free_model(model);
jarvis_free(ctx);
jarvis_free_model(model);

llama_backend_free();
jarvis_backend_free();

LOG("\n\n");

@ -1,6 +1,6 @@
.PHONY: build

build:
xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
rm -f ./llama-batched-swift
ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift
xcodebuild -scheme jarvis-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
rm -f ./jarvis-batched-swift
ln -s ./build/Build/Products/Debug/jarvis-batched-swift ./jarvis-batched-swift

@ -4,17 +4,17 @@
import PackageDescription

let package = Package(
name: "llama-batched-swift",
name: "jarvis-batched-swift",
platforms: [.macOS(.v12)],
dependencies: [
.package(name: "llama", path: "../../"),
.package(name: "jarvis", path: "../../"),
],
targets: [
// Targets are the basic building blocks of a package, defining a module or a test suite.
// Targets can depend on other targets in this package and products from dependencies.
.executableTarget(
name: "llama-batched-swift",
dependencies: ["llama"],
name: "jarvis-batched-swift",
dependencies: ["jarvis"],
path: "Sources",
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
),

Some files were not shown because too many files have changed in this diff.