Changed "llama" to "jarvis"
This commit is contained in:
parent 4dfbcf9646
commit 52ab617954
372 changed files with 8788 additions and 8788 deletions
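The diff below is a mechanical, repository-wide rename of "llama" to "jarvis" across build scripts, Dockerfiles, packaging specs, Nix expressions, and CI workflows. A change of this size is normally scripted rather than edited by hand; the following sketch shows one way such a rename could be driven with grep and sed. It is an illustration only, and the case variants handled and the .git exclusion are assumptions, not details taken from the commit.

#!/bin/bash
# Illustrative helper (not part of this commit): rename llama -> jarvis in a working tree.
set -euo pipefail

# Rewrite file contents, skipping the .git directory and binary files.
grep -rIl --exclude-dir=.git -e llama -e Llama -e LLAMA . | while read -r f; do
    sed -i 's/llama/jarvis/g; s/Llama/Jarvis/g; s/LLAMA/JARVIS/g' "$f"
done

# Rename files and directories whose names contain "llama", deepest paths first.
find . -depth -not -path './.git/*' -name '*llama*' | while read -r p; do
    mv "$p" "$(dirname "$p")/$(basename "$p" | sed 's/llama/jarvis/g')"
done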
@@ -7,16 +7,16 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
 checkout scm // Clone the repo on Runner
 }
 }
-stage('Compiling llama.cpp'){
+stage('Compiling jarvis.cpp'){
 sh'''#!/bin/bash
-make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
+make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling jarvis for RISC-V
 '''
 }
-stage('Running llama.cpp'){
+stage('Running jarvis.cpp'){
 sh'''#!/bin/bash
 module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
-qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
-cat llama_log.txt # Printing results
+qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./jarvis-cli -m /home/alitariq/codejarvis-7b.Q4_K_M.gguf -p "Anything" -n 9 > jarvis_log.txt # Running jarvis.cpp on vector qemu-riscv64
+cat jarvis_log.txt # Printing results
 '''
 }
 }

@@ -26,7 +26,7 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
 export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
 fi && \
-cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+cmake -B build -DGGML_CUDA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
 cmake --build build --config Release -j$(nproc) && \
 cp build/bin/* .

@@ -19,7 +19,7 @@ WORKDIR /app

 COPY . .

-RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+RUN cmake -B build -DGGML_MUSA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
 cmake --build build --config Release -j$(nproc) && \
 cp build/bin/* .

@@ -9,7 +9,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# List from https://github.com/ggerganov/jarvis.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 ARG ROCM_DOCKER_ARCH="\
 gfx803 \
@@ -41,7 +41,7 @@ ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

 # Enable cURL
-ENV LLAMA_CURL=1
+ENV JARVIS_CURL=1
 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev

@@ -15,7 +15,7 @@ WORKDIR /app

 COPY . .

-ENV LLAMA_CURL=1
+ENV JARVIS_CURL=1


 RUN make -j$(nproc)

@@ -23,11 +23,11 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 RUN echo "Building with static libs" && \
 source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
 cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
-cmake --build build --config Release --target llama-cli
+cmake --build build --config Release --target jarvis-cli

 # TODO: use image with NNRT
 FROM cosdt/cann:$ASCEND_VERSION AS runtime
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli

 ENV LC_ALL=C.utf8

@@ -41,4 +41,4 @@ ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
 ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
 ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}

-ENTRYPOINT ["/llama-cli" ]
+ENTRYPOINT ["/jarvis-cli" ]

@@ -23,7 +23,7 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
 export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
 fi && \
 cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-cmake --build build --config Release --target llama-cli -j$(nproc)
+cmake --build build --config Release --target jarvis-cli -j$(nproc)

 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

@@ -31,7 +31,7 @@ RUN apt-get update && \
 apt-get install -y libgomp1

 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/src/libjarvis.so /libjarvis.so
+COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli

-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

@@ -17,12 +17,12 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
 echo "Building with static libs" && \
 cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
 ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
-cmake --build build --config Release --target llama-cli
+cmake --build build --config Release --target jarvis-cli

 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

@@ -16,7 +16,7 @@ WORKDIR /app
 COPY . .

 RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-cmake --build build --config Release --target llama-cli -j$(nproc)
+cmake --build build --config Release --target jarvis-cli -j$(nproc)

 FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime

@@ -24,7 +24,7 @@ RUN apt-get update && \
 apt-get install -y libgomp1

 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/build/src/libjarvis.so /libjarvis.so
+COPY --from=build /app/build/bin/jarvis-cli /jarvis-cli

-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

@@ -9,7 +9,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# List from https://github.com/ggerganov/jarvis.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 ARG ROCM_DOCKER_ARCH="\
 gfx803 \
@@ -40,6 +40,6 @@ ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

-RUN make -j$(nproc) llama-cli
+RUN make -j$(nproc) jarvis-cli

-ENTRYPOINT [ "/app/llama-cli" ]
+ENTRYPOINT [ "/app/jarvis-cli" ]

@@ -15,13 +15,13 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 WORKDIR /app
 COPY . .
 RUN cmake -B build -DGGML_VULKAN=1 && \
-cmake --build build --config Release --target llama-cli
+cmake --build build --config Release --target jarvis-cli

 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/llama-cli /llama-cli && \
+RUN cp /app/build/bin/jarvis-cli /jarvis-cli && \
 rm -rf /app

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

@@ -9,15 +9,15 @@ WORKDIR /app

 COPY . .

-RUN make -j$(nproc) llama-cli
+RUN make -j$(nproc) jarvis-cli

 FROM ubuntu:$UBUNTU_VERSION AS runtime

 RUN apt-get update && \
 apt-get install -y libgomp1

-COPY --from=build /app/llama-cli /llama-cli
+COPY --from=build /app/jarvis-cli /jarvis-cli

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/llama-cli" ]
+ENTRYPOINT [ "/jarvis-cli" ]

@@ -3,7 +3,7 @@
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

-# Notes for llama.cpp:
+# Notes for jarvis.cpp:
 # 1. Tags are currently based on hash - which will not sort asciibetically.
 # We need to declare standard versioning if people want to sort latest releases.
 # 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
@@ -12,44 +12,44 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 # It is up to the user to install the correct vendor-specific support.

-Name: llama.cpp-cuda
+Name: jarvis.cpp-cuda
 Version: %( date "+%%Y%%m%%d" )
 Release: 1%{?dist}
 Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
 License: MIT
-Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+Source0: https://github.com/ggerganov/jarvis.cpp/archive/refs/heads/master.tar.gz
 BuildRequires: coreutils make gcc-c++ git cuda-toolkit
 Requires: cuda-toolkit
-URL: https://github.com/ggerganov/llama.cpp
+URL: https://github.com/ggerganov/jarvis.cpp

 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0

 %description
-CPU inference for Meta's Lllama2 models using default options.
+CPU inference for Meta's Ljarvis2 models using default options.

 %prep
-%setup -n llama.cpp-master
+%setup -n jarvis.cpp-master

 %build
 make -j GGML_CUDA=1

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
-cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
+cp -p jarvis-cli %{buildroot}%{_bindir}/jarvis-cuda-cli
+cp -p jarvis-server %{buildroot}%{_bindir}/jarvis-cuda-server
+cp -p jarvis-simple %{buildroot}%{_bindir}/jarvis-cuda-simple

 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/jarviscuda.service
 [Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
+Description=Jarvis.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target

 [Service]
 Type=simple
-EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
+EnvironmentFile=/etc/sysconfig/jarvis
+ExecStart=/usr/bin/jarvis-cuda-server $JARVIS_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -58,8 +58,8 @@ WantedBy=default.target
 EOF

 mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/jarvis
+JARVIS_ARGS="-m /opt/jarvis2/ggml-model-f32.bin"
 EOF

 %clean
@@ -67,11 +67,11 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llama-cuda-cli
-%{_bindir}/llama-cuda-server
-%{_bindir}/llama-cuda-simple
-/usr/lib/systemd/system/llamacuda.service
-%config /etc/sysconfig/llama
+%{_bindir}/jarvis-cuda-cli
+%{_bindir}/jarvis-cuda-server
+%{_bindir}/jarvis-cuda-simple
+/usr/lib/systemd/system/jarviscuda.service
+%config /etc/sysconfig/jarvis

 %pre

@@ -3,7 +3,7 @@
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

-# Notes for llama.cpp:
+# Notes for jarvis.cpp:
 # 1. Tags are currently based on hash - which will not sort asciibetically.
 # We need to declare standard versioning if people want to sort latest releases.
 # In the meantime, YYYYMMDD format will be used.
@@ -13,45 +13,45 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 # It is up to the user to install the correct vendor-specific support.

-Name: llama.cpp
+Name: jarvis.cpp
 Version: %( date "+%%Y%%m%%d" )
 Release: 1%{?dist}
 Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
 License: MIT
-Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+Source0: https://github.com/ggerganov/jarvis.cpp/archive/refs/heads/master.tar.gz
 BuildRequires: coreutils make gcc-c++ git libstdc++-devel
 Requires: libstdc++
-URL: https://github.com/ggerganov/llama.cpp
+URL: https://github.com/ggerganov/jarvis.cpp

 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0

 %description
-CPU inference for Meta's Lllama2 models using default options.
+CPU inference for Meta's Ljarvis2 models using default options.
 Models are not included in this package and must be downloaded separately.

 %prep
-%setup -n llama.cpp-master
+%setup -n jarvis.cpp-master

 %build
 make -j

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
-cp -p llama-server %{buildroot}%{_bindir}/llama-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
+cp -p jarvis-cli %{buildroot}%{_bindir}/jarvis-cli
+cp -p jarvis-server %{buildroot}%{_bindir}/jarvis-server
+cp -p jarvis-simple %{buildroot}%{_bindir}/jarvis-simple

 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/jarvis.service
 [Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
+Description=Jarvis.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target

 [Service]
 Type=simple
-EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llama-server $LLAMA_ARGS
+EnvironmentFile=/etc/sysconfig/jarvis
+ExecStart=/usr/bin/jarvis-server $JARVIS_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -60,8 +60,8 @@ WantedBy=default.target
 EOF

 mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/jarvis
+JARVIS_ARGS="-m /opt/jarvis2/ggml-model-f32.bin"
 EOF

 %clean
@@ -69,11 +69,11 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llama-cli
-%{_bindir}/llama-server
-%{_bindir}/llama-simple
-/usr/lib/systemd/system/llama.service
-%config /etc/sysconfig/llama
+%{_bindir}/jarvis-cli
+%{_bindir}/jarvis-server
+%{_bindir}/jarvis-simple
+/usr/lib/systemd/system/jarvis.service
+%config /etc/sysconfig/jarvis

 %pre

@@ -22,8 +22,8 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
 export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
 fi && \
-cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-cmake --build build --config Release --target llama-server -j$(nproc)
+cmake -B build -DGGML_CUDA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+cmake --build build --config Release --target jarvis-server -j$(nproc)

 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

@@ -31,12 +31,12 @@ RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev libgomp1 curl

 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
-COPY --from=build /app/build/bin/llama-server /llama-server
+COPY --from=build /app/build/src/libjarvis.so /libjarvis.so
+COPY --from=build /app/build/bin/jarvis-server /jarvis-server

 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

@@ -15,20 +15,20 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
 export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
 fi && \
 echo "Building with dynamic libs" && \
-cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
-cmake --build build --config Release --target llama-server
+cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DJARVIS_CURL=ON ${OPT_SYCL_F16} && \
+cmake --build build --config Release --target jarvis-server

 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev curl

-COPY --from=build /app/build/bin/llama-server /llama-server
+COPY --from=build /app/build/bin/jarvis-server /jarvis-server

 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

@@ -15,8 +15,8 @@ WORKDIR /app

 COPY . .

-RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-cmake --build build --config Release --target llama-server -j$(nproc)
+RUN cmake -B build -DGGML_MUSA=ON -DJARVIS_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+cmake --build build --config Release --target jarvis-server -j$(nproc)

 FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime

@@ -24,12 +24,12 @@ RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev libgomp1 curl

 COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
-COPY --from=build /app/build/bin/llama-server /llama-server
+COPY --from=build /app/build/src/libjarvis.so /libjarvis.so
+COPY --from=build /app/build/bin/jarvis-server /jarvis-server

 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

@@ -9,7 +9,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# List from https://github.com/ggerganov/jarvis.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 ARG ROCM_DOCKER_ARCH="\
 gfx803 \
@@ -40,15 +40,15 @@ ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0

 # Enable cURL
-ENV LLAMA_CURL=1
+ENV JARVIS_CURL=1
 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev curl

-RUN make -j$(nproc) llama-server
+RUN make -j$(nproc) jarvis-server

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

-ENTRYPOINT [ "/app/llama-server" ]
+ENTRYPOINT [ "/app/jarvis-server" ]

@@ -14,18 +14,18 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
-cmake --build build --config Release --target llama-server
+RUN cmake -B build -DGGML_VULKAN=1 -DJARVIS_CURL=1 && \
+cmake --build build --config Release --target jarvis-server

 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/llama-server /llama-server && \
+RUN cp /app/build/bin/jarvis-server /jarvis-server && \
 rm -rf /app

 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

@@ -9,21 +9,21 @@ WORKDIR /app

 COPY . .

-ENV LLAMA_CURL=1
+ENV JARVIS_CURL=1

-RUN make -j$(nproc) llama-server
+RUN make -j$(nproc) jarvis-server

 FROM ubuntu:$UBUNTU_VERSION AS runtime

 RUN apt-get update && \
 apt-get install -y libcurl4-openssl-dev libgomp1 curl

-COPY --from=build /app/llama-server /llama-server
+COPY --from=build /app/jarvis-server /jarvis-server

 ENV LC_ALL=C.utf8
 # Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
+ENV JARVIS_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

-ENTRYPOINT [ "/llama-server" ]
+ENTRYPOINT [ "/jarvis-server" ]

@@ -6,10 +6,10 @@
 let
 inherit (config.packages) default;
 binaries = [
-"llama-cli"
-"llama-embedding"
-"llama-server"
-"llama-quantize"
+"jarvis-cli"
+"jarvis-embedding"
+"jarvis-server"
+"jarvis-quantize"
 ];
 mkApp = name: {
 type = "app";

@@ -2,14 +2,14 @@
 lib,
 dockerTools,
 buildEnv,
-llama-cpp,
+jarvis-cpp,
 interactive ? true,
 coreutils,
 }:

 # A tar that can be fed into `docker load`:
 #
-# $ nix build .#llamaPackages.docker
+# $ nix build .#jarvisPackages.docker
 # $ docker load < result

 # For details and variations cf.
@@ -19,16 +19,16 @@

 # Approximate (compressed) sizes, at the time of writing, are:
 #
-# .#llamaPackages.docker: 125M;
-# .#llamaPackagesCuda.docker: 537M;
-# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.
+# .#jarvisPackages.docker: 125M;
+# .#jarvisPackagesCuda.docker: 537M;
+# .#legacyPackages.aarch64-linux.jarvisPackagesXavier.docker: 415M.

 dockerTools.buildLayeredImage {
-name = llama-cpp.pname;
+name = jarvis-cpp.pname;
 tag = "latest";

 contents =
-[ llama-cpp ]
+[ jarvis-cpp ]
 ++ lib.optionals interactive [
 coreutils
 dockerTools.binSh

@@ -11,10 +11,10 @@
 {
 legacyPackages =
 let
-caps.llamaPackagesXavier = "7.2";
-caps.llamaPackagesOrin = "8.7";
-caps.llamaPackagesTX2 = "6.2";
-caps.llamaPackagesNano = "5.3";
+caps.jarvisPackagesXavier = "7.2";
+caps.jarvisPackagesOrin = "8.7";
+caps.jarvisPackagesTX2 = "6.2";
+caps.jarvisPackagesNano = "5.3";

 pkgsFor =
 cap:
@@ -31,9 +31,9 @@
 builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;

 packages = lib.optionalAttrs (system == "aarch64-linux") {
-jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
-jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
-jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
+jetson-xavier = config.legacyPackages.jarvisPackagesXavier.jarvis-cpp;
+jetson-orin = config.legacyPackages.jarvisPackagesOrin.jarvis-cpp;
+jetson-nano = config.legacyPackages.jarvisPackagesNano.jarvis-cpp;
 };
 };
 }

@@ -1,6 +1,6 @@
 {
 lib,
-llamaVersion,
+jarvisVersion,
 numpy,
 tqdm,
 sentencepiece,
@@ -12,7 +12,7 @@

 buildPythonPackage {
 pname = "gguf";
-version = llamaVersion;
+version = jarvisVersion;
 pyproject = true;
 nativeBuildInputs = [ poetry-core ];
 propagatedBuildInputs = [

@@ -33,7 +33,7 @@
 useRocm ? config.rocmSupport,
 enableCurl ? true,
 useVulkan ? false,
-llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
+jarvisVersion ? "0.0.0", # Arbitrary version, substituted by the flake

 # It's necessary to consistently use backendStdenv when building with CUDA support,
 # otherwise we get libstdc++ errors downstream.
@@ -103,8 +103,8 @@ let
 in

 effectiveStdenv.mkDerivation (finalAttrs: {
-pname = "llama-cpp${pnameSuffix}";
-version = llamaVersion;
+pname = "jarvis-cpp${pnameSuffix}";
+version = jarvisVersion;

 # Note: none of the files discarded here are visible in the sandbox or
 # affect the output hash. This also means they can be modified without
@@ -132,12 +132,12 @@ effectiveStdenv.mkDerivation (finalAttrs: {
 --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
 '';

-# With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
+# With PR#6015 https://github.com/ggerganov/jarvis.cpp/pull/6015,
 # `default.metallib` may be compiled with Metal compiler from XCode
 # and we need to escape sandbox on MacOS to access Metal compiler.
 # `xcrun` is used find the path of the Metal compiler, which is varible
 # and not on $PATH
-# see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
+# see https://github.com/ggerganov/jarvis.cpp/pull/6118 for discussion
 __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;

 nativeBuildInputs =
@@ -166,10 +166,10 @@ effectiveStdenv.mkDerivation (finalAttrs: {

 cmakeFlags =
 [
-(cmakeBool "LLAMA_BUILD_SERVER" true)
+(cmakeBool "JARVIS_BUILD_SERVER" true)
 (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
 (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-(cmakeBool "LLAMA_CURL" enableCurl)
+(cmakeBool "JARVIS_CURL" enableCurl)
 (cmakeBool "GGML_NATIVE" false)
 (cmakeBool "GGML_BLAS" useBlas)
 (cmakeBool "GGML_CUDA" useCuda)
@@ -205,7 +205,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
 # if they haven't been added yet.
 postInstall = ''
 mkdir -p $out/include
-cp $src/include/llama.h $out/include/
+cp $src/include/jarvis.h $out/include/
 '';

 meta = {
@@ -219,11 +219,11 @@ effectiveStdenv.mkDerivation (finalAttrs: {
 broken = (useMetalKit && !effectiveStdenv.isDarwin);

 description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-homepage = "https://github.com/ggerganov/llama.cpp/";
+homepage = "https://github.com/ggerganov/jarvis.cpp/";
 license = lib.licenses.mit;

 # Accommodates `nix run` and `lib.getExe`
-mainProgram = "llama-cli";
+mainProgram = "jarvis-cli";

 # These people might respond, on the best effort basis, if you ping them
 # in case of Nix-specific regressions or for reviewing Nix-specific PRs.

@@ -9,7 +9,7 @@
 }@inputs:

 let
-llama-python-deps = with python3Packages; [
+jarvis-python-deps = with python3Packages; [
 numpy
 sentencepiece
 transformers
@@ -18,7 +18,7 @@ let
 gguf-py
 tqdm

-# for scripts/compare-llama-bench.py
+# for scripts/compare-jarvis-bench.py
 gitpython
 tabulate

@@ -28,7 +28,7 @@ let

 ];

-llama-python-test-deps = with python3Packages; [
+jarvis-python-test-deps = with python3Packages; [
 # Server bench
 matplotlib

@@ -40,7 +40,7 @@ let
 in

 buildPythonPackage ({
-pname = "llama-scripts";
+pname = "jarvis-scripts";
 version = "0.0.0";
 pyproject = true;

@@ -61,6 +61,6 @@ buildPythonPackage ({
 src = lib.cleanSource ../../.;
 };
 nativeBuildInputs = [ poetry-core ];
-nativeCheckInputs = llama-python-test-deps;
-dependencies = llama-python-deps;
+nativeCheckInputs = jarvis-python-test-deps;
+dependencies = jarvis-python-deps;
 })

@@ -2,7 +2,7 @@
 lib,
 newScope,
 python3,
-llamaVersion ? "0.0.0",
+jarvisVersion ? "0.0.0",
 }:

 let
@@ -21,7 +21,7 @@ in
 # Cf. https://noogle.dev/f/lib/makeScope

 lib.makeScope newScope (self: {
-inherit llamaVersion;
+inherit jarvisVersion;
 gguf-py = self.callPackage ./package-gguf-py.nix {
 inherit
 buildPythonPackage
@@ -34,7 +34,7 @@ lib.makeScope newScope (self: {
 ;
 };
 python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
-llama-cpp = self.callPackage ./package.nix { };
+jarvis-cpp = self.callPackage ./package.nix { };
 docker = self.callPackage ./docker.nix { };
 docker-min = self.callPackage ./docker.nix { interactive = false; };
 sif = self.callPackage ./sif.nix { };

@@ -1,7 +1,7 @@
 {
 lib,
 singularity-tools,
-llama-cpp,
+jarvis-cpp,
 bashInteractive,
 interactive ? false,
 }:
@@ -10,8 +10,8 @@ let
 optionalInt = cond: x: if cond then x else 0;
 in
 singularity-tools.buildImage rec {
-inherit (llama-cpp) name;
-contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];
+inherit (jarvis-cpp) name;
+contents = [ jarvis-cpp ] ++ lib.optionals interactive [ bashInteractive ];

 # These are excessive (but safe) for most variants. Building singularity
 # images requires superuser privileges, so we build them inside a VM in a
@@ -22,6 +22,6 @@ singularity-tools.buildImage rec {
 # Expected image sizes:
 # - cpu/blas: 150M,
 # - cuda, all gencodes: 560M,
-diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
+diskSize = 4096 + optionalInt jarvis-cpp.useRocm 16384;
 memSize = diskSize;
 }

@@ -10,9 +10,9 @@ shift
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
 python3 ./convert_hf_to_gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-./llama-quantize "$@"
+./jarvis-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-./llama-cli "$@"
+./jarvis-cli "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
 echo "Converting PTH to GGML..."
 for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -20,17 +20,17 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
 echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
 else
 echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+./jarvis-quantize "$i" "${i/f16/q4_0}" q4_0
 fi
 done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-./llama-server "$@"
+./jarvis-server "$@"
 else
 echo "Unknown command: $arg1"
 echo "Available commands: "
 echo " --run (-r): Run a model previously converted into ggml"
 echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
-echo " --convert (-c): Convert a llama model into ggml"
+echo " --convert (-c): Convert a jarvis model into ggml"
 echo " ex: --outtype f16 \"/models/7B/\" "
 echo " --quantize (-q): Optimize with quantization process ggml"
 echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"

@@ -12,8 +12,8 @@ build*/

 models/*

-/llama-cli
-/llama-quantize
+/jarvis-cli
+/jarvis-quantize

 arm_neon.h
 compile_commands.json

@@ -24,7 +24,7 @@ insert_final_newline = unset
 [examples/server/public/*]
 indent_size = 2

-[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
+[examples/jarvis.swiftui/jarvis.swiftui.xcodeproj/*]
 indent_style = tab

 [examples/cvector-generator/*.txt]

.github/ISSUE_TEMPLATE/01-bug-low.yml (6 changes, vendored)
@@ -1,5 +1,5 @@
 name: Low Severity Bugs
-description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
+description: Used to report low severity bugs in jarvis.cpp (e.g. cosmetic issues, non critical UI glitches)
 title: "Bug: "
 labels: ["bug-unconfirmed", "low severity"]
 body:
@@ -8,7 +8,7 @@ body:
 value: |
 Thanks for taking the time to fill out this bug report!
 Please include information about your system, the steps to reproduce the bug,
-and the version of llama.cpp that you are using.
+and the version of jarvis.cpp that you are using.
 If possible, please provide a minimal code example that reproduces the bug.
 - type: textarea
 id: what-happened
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-$./llama-cli --version
+$./jarvis-cli --version
 version: 2999 (42b4109e)
 built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/ISSUE_TEMPLATE/02-bug-medium.yml (6 changes, vendored)
@@ -1,5 +1,5 @@
 name: Medium Severity Bug
-description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
+description: Used to report medium severity bugs in jarvis.cpp (e.g. Malfunctioning Features but generally still useable)
 title: "Bug: "
 labels: ["bug-unconfirmed", "medium severity"]
 body:
@@ -8,7 +8,7 @@ body:
 value: |
 Thanks for taking the time to fill out this bug report!
 Please include information about your system, the steps to reproduce the bug,
-and the version of llama.cpp that you are using.
+and the version of jarvis.cpp that you are using.
 If possible, please provide a minimal code example that reproduces the bug.
 - type: textarea
 id: what-happened
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-$./llama-cli --version
+$./jarvis-cli --version
 version: 2999 (42b4109e)
 built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/ISSUE_TEMPLATE/03-bug-high.yml (6 changes, vendored)
@@ -1,5 +1,5 @@
 name: High Severity Bug
-description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
+description: Used to report high severity bugs in jarvis.cpp (e.g. Malfunctioning features hindering important common workflow)
 title: "Bug: "
 labels: ["bug-unconfirmed", "high severity"]
 body:
@@ -8,7 +8,7 @@ body:
 value: |
 Thanks for taking the time to fill out this bug report!
 Please include information about your system, the steps to reproduce the bug,
-and the version of llama.cpp that you are using.
+and the version of jarvis.cpp that you are using.
 If possible, please provide a minimal code example that reproduces the bug.
 - type: textarea
 id: what-happened
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-$./llama-cli --version
+$./jarvis-cli --version
 version: 2999 (42b4109e)
 built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/ISSUE_TEMPLATE/04-bug-critical.yml (6 changes, vendored)
@@ -1,5 +1,5 @@
 name: Critical Severity Bug
-description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
+description: Used to report critical severity bugs in jarvis.cpp (e.g. Crashing, Corrupted, Dataloss)
 title: "Bug: "
 labels: ["bug-unconfirmed", "critical severity"]
 body:
@@ -8,7 +8,7 @@ body:
 value: |
 Thanks for taking the time to fill out this bug report!
 Please include information about your system, the steps to reproduce the bug,
-and the version of llama.cpp that you are using.
+and the version of jarvis.cpp that you are using.
 If possible, please provide a minimal code example that reproduces the bug.
 - type: textarea
 id: what-happened
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-$./llama-cli --version
+$./jarvis-cli --version
 version: 2999 (42b4109e)
 built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/ISSUE_TEMPLATE/05-enhancement.yml (12 changes, vendored)
@@ -1,12 +1,12 @@
 name: Enhancement
-description: Used to request enhancements for llama.cpp
+description: Used to request enhancements for jarvis.cpp
 title: "Feature Request: "
 labels: ["enhancement"]
 body:
 - type: markdown
 attributes:
 value: |
-[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
+[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/jarvis.cpp/discussions/categories/ideas)

 - type: checkboxes
 id: prerequisites
@@ -16,18 +16,18 @@ body:
 options:
 - label: I am running the latest code. Mention the version if possible as well.
 required: true
-- label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+- label: I carefully followed the [README.md](https://github.com/ggerganov/jarvis.cpp/blob/master/README.md).
 required: true
 - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
 required: true
-- label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
+- label: I reviewed the [Discussions](https://github.com/ggerganov/jarvis.cpp/discussions), and have a new and useful enhancement to share.
 required: true

 - type: textarea
 id: feature-description
 attributes:
 label: Feature Description
-description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
+description: Please provide a detailed written description of what you were trying to do, and what you expected `jarvis.cpp` to do as an enhancement.
 placeholder: Detailed description of the enhancement
 validations:
 required: true
@@ -36,7 +36,7 @@ body:
 id: motivation
 attributes:
 label: Motivation
-description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
+description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `jarvis.cpp` users.
 placeholder: Explanation of why this feature is needed and its benefits
 validations:
 required: true

.github/ISSUE_TEMPLATE/06-research.yml (2 changes, vendored)
@@ -6,7 +6,7 @@ body:
 - type: markdown
 attributes:
 value: |
-Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
+Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/jarvis.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)

 - type: checkboxes
 id: research-stage

.github/ISSUE_TEMPLATE/07-refactor.yml (4 changes, vendored)
@@ -6,8 +6,8 @@ body:
 - type: markdown
 attributes:
 value: |
-Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
-Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
+Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/jarvis.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
+Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/jarvis.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.

 - type: textarea
 id: background-description

.github/ISSUE_TEMPLATE/config.yml (6 changes, vendored)
@@ -1,11 +1,11 @@
 blank_issues_enabled: true
 contact_links:
 - name: Got an idea?
-url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
+url: https://github.com/ggerganov/jarvis.cpp/discussions/categories/ideas
 about: Pop it there. It may then become an enhancement ticket.
 - name: Got a question?
-url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
+url: https://github.com/ggerganov/jarvis.cpp/discussions/categories/q-a
 about: Ask a question there!
 - name: Want to contribute?
-url: https://github.com/ggerganov/llama.cpp/wiki/contribute
+url: https://github.com/ggerganov/jarvis.cpp/wiki/contribute
 about: Head to the contribution guide page of the wiki for areas you can help with

.github/labeler.yml (2 changes, vendored)
@@ -67,7 +67,7 @@ script:
 android:
 - changed-files:
 - any-glob-to-any-file:
-- examples/llama.android/**
+- examples/jarvis.android/**
 server:
 - changed-files:
 - any-glob-to-any-file:

.github/pull_request_template.md (2 changes, vendored)
@@ -1,6 +1,6 @@


-- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
+- [x] I have read the [contributing guidelines](https://github.com/ggerganov/jarvis.cpp/blob/master/CONTRIBUTING.md)
 - Self-reported review complexity:
 - [ ] Low
 - [ ] Medium

.github/workflows/bench.yml.disabled (26 changes, vendored)
@@ -1,5 +1,5 @@
 # TODO: there have been some issues with the workflow, so disabling for now
-# https://github.com/ggerganov/llama.cpp/issues/7893
+# https://github.com/ggerganov/jarvis.cpp/issues/7893
 #
 # Benchmark
 name: Benchmark
@@ -27,10 +27,10 @@ on:
 push:
 branches:
 - master
-paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+paths: ['jarvis.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
 pull_request_target:
 types: [opened, synchronize, reopened]
-paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+paths: ['jarvis.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
 schedule:
 - cron: '04 2 * * *'

@@ -113,16 +113,16 @@ jobs:
 set -eux
 cmake -B build \
 -DGGML_NATIVE=OFF \
--DLLAMA_BUILD_SERVER=ON \
--DLLAMA_CURL=ON \
--DLLAMA_CUBLAS=ON \
+-DJARVIS_BUILD_SERVER=ON \
+-DJARVIS_CURL=ON \
+-DJARVIS_CUBLAS=ON \
 -DCUDAToolkit_ROOT=/usr/local/cuda \
 -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
 -DCMAKE_CUDA_ARCHITECTURES=75 \
--DLLAMA_FATAL_WARNINGS=OFF \
--DLLAMA_ALL_WARNINGS=OFF \
+-DJARVIS_FATAL_WARNINGS=OFF \
+-DJARVIS_ALL_WARNINGS=OFF \
 -DCMAKE_BUILD_TYPE=Release;
-cmake --build build --config Release -j $(nproc) --target llama-server
+cmake --build build --config Release -j $(nproc) --target jarvis-server

 - name: Download the dataset
 id: download_dataset
@@ -240,7 +240,7 @@ jobs:
 message: |
 <p align="center">

-📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+📈 **jarvis.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀

 </p>

@@ -249,9 +249,9 @@ jobs:
 <summary>Expand details for performance related PR only</summary>

 - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
-- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
-- Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
-- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
+- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.JARVISCPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.JARVISCPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
+- Prompt processing (pp): avg=${{ env.JARVISCPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.JARVISCPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
+- Token generation (tg): avg=${{ env.JARVISCPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.JARVISCPP_TOKENS_SECOND_P_95_ }}tk/s
 - ${{ env.BENCH_GRAPH_XLABEL }}

148
.github/workflows/build.yml
vendored
148
.github/workflows/build.yml
vendored
|
@ -28,9 +28,9 @@ env:
|
|||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
JARVIS_LOG_COLORS: 1
|
||||
JARVIS_LOG_PREFIX: 1
|
||||
JARVIS_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
macOS-latest-cmake-arm64:
|
||||
|
@ -55,7 +55,7 @@ jobs:
|
|||
sysctl -a
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
|
||||
cmake -DJARVIS_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DJARVIS_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
|
||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
|
@ -82,14 +82,14 @@ jobs:
|
|||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
|
||||
zip -r jarvis-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
|
||||
name: llama-bin-macos-arm64.zip
|
||||
path: jarvis-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
|
||||
name: jarvis-bin-macos-arm64.zip
|
||||
|
||||
macOS-latest-cmake-x64:
|
||||
runs-on: macos-12
|
||||
|
@ -112,8 +112,8 @@ jobs:
|
|||
run: |
|
||||
sysctl -a
|
||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
|
||||
# https://github.com/ggerganov/jarvis.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
cmake -B build -DJARVIS_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DJARVIS_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
|
@ -140,20 +140,20 @@ jobs:
|
|||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
|
||||
zip -r jarvis-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
|
||||
name: llama-bin-macos-x64.zip
|
||||
path: jarvis-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
|
||||
name: jarvis-bin-macos-x64.zip
|
||||
|
||||
ubuntu-focal-make:
|
||||
runs-on: ubuntu-20.04
|
||||
env:
|
||||
LLAMA_NODE_AVAILABLE: true
|
||||
LLAMA_PYTHON_AVAILABLE: true
|
||||
JARVIS_NODE_AVAILABLE: true
|
||||
JARVIS_PYTHON_AVAILABLE: true
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
@ -177,7 +177,7 @@ jobs:
|
|||
- name: Build
|
||||
id: make_build
|
||||
env:
|
||||
LLAMA_FATAL_WARNINGS: 1
|
||||
JARVIS_FATAL_WARNINGS: 1
|
||||
run: |
|
||||
CC=gcc-8 make -j $(nproc)
|
||||
|
||||
|
@ -204,8 +204,8 @@ jobs:
|
|||
- name: Build
|
||||
id: make_build
|
||||
env:
|
||||
LLAMA_FATAL_WARNINGS: 1
|
||||
LLAMA_CURL: 1
|
||||
JARVIS_FATAL_WARNINGS: 1
|
||||
JARVIS_CURL: 1
|
||||
run: |
|
||||
CC=gcc-8 make -j $(nproc)
|
||||
|
||||
|
@ -230,7 +230,7 @@ jobs:
|
|||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
|
||||
cmake .. -DJARVIS_FATAL_WARNINGS=ON -DJARVIS_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
|
||||
cmake --build . --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
|
@ -239,16 +239,16 @@ jobs:
|
|||
cd build
|
||||
ctest -L 'main|curl' --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c conversion
|
||||
id: llama2c_test
|
||||
- name: Test jarvis2c conversion
|
||||
id: jarvis2c_test
|
||||
run: |
|
||||
cd build
|
||||
echo "Fetch tokenizer"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
|
||||
echo "Fetch llama2c model"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
|
||||
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
wget https://huggingface.co/karpathy/tinyjarviss/resolve/main/stories260K/tok512.bin
|
||||
echo "Fetch jarvis2c model"
|
||||
wget https://huggingface.co/karpathy/tinyjarviss/resolve/main/stories260K/stories260K.bin
|
||||
./bin/jarvis-convert-jarvis2c-to-ggml --copy-vocab-from-model ./tok512.bin --jarvis2c-model stories260K.bin --jarvis2c-output-model stories260K.gguf
|
||||
./bin/jarvis-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
|
@ -268,14 +268,14 @@ jobs:
|
|||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
|
||||
zip -r jarvis-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
|
||||
name: llama-bin-ubuntu-x64.zip
|
||||
path: jarvis-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
|
||||
name: jarvis-bin-ubuntu-x64.zip
|
||||
|
||||
ubuntu-latest-cmake-sanitizer:
|
||||
runs-on: ubuntu-latest
|
||||
|
@ -304,7 +304,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake .. -DJARVIS_FATAL_WARNINGS=ON -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

- name: Build (no OpenMP)

@ -313,7 +313,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
cmake .. -DJARVIS_FATAL_WARNINGS=ON -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

- name: Test

@ -487,7 +487,7 @@ jobs:

# TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
# ref: https://github.com/ggerganov/jarvis.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
macOS-latest-make:
runs-on: macos-latest

@ -505,7 +505,7 @@ jobs:
- name: Build
id: make_build
env:
LLAMA_FATAL_WARNINGS: 1
JARVIS_FATAL_WARNINGS: 1
run: |
GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)

@ -517,7 +517,7 @@ jobs:

# TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
# ref: https://github.com/ggerganov/jarvis.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
# would be great if we fix these
macOS-latest-cmake:
runs-on: macos-latest

@ -539,7 +539,7 @@ jobs:
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
cmake -DJARVIS_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

- name: Test

@ -570,9 +570,9 @@ jobs:
|
|||
cd build
|
||||
cmake -G Xcode .. \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DJARVIS_BUILD_EXAMPLES=OFF \
|
||||
-DJARVIS_BUILD_TESTS=OFF \
|
||||
-DJARVIS_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
|
@ -600,9 +600,9 @@ jobs:
|
|||
cd build
|
||||
cmake -G Xcode .. \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DJARVIS_BUILD_EXAMPLES=OFF \
|
||||
-DJARVIS_BUILD_TESTS=OFF \
|
||||
-DJARVIS_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=tvOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
|
@ -629,7 +629,7 @@ jobs:
|
|||
- name: xcodebuild for swift package
|
||||
id: xcodebuild
|
||||
run: |
|
||||
xcodebuild -scheme llama -destination "${{ matrix.destination }}"
|
||||
xcodebuild -scheme jarvis -destination "${{ matrix.destination }}"
|
||||
|
||||
- name: Build Swift Example
|
||||
id: make_build_swift_example
|
||||
|
@ -705,23 +705,23 @@ jobs:
|
|||
matrix:
|
||||
include:
|
||||
- build: 'noavx-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx2-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx512-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'openblas-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'kompute-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'vulkan-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'llvm-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'msvc-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
@ -807,7 +807,7 @@ jobs:
|
|||
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
|
||||
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
|
||||
cd build
|
||||
$env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
|
||||
$env:JARVIS_SKIP_TESTS_SLOW_ON_EMULATOR = 1
|
||||
& $sde -future -- ctest -L main -C Release --verbose --timeout 900
|
||||
|
||||
- name: Determine tag name
|
||||
|
@ -827,15 +827,15 @@ jobs:
|
|||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
|
||||
Copy-Item LICENSE .\build\bin\Release\jarvis.cpp.txt
|
||||
7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
|
||||
name: llama-bin-win-${{ matrix.build }}.zip
|
||||
path: jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
|
||||
name: jarvis-bin-win-${{ matrix.build }}.zip
|
||||
|
||||
windows-latest-cmake-cuda:
|
||||
runs-on: windows-2019
|
||||
|
@ -865,7 +865,7 @@ jobs:
|
|||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
|
||||
cmake .. -DGGML_NATIVE=OFF -DJARVIS_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
|
||||
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
|
||||
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
|
@ -886,28 +886,28 @@ jobs:
|
|||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
|
||||
7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
|
||||
name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
path: jarvis-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
|
||||
name: jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
run: |
|
||||
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
|
||||
$dst='.\build\bin\cudart\'
|
||||
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
|
||||
7z a cudart-jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
|
||||
|
||||
- name: Upload Cuda runtime
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
path: cudart-jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
name: cudart-jarvis-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
windows-latest-cmake-sycl:
|
||||
runs-on: windows-latest
|
||||
|
@ -963,14 +963,14 @@ jobs:
|
|||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
|
||||
7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
|
||||
name: llama-bin-win-sycl-x64.zip
|
||||
path: jarvis-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
|
||||
name: jarvis-bin-win-sycl-x64.zip
|
||||
|
||||
windows-latest-cmake-hip:
|
||||
if: ${{ github.event.inputs.create_release != 'true' }}
|
||||
|
@ -1060,13 +1060,13 @@ jobs:
|
|||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
|
||||
7z a jarvis-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
path: jarvis-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
name: jarvis-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
|
||||
ios-xcode-build:
|
||||
runs-on: macos-latest
|
||||
|
@ -1076,7 +1076,7 @@ jobs:
|
|||
uses: actions/checkout@v4
|
||||
|
||||
- name: Build Xcode project
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
|
||||
run: xcodebuild -project examples/jarvis.swiftui/jarvis.swiftui.xcodeproj -scheme jarvis.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
|
||||
|
||||
android-build:
|
||||
runs-on: ubuntu-latest
|
||||
|
@ -1098,7 +1098,7 @@ jobs:
|
|||
|
||||
- name: Build
|
||||
run: |
|
||||
cd examples/llama.android
|
||||
cd examples/jarvis.android
|
||||
|
||||
./gradlew build --no-daemon
|
||||
|
||||
|
@ -1261,7 +1261,7 @@ jobs:
|
|||
# sudo apt-get install cmake
|
||||
#
|
||||
# - name: Configure
|
||||
# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
#
|
||||
# - name: Build
|
||||
# run: |
|
||||
|
@ -1300,7 +1300,7 @@ jobs:
|
|||
# - name: Upload binaries
|
||||
# uses: actions/upload-artifact@v4
|
||||
# with:
|
||||
# name: llama-bin-${{ matrix.arch }}
|
||||
# name: jarvis-bin-${{ matrix.arch }}
|
||||
# path: build/bin/${{ matrix.build }}
|
||||
#
|
||||
# windows-blas:
|
||||
|
@ -1339,7 +1339,7 @@ jobs:
|
|||
# run: >
|
||||
# cmake -S . -B ./build -A ${{ matrix.arch }}
|
||||
# -DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
# -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }}
|
||||
# -DJARVIS_SUPPORT_OPENBLAS=${{ matrix.blas }}
|
||||
# -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
|
||||
#
|
||||
# - name: Build
|
||||
|
@ -1355,7 +1355,7 @@ jobs:
|
|||
# if: matrix.blas == 'ON'
|
||||
# uses: actions/upload-artifact@v4
|
||||
# with:
|
||||
# name: llama-blas-bin-${{ matrix.arch }}
|
||||
# name: jarvis-blas-bin-${{ matrix.arch }}
|
||||
# path: build/bin/${{ matrix.build }}
|
||||
#
|
||||
# emscripten:

20 .github/workflows/docker.yml vendored
@ -37,21 +37,21 @@ jobs:
strategy:
matrix:
config:
- { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light", dockerfile: ".devops/jarvis-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server", dockerfile: ".devops/jarvis-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-cuda", dockerfile: ".devops/jarvis-cli-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-cuda", dockerfile: ".devops/jarvis-server-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-musa", dockerfile: ".devops/jarvis-cli-musa.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-musa", dockerfile: ".devops/jarvis-server-musa.Dockerfile", platforms: "linux/amd64" }
- { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
#- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
#- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
#- { tag: "light-rocm", dockerfile: ".devops/jarvis-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
#- { tag: "server-rocm", dockerfile: ".devops/jarvis-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
#- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-intel", dockerfile: ".devops/jarvis-cli-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-intel", dockerfile: ".devops/jarvis-server-intel.Dockerfile", platforms: "linux/amd64" }
steps:
- name: Check out the repo
uses: actions/checkout@v4

2 .github/workflows/labeler.yml vendored
@ -11,7 +11,7 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
repository: "ggerganov/llama.cpp"
repository: "ggerganov/jarvis.cpp"
- uses: actions/labeler@v5
with:
configuration-path: '.github/labeler.yml'

6 .github/workflows/nix-ci-aarch64.yml vendored
@ -47,8 +47,8 @@ jobs:
extra-conf: |
extra-platforms = aarch64-linux
extra-system-features = nixos-test kvm
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
extra-substituters = https://jarvis-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = jarvis-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org

@ -56,7 +56,7 @@ jobs:
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: llama-cpp
name: jarvis-cpp
- name: Show all output paths
run: >
nix run github:nix-community/nix-eval-jobs

10 .github/workflows/nix-ci.yml vendored
@ -34,8 +34,8 @@ jobs:
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
extra-substituters = https://jarvis-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = jarvis-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org

@ -61,8 +61,8 @@ jobs:
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
extra-substituters = https://jarvis-cpp.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = jarvis-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org

@ -70,7 +70,7 @@ jobs:
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: llama-cpp
name: jarvis-cpp
- name: Build
run: >
nix run github:Mic92/nix-fast-build

32 .github/workflows/server.yml vendored
@ -21,10 +21,10 @@ on:
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']

env:
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
LLAMA_LOG_VERBOSITY: 10
JARVIS_LOG_COLORS: 1
JARVIS_LOG_PREFIX: 1
JARVIS_LOG_TIMESTAMPS: 1
JARVIS_LOG_VERBOSITY: 10

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}

@ -41,7 +41,7 @@ jobs:
include:
- build_type: Release
sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
fail-fast: false # While -DJARVIS_SANITIZE_THREAD=ON is broken

steps:
- name: Dependencies

@ -99,12 +99,12 @@ jobs:
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DJARVIS_BUILD_SERVER=ON \
-DJARVIS_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target jarvis-server

- name: Build
id: cmake_build

@ -112,11 +112,11 @@ jobs:
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DJARVIS_BUILD_SERVER=ON \
-DJARVIS_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-DJARVIS_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target jarvis-server

- name: Tests
id: server_integration_tests

@ -155,8 +155,8 @@ jobs:
- name: Build
id: cmake_build
run: |
cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
cmake -B build -DJARVIS_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target jarvis-server

- name: Python setup
id: setup_python

@ -180,7 +180,7 @@ jobs:
run: |
cd examples/server/tests
$env:PYTHONIOENCODING = ":replace"
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags jarvis.cpp

- name: Slow tests
id: server_integration_tests_slow

8 .gitignore vendored
@ -48,8 +48,8 @@ build*
!build-info.sh
!build.zig
!docs/build.md
/libllama.so
/llama-*
/libjarvis.so
/jarvis-*
/vulkan-shaders-gen
android-ndk-*
arm_neon.h

@ -57,7 +57,7 @@ cmake-build-*
CMakeSettings.json
compile_commands.json
ggml-metal-embed.metal
llama-batched-swift
jarvis-batched-swift
/rpc-server
out/
tmp/

@ -118,7 +118,7 @@ poetry.toml
/tests/test-double-float
/tests/test-grad0
/tests/test-grammar-parser
/tests/test-llama-grammar
/tests/test-jarvis-grammar
/tests/test-opt
/tests/test-quantize-fns
/tests/test-quantize-perf

118 CMakeLists.txt
@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("llama.cpp" C CXX)
project("jarvis.cpp" C CXX)
include(CheckIncludeFileCXX)

#set(CMAKE_WARN_DEPRECATED YES)

@ -18,20 +18,20 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
|||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||
|
||||
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
||||
set(LLAMA_STANDALONE ON)
|
||||
set(JARVIS_STANDALONE ON)
|
||||
|
||||
include(git-vars)
|
||||
|
||||
# configure project version
|
||||
# TODO
|
||||
else()
|
||||
set(LLAMA_STANDALONE OFF)
|
||||
set(JARVIS_STANDALONE OFF)
|
||||
endif()
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
||||
|
||||
option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
|
||||
option(JARVIS_WASM_SINGLE_FILE "jarvis: embed WASM inside the generated jarvis.js" ON)
|
||||
else()
|
||||
if (MINGW)
|
||||
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
||||
|
@ -51,41 +51,41 @@ endif()
|
|||
#
|
||||
|
||||
# debug
|
||||
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
|
||||
option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
|
||||
option(JARVIS_ALL_WARNINGS "jarvis: enable all compiler warnings" ON)
|
||||
option(JARVIS_ALL_WARNINGS_3RD_PARTY "jarvis: enable all compiler warnings in 3rd party libs" OFF)
|
||||
|
||||
# build
|
||||
option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF)
|
||||
option(JARVIS_FATAL_WARNINGS "jarvis: enable -Werror flag" OFF)
|
||||
|
||||
# sanitizers
|
||||
option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
|
||||
option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
|
||||
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
|
||||
option(JARVIS_SANITIZE_THREAD "jarvis: enable thread sanitizer" OFF)
|
||||
option(JARVIS_SANITIZE_ADDRESS "jarvis: enable address sanitizer" OFF)
|
||||
option(JARVIS_SANITIZE_UNDEFINED "jarvis: enable undefined sanitizer" OFF)
|
||||
|
||||
# utils
|
||||
option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
|
||||
option(JARVIS_BUILD_COMMON "jarvis: build common utils library" ${JARVIS_STANDALONE})
|
||||
|
||||
# extra artifacts
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
||||
option(JARVIS_BUILD_TESTS "jarvis: build tests" ${JARVIS_STANDALONE})
|
||||
option(JARVIS_BUILD_EXAMPLES "jarvis: build examples" ${JARVIS_STANDALONE})
|
||||
option(JARVIS_BUILD_SERVER "jarvis: build server example" ${JARVIS_STANDALONE})
|
||||
|
||||
# 3rd party libs
|
||||
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
|
||||
option(JARVIS_CURL "jarvis: use libcurl to download model from an URL" OFF)
|
||||
|
||||
# Required for relocatable CMake package
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||
|
||||
# override ggml options
|
||||
set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
|
||||
set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
|
||||
set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
|
||||
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
|
||||
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
|
||||
set(GGML_SANITIZE_THREAD ${JARVIS_SANITIZE_THREAD})
|
||||
set(GGML_SANITIZE_ADDRESS ${JARVIS_SANITIZE_ADDRESS})
|
||||
set(GGML_SANITIZE_UNDEFINED ${JARVIS_SANITIZE_UNDEFINED})
|
||||
set(GGML_ALL_WARNINGS ${JARVIS_ALL_WARNINGS})
|
||||
set(GGML_FATAL_WARNINGS ${JARVIS_FATAL_WARNINGS})
|
||||
|
||||
# change the default for these ggml options
|
||||
if (NOT DEFINED GGML_LLAMAFILE)
|
||||
set(GGML_LLAMAFILE_DEFAULT ON)
|
||||
if (NOT DEFINED GGML_JARVISFILE)
|
||||
set(GGML_JARVISFILE_DEFAULT ON)
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED GGML_AMX)
|
||||
|
@ -97,23 +97,23 @@ if (NOT DEFINED GGML_CUDA_GRAPHS)
|
|||
endif()
|
||||
|
||||
# transition helpers
|
||||
function (llama_option_depr TYPE OLD NEW)
|
||||
function (jarvis_option_depr TYPE OLD NEW)
|
||||
if (${OLD})
|
||||
message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
|
||||
set(${NEW} ON PARENT_SCOPE)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
|
||||
llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
|
||||
llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
|
||||
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
|
||||
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
|
||||
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
|
||||
llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
|
||||
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
|
||||
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
|
||||
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
|
||||
jarvis_option_depr(FATAL_ERROR JARVIS_CUBLAS GGML_CUDA)
|
||||
jarvis_option_depr(WARNING JARVIS_CUDA GGML_CUDA)
|
||||
jarvis_option_depr(WARNING JARVIS_KOMPUTE GGML_KOMPUTE)
|
||||
jarvis_option_depr(WARNING JARVIS_METAL GGML_METAL)
|
||||
jarvis_option_depr(WARNING JARVIS_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
|
||||
jarvis_option_depr(WARNING JARVIS_NATIVE GGML_NATIVE)
|
||||
jarvis_option_depr(WARNING JARVIS_RPC GGML_RPC)
|
||||
jarvis_option_depr(WARNING JARVIS_SYCL GGML_SYCL)
|
||||
jarvis_option_depr(WARNING JARVIS_SYCL_F16 GGML_SYCL_F16)
|
||||
jarvis_option_depr(WARNING JARVIS_CANN GGML_CANN)
|
||||
|
||||
#
|
||||
# build the library
|
||||
|
@ -132,18 +132,18 @@ add_subdirectory(src)
|
|||
include(GNUInstallDirs)
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
|
||||
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
|
||||
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
|
||||
set(JARVIS_BUILD_NUMBER ${BUILD_NUMBER})
|
||||
set(JARVIS_BUILD_COMMIT ${BUILD_COMMIT})
|
||||
set(JARVIS_INSTALL_VERSION 0.0.${BUILD_NUMBER})
|
||||
|
||||
set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
|
||||
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
|
||||
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
|
||||
set(JARVIS_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
|
||||
set(JARVIS_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
|
||||
set(JARVIS_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
|
||||
|
||||
|
||||
# At the moment some compile definitions are placed within the ggml/src
|
||||
# directory but not exported on the `ggml` target. This could be improved by
|
||||
# determining _precisely_ which defines are necessary for the llama-config
|
||||
# determining _precisely_ which defines are necessary for the jarvis-config
|
||||
# package.
|
||||
#
|
||||
set(GGML_TRANSIENT_DEFINES)
|
||||
|
@ -158,25 +158,25 @@ if (GGML_TARGET_DEFINES)
|
|||
endif()
|
||||
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
|
||||
|
||||
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
|
||||
install(TARGETS llama LIBRARY PUBLIC_HEADER)
|
||||
set_target_properties(jarvis PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/jarvis.h)
|
||||
install(TARGETS jarvis LIBRARY PUBLIC_HEADER)
|
||||
|
||||
configure_package_config_file(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
|
||||
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama
|
||||
PATH_VARS LLAMA_INCLUDE_INSTALL_DIR
|
||||
LLAMA_LIB_INSTALL_DIR
|
||||
LLAMA_BIN_INSTALL_DIR )
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cmake/jarvis-config.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/jarvis-config.cmake
|
||||
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/jarvis
|
||||
PATH_VARS JARVIS_INCLUDE_INSTALL_DIR
|
||||
JARVIS_LIB_INSTALL_DIR
|
||||
JARVIS_BIN_INSTALL_DIR )
|
||||
|
||||
write_basic_package_version_file(
|
||||
${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
|
||||
VERSION ${LLAMA_INSTALL_VERSION}
|
||||
${CMAKE_CURRENT_BINARY_DIR}/jarvis-version.cmake
|
||||
VERSION ${JARVIS_INSTALL_VERSION}
|
||||
COMPATIBILITY SameMajorVersion)
|
||||
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/jarvis-config.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/jarvis-version.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/jarvis)
|
||||
|
||||
install(
|
||||
FILES convert_hf_to_gguf.py
|
||||
|
@ -190,27 +190,27 @@ install(
|
|||
WORLD_EXECUTE
|
||||
DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
|
||||
configure_file(cmake/llama.pc.in
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
|
||||
configure_file(cmake/jarvis.pc.in
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/jarvis.pc"
|
||||
@ONLY)
|
||||
|
||||
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
|
||||
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/jarvis.pc"
|
||||
DESTINATION lib/pkgconfig)
|
||||
|
||||
#
|
||||
# utils, programs, examples and tests
|
||||
#
|
||||
|
||||
if (LLAMA_BUILD_COMMON)
|
||||
if (JARVIS_BUILD_COMMON)
|
||||
add_subdirectory(common)
|
||||
endif()
|
||||
|
||||
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
|
||||
if (JARVIS_BUILD_COMMON AND JARVIS_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
|
||||
include(CTest)
|
||||
add_subdirectory(tests)
|
||||
endif()
|
||||
|
||||
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
|
||||
if (JARVIS_BUILD_COMMON AND JARVIS_BUILD_EXAMPLES)
|
||||
add_subdirectory(examples)
|
||||
add_subdirectory(pocs)
|
||||
endif()
@ -11,7 +11,7 @@
- Squash-merge PRs
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
- Optionally pick a `<module>` from here: https://github.com/ggerganov/jarvis.cpp/wiki/Modules

# Coding guidelines

@ -22,7 +22,7 @@
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/jarvis.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$



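The `ggml_mul_mat` shape convention above is easier to see with concrete dimensions. The following is a minimal sketch, assuming the ggml C API declared in `ggml.h` of this tree (`ggml_init`, `ggml_new_tensor_2d`, `ggml_mul_mat`, `ggml_free`); the tensor sizes are arbitrary and chosen only for illustration.

```c
#include <stdio.h>
#include "ggml.h"

// Illustrative only: C = ggml_mul_mat(ctx, A, B) computes C^T = A B^T,
// so A and B must agree on ne[0] (the "columns" / contraction dimension),
// and the result has ne[0] = A->ne[1] and ne[1] = B->ne[1].
int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // A: 4 columns x 3 rows, B: 4 columns x 2 rows (both share ne[0] == 4)
    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    struct ggml_tensor * B = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);

    struct ggml_tensor * C = ggml_mul_mat(ctx, A, B);

    // Expected: C->ne[0] == 3 (from A->ne[1]), C->ne[1] == 2 (from B->ne[1])
    printf("C shape: %lld x %lld\n", (long long) C->ne[0], (long long) C->ne[1]);

    ggml_free(ctx);
    return 0;
}
```

Built against either the llama or the jarvis name of the library (the ggml layer is unchanged by this rename), it should print `C shape: 3 x 2`.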
@ -30,4 +30,4 @@

The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:

https://github.com/ggerganov/llama.cpp/projects
https://github.com/ggerganov/jarvis.cpp/projects

@ -7,7 +7,7 @@ import java.util.Scanner;
public class LLMCLI {
public static void main(String[] args) {
// Path to the .exe file
String exePath = "bin/llama-cli.exe";
String exePath = "bin/jarvis-cli.exe";

System.out.println("Enter -h for help");
// Scanner to take user input for various commands

388 Makefile
@ -1,44 +1,44 @@
|
|||
# Define the default target now so that it is always the first target
|
||||
BUILD_TARGETS = \
|
||||
libllava.a \
|
||||
llama-baby-llama \
|
||||
llama-batched \
|
||||
llama-batched-bench \
|
||||
llama-bench \
|
||||
llama-cli \
|
||||
llama-convert-llama2c-to-ggml \
|
||||
llama-embedding \
|
||||
llama-eval-callback \
|
||||
llama-export-lora \
|
||||
llama-gbnf-validator \
|
||||
llama-gguf \
|
||||
llama-gguf-hash \
|
||||
llama-gguf-split \
|
||||
llama-gritlm \
|
||||
llama-imatrix \
|
||||
llama-infill \
|
||||
llama-llava-cli \
|
||||
llama-minicpmv-cli\
|
||||
llama-lookahead \
|
||||
llama-lookup \
|
||||
llama-lookup-create \
|
||||
llama-lookup-merge \
|
||||
llama-lookup-stats \
|
||||
llama-parallel \
|
||||
llama-passkey \
|
||||
llama-perplexity \
|
||||
llama-q8dot \
|
||||
llama-quantize \
|
||||
llama-quantize-stats \
|
||||
llama-retrieval \
|
||||
llama-save-load-state \
|
||||
llama-server \
|
||||
llama-simple \
|
||||
llama-speculative \
|
||||
llama-tokenize \
|
||||
llama-vdot \
|
||||
llama-cvector-generator \
|
||||
llama-gen-docs \
|
||||
jarvis-baby-jarvis \
|
||||
jarvis-batched \
|
||||
jarvis-batched-bench \
|
||||
jarvis-bench \
|
||||
jarvis-cli \
|
||||
jarvis-convert-jarvis2c-to-ggml \
|
||||
jarvis-embedding \
|
||||
jarvis-eval-callback \
|
||||
jarvis-export-lora \
|
||||
jarvis-gbnf-validator \
|
||||
jarvis-gguf \
|
||||
jarvis-gguf-hash \
|
||||
jarvis-gguf-split \
|
||||
jarvis-gritlm \
|
||||
jarvis-imatrix \
|
||||
jarvis-infill \
|
||||
jarvis-llava-cli \
|
||||
jarvis-minicpmv-cli\
|
||||
jarvis-lookahead \
|
||||
jarvis-lookup \
|
||||
jarvis-lookup-create \
|
||||
jarvis-lookup-merge \
|
||||
jarvis-lookup-stats \
|
||||
jarvis-parallel \
|
||||
jarvis-passkey \
|
||||
jarvis-perplexity \
|
||||
jarvis-q8dot \
|
||||
jarvis-quantize \
|
||||
jarvis-quantize-stats \
|
||||
jarvis-retrieval \
|
||||
jarvis-save-load-state \
|
||||
jarvis-server \
|
||||
jarvis-simple \
|
||||
jarvis-speculative \
|
||||
jarvis-tokenize \
|
||||
jarvis-vdot \
|
||||
jarvis-cvector-generator \
|
||||
jarvis-gen-docs \
|
||||
tests/test-c.o
|
||||
|
||||
# Binaries only useful for tests
|
||||
|
@ -52,7 +52,7 @@ TEST_TARGETS = \
|
|||
tests/test-grammar-integration \
|
||||
tests/test-grammar-parser \
|
||||
tests/test-json-schema-to-grammar \
|
||||
tests/test-llama-grammar \
|
||||
tests/test-jarvis-grammar \
|
||||
tests/test-log \
|
||||
tests/test-model-load-cancel \
|
||||
tests/test-opt \
|
||||
|
@ -65,8 +65,8 @@ TEST_TARGETS = \
|
|||
tests/test-tokenizer-1-spm
|
||||
|
||||
# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
|
||||
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
|
||||
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
|
||||
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-jarvis2c-to-ggml \
|
||||
simple batched batched-bench save-load-state server gguf gguf-split eval-callback jarvis-bench libllava.a llava-cli baby-jarvis \
|
||||
retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm
|
||||
|
||||
# Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
|
||||
|
@ -74,80 +74,80 @@ LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding
|
|||
LEGACY_TARGETS_BUILD = main quantize perplexity embedding server
|
||||
|
||||
# Deprecation aliases
|
||||
ifdef LLAMA_CUBLAS
|
||||
$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
|
||||
ifdef JARVIS_CUBLAS
|
||||
$(error JARVIS_CUBLAS is removed. Use GGML_CUDA instead.)
|
||||
endif
|
||||
|
||||
ifdef LLAMA_CUDA
|
||||
ifdef JARVIS_CUDA
|
||||
GGML_CUDA := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_KOMPUTE
|
||||
ifdef JARVIS_KOMPUTE
|
||||
GGML_KOMPUTE := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
ifdef JARVIS_METAL
|
||||
GGML_METAL := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_RPC
|
||||
ifdef JARVIS_RPC
|
||||
GGML_RPC := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SYCL
|
||||
ifdef JARVIS_SYCL
|
||||
GGML_SYCL := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SYCL_F16
|
||||
ifdef JARVIS_SYCL_F16
|
||||
GGML_SYCL_F16 := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_OPENBLAS
|
||||
ifdef JARVIS_OPENBLAS
|
||||
GGML_OPENBLAS := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_OPENBLAS64
|
||||
ifdef JARVIS_OPENBLAS64
|
||||
GGML_OPENBLAS64 := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_BLIS
|
||||
ifdef JARVIS_BLIS
|
||||
GGML_BLIS := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_NO_LLAMAFILE
|
||||
GGML_NO_LLAMAFILE := 1
|
||||
ifdef JARVIS_NO_JARVISFILE
|
||||
GGML_NO_JARVISFILE := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_NO_ACCELERATE
|
||||
ifdef JARVIS_NO_ACCELERATE
|
||||
GGML_NO_ACCELERATE := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_NO_OPENMP
|
||||
ifdef JARVIS_NO_OPENMP
|
||||
GGML_NO_OPENMP := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_NO_METAL
|
||||
ifdef JARVIS_NO_METAL
|
||||
GGML_NO_METAL := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_DISABLE_LOGS
|
||||
ifdef JARVIS_DISABLE_LOGS
|
||||
REMOVE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SERVER_VERBOSE
|
||||
ifdef JARVIS_SERVER_VERBOSE
|
||||
REMOVE_WARNING := 1
|
||||
endif
|
||||
|
||||
|
@ -211,8 +211,8 @@ test: $(TEST_TARGETS)
|
|||
@failures=0; \
|
||||
for test_target in $(TEST_TARGETS); do \
|
||||
if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-jarvis-spm.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-jarvis-bpe.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
|
||||
|
@ -257,7 +257,7 @@ MK_CFLAGS = -std=c11 -fPIC
|
|||
MK_CXXFLAGS = -std=c++11 -fPIC
|
||||
MK_NVCCFLAGS = -std=c++11
|
||||
|
||||
ifdef LLAMA_NO_CCACHE
|
||||
ifdef JARVIS_NO_CCACHE
|
||||
GGML_NO_CCACHE := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
@ -320,7 +320,7 @@ ifdef GGML_SCHED_MAX_COPIES
|
|||
MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(GGML_SCHED_MAX_COPIES)
|
||||
endif
|
||||
|
||||
ifdef LLAMA_DEBUG
|
||||
ifdef JARVIS_DEBUG
|
||||
MK_CFLAGS += -O0 -g
|
||||
MK_CXXFLAGS += -O0 -g
|
||||
MK_LDFLAGS += -g
|
||||
|
@ -336,25 +336,25 @@ else
|
|||
MK_NVCCFLAGS += -O3 -g
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SANITIZE_THREAD
|
||||
ifdef JARVIS_SANITIZE_THREAD
|
||||
MK_CFLAGS += -fsanitize=thread -g
|
||||
MK_CXXFLAGS += -fsanitize=thread -g
|
||||
MK_LDFLAGS += -fsanitize=thread -g
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SANITIZE_ADDRESS
|
||||
ifdef JARVIS_SANITIZE_ADDRESS
|
||||
MK_CFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
|
||||
MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
|
||||
MK_LDFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SANITIZE_UNDEFINED
|
||||
ifdef JARVIS_SANITIZE_UNDEFINED
|
||||
MK_CFLAGS += -fsanitize=undefined -g
|
||||
MK_CXXFLAGS += -fsanitize=undefined -g
|
||||
MK_LDFLAGS += -fsanitize=undefined -g
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SERVER_SSL
|
||||
ifdef JARVIS_SERVER_SSL
|
||||
MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
|
||||
MK_LDFLAGS += -lssl -lcrypto
|
||||
endif
|
||||
|
@ -381,7 +381,7 @@ MK_CXXFLAGS += \
|
|||
-Wmissing-declarations \
|
||||
-Wmissing-noreturn
|
||||
|
||||
ifeq ($(LLAMA_FATAL_WARNINGS),1)
|
||||
ifeq ($(JARVIS_FATAL_WARNINGS),1)
|
||||
MK_CFLAGS += -Werror
|
||||
MK_CXXFLAGS += -Werror
|
||||
endif
|
||||
|
@ -420,7 +420,7 @@ ifeq ($(_WIN32),1)
|
|||
LWINSOCK2 := -lws2_32
|
||||
endif
|
||||
|
||||
ifdef LLAMA_GPROF
|
||||
ifdef JARVIS_GPROF
|
||||
MK_CFLAGS += -pg
|
||||
MK_CXXFLAGS += -pg
|
||||
endif
|
||||
|
@ -448,7 +448,7 @@ endif
|
|||
ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
|
||||
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
|
||||
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
|
||||
# https://github.com/ggerganov/llama.cpp/issues/2922
|
||||
# https://github.com/ggerganov/jarvis.cpp/issues/2922
|
||||
MK_CFLAGS += -Xassembler -muse-unaligned-vector-move
|
||||
MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
|
||||
|
||||
|
@ -574,9 +574,9 @@ ifdef GGML_NVPL
|
|||
OBJ_GGML += ggml/src/ggml-blas.o
|
||||
endif # GGML_NVPL
|
||||
|
||||
ifndef GGML_NO_LLAMAFILE
|
||||
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
|
||||
OBJ_GGML += ggml/src/llamafile/sgemm.o
|
||||
ifndef GGML_NO_JARVISFILE
|
||||
MK_CPPFLAGS += -DGGML_USE_JARVISFILE
|
||||
OBJ_GGML += ggml/src/jarvisfile/sgemm.o
|
||||
endif
|
||||
|
||||
ifndef GGML_NO_AMX
|
||||
|
@ -627,9 +627,9 @@ ifdef GGML_CUDA
|
|||
OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
|
||||
OBJ_GGML += $(OBJ_CUDA_TMPL)
|
||||
|
||||
ifdef LLAMA_FATAL_WARNINGS
|
||||
ifdef JARVIS_FATAL_WARNINGS
|
||||
MK_NVCCFLAGS += -Werror all-warnings
|
||||
endif # LLAMA_FATAL_WARNINGS
|
||||
endif # JARVIS_FATAL_WARNINGS
|
||||
|
||||
ifndef GGML_MUSA
|
||||
ifndef JETSON_EOL_MODULE_DETECT
|
||||
|
@ -637,9 +637,9 @@ ifndef JETSON_EOL_MODULE_DETECT
|
|||
endif # JETSON_EOL_MODULE_DETECT
|
||||
endif # GGML_MUSA
|
||||
|
||||
ifdef LLAMA_DEBUG
|
||||
ifdef JARVIS_DEBUG
|
||||
MK_NVCCFLAGS += -lineinfo
|
||||
endif # LLAMA_DEBUG
|
||||
endif # JARVIS_DEBUG
|
||||
|
||||
ifdef GGML_CUDA_DEBUG
|
||||
MK_NVCCFLAGS += --device-debug
|
||||
|
@ -920,11 +920,11 @@ OBJ_GGML += \
|
|||
ggml/src/ggml-quants.o \
|
||||
ggml/src/ggml-aarch64.o
|
||||
|
||||
OBJ_LLAMA = \
|
||||
src/llama.o \
|
||||
src/llama-vocab.o \
|
||||
src/llama-grammar.o \
|
||||
src/llama-sampling.o \
|
||||
OBJ_JARVIS = \
|
||||
src/jarvis.o \
|
||||
src/jarvis-vocab.o \
|
||||
src/jarvis-grammar.o \
|
||||
src/jarvis-sampling.o \
|
||||
src/unicode.o \
|
||||
src/unicode-data.o
|
||||
|
||||
|
@ -939,19 +939,19 @@ OBJ_COMMON = \
|
|||
common/build-info.o \
|
||||
common/json-schema-to-grammar.o
|
||||
|
||||
OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)
|
||||
OBJ_ALL = $(OBJ_GGML) $(OBJ_JARVIS) $(OBJ_COMMON)
|
||||
|
||||
LIB_GGML = $(LIB_PRE)ggml$(DSO_EXT)
|
||||
LIB_GGML_S = $(LIB_PRE)ggml.a
|
||||
|
||||
LIB_LLAMA = $(LIB_PRE)llama$(DSO_EXT)
|
||||
LIB_LLAMA_S = $(LIB_PRE)llama.a
|
||||
LIB_JARVIS = $(LIB_PRE)jarvis$(DSO_EXT)
|
||||
LIB_JARVIS_S = $(LIB_PRE)jarvis.a
|
||||
|
||||
LIB_COMMON = $(LIB_PRE)common$(DSO_EXT)
|
||||
LIB_COMMON_S = $(LIB_PRE)common.a
|
||||
|
||||
LIB_ALL = $(LIB_GGML) $(LIB_LLAMA) $(LIB_COMMON)
|
||||
LIB_ALL_S = $(LIB_GGML_S) $(LIB_LLAMA_S) $(LIB_COMMON_S)
|
||||
LIB_ALL = $(LIB_GGML) $(LIB_JARVIS) $(LIB_COMMON)
|
||||
LIB_ALL_S = $(LIB_GGML_S) $(LIB_JARVIS_S) $(LIB_COMMON_S)
|
||||
|
||||
GF_CC := $(CC)
|
||||
include scripts/get-flags.mk
|
||||
|
@ -971,8 +971,8 @@ include scripts/get-flags.mk
|
|||
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
|
||||
endif
|
||||
|
||||
ifdef LLAMA_CURL
|
||||
override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
|
||||
ifdef JARVIS_CURL
|
||||
override CXXFLAGS := $(CXXFLAGS) -DJARVIS_USE_CURL
|
||||
override LDFLAGS := $(LDFLAGS) -lcurl
|
||||
endif
|
||||
|
||||
|
@ -980,7 +980,7 @@ endif
|
|||
# Print build information
|
||||
#
|
||||
|
||||
$(info I llama.cpp build info: )
|
||||
$(info I jarvis.cpp build info: )
|
||||
$(info I UNAME_S: $(UNAME_S))
|
||||
$(info I UNAME_P: $(UNAME_P))
|
||||
$(info I UNAME_M: $(UNAME_M))
|
||||
|
@ -1009,30 +1009,30 @@ $(info )
|
|||
|
||||
ifdef DEPRECATE_WARNING
|
||||
$(info !!! DEPRECATION WARNING !!!)
|
||||
$(info The following LLAMA_ options are deprecated and will be removed in the future. Use the GGML_ prefix instead)
|
||||
$(info - LLAMA_CUDA)
|
||||
$(info - LLAMA_METAL)
|
||||
$(info - LLAMA_METAL_EMBED_LIBRARY)
|
||||
$(info - LLAMA_OPENMP)
|
||||
$(info - LLAMA_RPC)
|
||||
$(info - LLAMA_SYCL)
|
||||
$(info - LLAMA_SYCL_F16)
|
||||
$(info - LLAMA_OPENBLAS)
|
||||
$(info - LLAMA_OPENBLAS64)
|
||||
$(info - LLAMA_BLIS)
|
||||
$(info - LLAMA_NO_LLAMAFILE)
|
||||
$(info - LLAMA_NO_ACCELERATE)
|
||||
$(info - LLAMA_NO_OPENMP)
|
||||
$(info - LLAMA_NO_METAL)
|
||||
$(info - LLAMA_NO_CCACHE)
|
||||
$(info The following JARVIS_ options are deprecated and will be removed in the future. Use the GGML_ prefix instead)
|
||||
$(info - JARVIS_CUDA)
|
||||
$(info - JARVIS_METAL)
|
||||
$(info - JARVIS_METAL_EMBED_LIBRARY)
|
||||
$(info - JARVIS_OPENMP)
|
||||
$(info - JARVIS_RPC)
|
||||
$(info - JARVIS_SYCL)
|
||||
$(info - JARVIS_SYCL_F16)
|
||||
$(info - JARVIS_OPENBLAS)
|
||||
$(info - JARVIS_OPENBLAS64)
|
||||
$(info - JARVIS_BLIS)
|
||||
$(info - JARVIS_NO_JARVISFILE)
|
||||
$(info - JARVIS_NO_ACCELERATE)
|
||||
$(info - JARVIS_NO_OPENMP)
|
||||
$(info - JARVIS_NO_METAL)
|
||||
$(info - JARVIS_NO_CCACHE)
|
||||
$(info )
|
||||
endif
|
||||
|
||||
ifdef REMOVE_WARNING
|
||||
$(info !!! REMOVAL WARNING !!!)
|
||||
$(info The following LLAMA_ options have been removed and are no longer supported)
|
||||
$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418))
|
||||
$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
|
||||
$(info The following JARVIS_ options have been removed and are no longer supported)
|
||||
$(info - JARVIS_DISABLE_LOGS (https://github.com/ggerganov/jarvis.cpp/pull/9418))
|
||||
$(info - JARVIS_SERVER_VERBOSE (https://github.com/ggerganov/jarvis.cpp/pull/9418))
|
||||
$(info )
|
||||
endif
|
||||
|
||||
|
@ -1079,13 +1079,13 @@ ggml/src/ggml-blas.o: \
|
|||
ggml/include/ggml-blas.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
ifndef GGML_NO_LLAMAFILE
|
||||
ggml/src/llamafile/sgemm.o: \
|
||||
ggml/src/llamafile/sgemm.cpp \
|
||||
ggml/src/llamafile/sgemm.h \
|
||||
ifndef GGML_NO_JARVISFILE
|
||||
ggml/src/jarvisfile/sgemm.o: \
|
||||
ggml/src/jarvisfile/sgemm.cpp \
|
||||
ggml/src/jarvisfile/sgemm.h \
|
||||
ggml/include/ggml.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
endif # GGML_NO_LLAMAFILE
|
||||
endif # GGML_NO_JARVISFILE
|
||||
|
||||
ifndef GGML_NO_AMX
|
||||
ggml/src/ggml-amx.o: \
|
||||
|
@ -1115,7 +1115,7 @@ $(LIB_GGML_S): \
|
|||
$(OBJ_GGML)
|
||||
ar rcs $(LIB_GGML_S) $^
|
||||
|
||||
# llama
|
||||
# jarvis
|
||||
|
||||
src/unicode.o: \
|
||||
src/unicode.cpp \
|
||||
|
@ -1127,14 +1127,14 @@ src/unicode-data.o: \
|
|||
src/unicode-data.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
src/llama.o: \
|
||||
src/llama.cpp \
|
||||
src/llama-impl.h \
|
||||
src/llama-vocab.h \
|
||||
src/llama-grammar.h \
|
||||
src/llama-sampling.h \
|
||||
src/jarvis.o: \
|
||||
src/jarvis.cpp \
|
||||
src/jarvis-impl.h \
|
||||
src/jarvis-vocab.h \
|
||||
src/jarvis-grammar.h \
|
||||
src/jarvis-sampling.h \
|
||||
src/unicode.h \
|
||||
include/llama.h \
|
||||
include/jarvis.h \
|
||||
ggml/include/ggml-cuda.h \
|
||||
ggml/include/ggml-metal.h \
|
||||
ggml/include/ggml.h \
|
||||
|
@ -1142,37 +1142,37 @@ src/llama.o: \
|
|||
ggml/include/ggml-backend.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
src/llama-vocab.o: \
|
||||
src/llama-vocab.cpp \
|
||||
src/llama-vocab.h \
|
||||
src/llama-impl.h \
|
||||
include/llama.h
|
||||
src/jarvis-vocab.o: \
|
||||
src/jarvis-vocab.cpp \
|
||||
src/jarvis-vocab.h \
|
||||
src/jarvis-impl.h \
|
||||
include/jarvis.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
src/llama-grammar.o: \
|
||||
src/llama-grammar.cpp \
|
||||
src/llama-grammar.h \
|
||||
src/llama-impl.h \
|
||||
src/llama-vocab.h \
|
||||
src/llama-sampling.h \
|
||||
include/llama.h
|
||||
src/jarvis-grammar.o: \
|
||||
src/jarvis-grammar.cpp \
|
||||
src/jarvis-grammar.h \
|
||||
src/jarvis-impl.h \
|
||||
src/jarvis-vocab.h \
|
||||
src/jarvis-sampling.h \
|
||||
include/jarvis.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
src/llama-sampling.o: \
|
||||
src/llama-sampling.cpp \
|
||||
src/llama-sampling.h \
|
||||
src/llama-impl.h \
|
||||
include/llama.h
|
||||
src/jarvis-sampling.o: \
|
||||
src/jarvis-sampling.cpp \
|
||||
src/jarvis-sampling.h \
|
||||
src/jarvis-impl.h \
|
||||
include/jarvis.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
$(LIB_LLAMA): \
|
||||
$(OBJ_LLAMA) \
|
||||
$(LIB_JARVIS): \
|
||||
$(OBJ_JARVIS) \
|
||||
$(LIB_GGML)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
$(LIB_LLAMA_S): \
|
||||
$(OBJ_LLAMA)
|
||||
ar rcs $(LIB_LLAMA_S) $^
|
||||
$(LIB_JARVIS_S): \
|
||||
$(OBJ_JARVIS)
|
||||
ar rcs $(LIB_JARVIS_S) $^
|
||||
|
||||
# common
|
||||
|
||||
|
@ -1183,7 +1183,7 @@ common/common.o: \
|
|||
common/sampling.h \
|
||||
common/json.hpp \
|
||||
common/json-schema-to-grammar.h \
|
||||
include/llama.h
|
||||
include/jarvis.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common/arg.o: \
|
||||
|
@ -1199,7 +1199,7 @@ common/log.o: \
|
|||
common/sampling.o: \
|
||||
common/sampling.cpp \
|
||||
common/sampling.h \
|
||||
include/llama.h
|
||||
include/jarvis.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common/console.o: \
|
||||
|
@ -1224,7 +1224,7 @@ common/ngram-cache.o: \
|
|||
|
||||
$(LIB_COMMON): \
|
||||
$(OBJ_COMMON) \
|
||||
$(LIB_LLAMA) \
|
||||
$(LIB_JARVIS) \
|
||||
$(LIB_GGML)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
|
@ -1246,7 +1246,7 @@ clean:
|
|||
rm -rvf ggml/*.dll
|
||||
rm -rvf ggml/*.so
|
||||
rm -vrf ggml/src/*.o
|
||||
rm -rvf ggml/src/llamafile/*.o
|
||||
rm -rvf ggml/src/jarvisfile/*.o
|
||||
rm -rvf common/build-info.cpp
|
||||
rm -vrf ggml/src/ggml-metal-embed.metal
|
||||
rm -vrf ggml/src/ggml-cuda/*.o
|
||||
|
@ -1269,75 +1269,75 @@ clean:
|
|||
# Helper function that replaces .c, .cpp, and .cu file endings with .o:
|
||||
GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
|
||||
|
||||
llama-cli: examples/main/main.cpp \
|
||||
jarvis-cli: examples/main/main.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@echo
|
||||
@echo '==== Run ./llama-cli -h for help. ===='
|
||||
@echo '==== Run ./jarvis-cli -h for help. ===='
|
||||
@echo
|
||||
|
||||
llama-infill: examples/infill/infill.cpp \
|
||||
jarvis-infill: examples/infill/infill.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-simple: examples/simple/simple.cpp \
|
||||
jarvis-simple: examples/simple/simple.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-tokenize: examples/tokenize/tokenize.cpp \
|
||||
jarvis-tokenize: examples/tokenize/tokenize.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-batched: examples/batched/batched.cpp \
|
||||
jarvis-batched: examples/batched/batched.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-batched-bench: examples/batched-bench/batched-bench.cpp \
|
||||
jarvis-batched-bench: examples/batched-bench/batched-bench.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-quantize: examples/quantize/quantize.cpp \
|
||||
jarvis-quantize: examples/quantize/quantize.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
|
||||
jarvis-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-perplexity: examples/perplexity/perplexity.cpp \
|
||||
jarvis-perplexity: examples/perplexity/perplexity.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-imatrix: examples/imatrix/imatrix.cpp \
|
||||
jarvis-imatrix: examples/imatrix/imatrix.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-embedding: examples/embedding/embedding.cpp \
|
||||
jarvis-embedding: examples/embedding/embedding.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-gritlm: examples/gritlm/gritlm.cpp \
|
||||
jarvis-gritlm: examples/gritlm/gritlm.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-save-load-state: examples/save-load-state/save-load-state.cpp \
|
||||
jarvis-save-load-state: examples/save-load-state/save-load-state.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-gguf: examples/gguf/gguf.cpp \
|
||||
jarvis-gguf: examples/gguf/gguf.cpp \
|
||||
$(OBJ_GGML)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
@ -1354,92 +1354,92 @@ examples/gguf-hash/deps/sha256/sha256.o: \
|
|||
examples/gguf-hash/deps/sha256/sha256.c
|
||||
$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@
|
||||
|
||||
llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\
|
||||
jarvis-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o\
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-gguf-split: examples/gguf-split/gguf-split.cpp \
|
||||
jarvis-gguf-split: examples/gguf-split/gguf-split.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-eval-callback: examples/eval-callback/eval-callback.cpp \
|
||||
jarvis-eval-callback: examples/eval-callback/eval-callback.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
|
||||
jarvis-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
|
||||
jarvis-convert-jarvis2c-to-ggml: examples/convert-jarvis2c-to-ggml/convert-jarvis2c-to-ggml.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-bench: examples/llama-bench/llama-bench.cpp \
|
||||
jarvis-bench: examples/jarvis-bench/jarvis-bench.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-baby-llama: examples/baby-llama/baby-llama.cpp \
|
||||
jarvis-baby-jarvis: examples/baby-jarvis/baby-jarvis.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-export-lora: examples/export-lora/export-lora.cpp \
|
||||
jarvis-export-lora: examples/export-lora/export-lora.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-retrieval: examples/retrieval/retrieval.cpp \
|
||||
jarvis-retrieval: examples/retrieval/retrieval.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-speculative: examples/speculative/speculative.cpp \
|
||||
jarvis-speculative: examples/speculative/speculative.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-parallel: examples/parallel/parallel.cpp \
|
||||
jarvis-parallel: examples/parallel/parallel.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-lookahead: examples/lookahead/lookahead.cpp \
|
||||
jarvis-lookahead: examples/lookahead/lookahead.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-lookup: examples/lookup/lookup.cpp \
|
||||
jarvis-lookup: examples/lookup/lookup.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-lookup-create: examples/lookup/lookup-create.cpp \
|
||||
jarvis-lookup-create: examples/lookup/lookup-create.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-lookup-merge: examples/lookup/lookup-merge.cpp \
|
||||
jarvis-lookup-merge: examples/lookup/lookup-merge.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-lookup-stats: examples/lookup/lookup-stats.cpp \
|
||||
jarvis-lookup-stats: examples/lookup/lookup-stats.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-passkey: examples/passkey/passkey.cpp \
|
||||
jarvis-passkey: examples/passkey/passkey.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
|
||||
jarvis-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
@ -1450,7 +1450,7 @@ rpc-server: examples/rpc/rpc-server.cpp \
|
|||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
endif # GGML_RPC
|
||||
|
||||
llama-server: \
|
||||
jarvis-server: \
|
||||
examples/server/server.cpp \
|
||||
examples/server/utils.hpp \
|
||||
examples/server/httplib.h \
|
||||
|
@ -1485,7 +1485,7 @@ examples/server/%.hpp: examples/server/public/% Makefile
|
|||
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
|
||||
) > $@
|
||||
|
||||
llama-gen-docs: examples/gen-docs/gen-docs.cpp \
|
||||
jarvis-gen-docs: examples/gen-docs/gen-docs.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
@ -1499,7 +1499,7 @@ libllava.a: examples/llava/llava.cpp \
|
|||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
|
||||
|
||||
llama-llava-cli: examples/llava/llava-cli.cpp \
|
||||
jarvis-llava-cli: examples/llava/llava-cli.cpp \
|
||||
examples/llava/llava.cpp \
|
||||
examples/llava/llava.h \
|
||||
examples/llava/clip.cpp \
|
||||
|
@ -1507,7 +1507,7 @@ llama-llava-cli: examples/llava/llava-cli.cpp \
|
|||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
||||
|
||||
llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
|
||||
jarvis-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
|
||||
examples/llava/llava.cpp \
|
||||
examples/llava/llava.h \
|
||||
examples/llava/clip.cpp \
|
||||
|
@ -1542,7 +1542,7 @@ tests/test-arg-parser: tests/test-arg-parser.cpp \
|
|||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-llama-grammar: tests/test-llama-grammar.cpp \
|
||||
tests/test-jarvis-grammar: tests/test-jarvis-grammar.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
@ -1616,7 +1616,7 @@ tests/test-rope: tests/test-rope.cpp ggml/src/ggml.o \
|
|||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-c.o: tests/test-c.c include/llama.h
|
||||
tests/test-c.o: tests/test-c.c include/jarvis.h
|
||||
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
|
||||
|
||||
tests/test-backend-ops: tests/test-backend-ops.cpp \
|
||||
|
@ -1643,12 +1643,12 @@ tests/test-chat-template: tests/test-chat-template.cpp \
|
|||
# PoCs
|
||||
#
|
||||
|
||||
llama-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \
|
||||
jarvis-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \
|
||||
$(OBJ_GGML)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
|
||||
jarvis-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
|
||||
$(OBJ_GGML)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
@ -1667,17 +1667,17 @@ examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning
|
|||
# Eventually we will want to remove these target from building all the time.
|
||||
main: examples/deprecation-warning/deprecation-warning.o
|
||||
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
||||
@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
|
||||
@echo "NOTICE: The 'main' binary is deprecated. Please use 'jarvis-cli' instead."
|
||||
|
||||
server: examples/deprecation-warning/deprecation-warning.o
|
||||
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
||||
@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
|
||||
@echo "NOTICE: The 'server' binary is deprecated. Please use 'jarvis-server' instead."
|
||||
|
||||
quantize: examples/deprecation-warning/deprecation-warning.o
|
||||
ifneq (,$(wildcard quantize))
|
||||
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
||||
@echo "#########"
|
||||
@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
|
||||
@echo "WARNING: The 'quantize' binary is deprecated. Please use 'jarvis-quantize' instead."
|
||||
@echo " Remove the 'quantize' binary to remove this warning."
|
||||
@echo "#########"
|
||||
endif
|
||||
|
@ -1686,7 +1686,7 @@ perplexity: examples/deprecation-warning/deprecation-warning.o
|
|||
ifneq (,$(wildcard perplexity))
|
||||
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
||||
@echo "#########"
|
||||
@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
|
||||
@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'jarvis-perplexity' instead."
|
||||
@echo " Remove the 'perplexity' binary to remove this warning."
|
||||
@echo "#########"
|
||||
endif
|
||||
|
@ -1695,7 +1695,7 @@ embedding: examples/deprecation-warning/deprecation-warning.o
|
|||
ifneq (,$(wildcard embedding))
|
||||
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
|
||||
@echo "#########"
|
||||
@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
|
||||
@echo "WARNING: The 'embedding' binary is deprecated. Please use 'jarvis-embedding' instead."
|
||||
@echo " Remove the 'embedding' binary to remove this warning."
|
||||
@echo "#########"
|
||||
endif
|
||||
|
|
|
@ -3,10 +3,10 @@
|
|||
import PackageDescription
|
||||
|
||||
var sources = [
|
||||
"src/llama.cpp",
|
||||
"src/llama-vocab.cpp",
|
||||
"src/llama-grammar.cpp",
|
||||
"src/llama-sampling.cpp",
|
||||
"src/jarvis.cpp",
|
||||
"src/jarvis-vocab.cpp",
|
||||
"src/jarvis-grammar.cpp",
|
||||
"src/jarvis-sampling.cpp",
|
||||
"src/unicode.cpp",
|
||||
"src/unicode-data.cpp",
|
||||
"ggml/src/ggml.c",
|
||||
|
@ -45,7 +45,7 @@ cSettings.append(
|
|||
#endif
|
||||
|
||||
let package = Package(
|
||||
name: "llama",
|
||||
name: "jarvis",
|
||||
platforms: [
|
||||
.macOS(.v12),
|
||||
.iOS(.v14),
|
||||
|
@ -53,11 +53,11 @@ let package = Package(
|
|||
.tvOS(.v14)
|
||||
],
|
||||
products: [
|
||||
.library(name: "llama", targets: ["llama"]),
|
||||
.library(name: "jarvis", targets: ["jarvis"]),
|
||||
],
|
||||
targets: [
|
||||
.target(
|
||||
name: "llama",
|
||||
name: "jarvis",
|
||||
path: ".",
|
||||
exclude: [
|
||||
"cmake",
|
||||
|
|
README.md (170 additions, 170 deletions)
|
@ -1,30 +1,30 @@
|
|||
# llama.cpp
|
||||
# jarvis.cpp
|
||||
|
||||

|
||||

|
||||
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
|
||||
[](https://conan.io/center/llama-cpp)
|
||||
[](https://github.com/ggerganov/jarvis.cpp/actions/workflows/server.yml)
|
||||
[](https://conan.io/center/jarvis-cpp)
|
||||
|
||||
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
|
||||
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/jarvis.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/jarvis.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
|
||||
|
||||
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
|
||||
|
||||
## Recent API changes
|
||||
|
||||
- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
|
||||
- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)
|
||||
- [Changelog for `libjarvis` API](https://github.com/ggerganov/jarvis.cpp/issues/9289)
|
||||
- [Changelog for `jarvis-server` REST API](https://github.com/ggerganov/jarvis.cpp/issues/9291)
|
||||
|
||||
## Hot topics
|
||||
|
||||
- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
|
||||
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
|
||||
- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/jarvis.cpp/discussions/9669**
|
||||
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/jarvis.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
|
||||
|
||||
----
|
||||
|
||||
## Description
|
||||
|
||||
The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
|
||||
The main goal of `jarvis.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
|
||||
variety of hardware - locally and in the cloud.
|
||||
|
||||
- Plain C/C++ implementation without any dependencies
|
||||
|
@ -35,7 +35,7 @@ variety of hardware - locally and in the cloud.
|
|||
- Vulkan and SYCL backend support
|
||||
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
|
||||
|
||||
Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
|
||||
Since its [inception](https://github.com/ggerganov/jarvis.cpp/issues/33#issuecomment-1465108022), the project has
|
||||
improved significantly thanks to many contributions. It is the main playground for developing new features for the
|
||||
[ggml](https://github.com/ggerganov/ggml) library.
|
||||
|
||||
|
@ -52,22 +52,22 @@ Typically finetunes of the base models below are supported as well.
|
|||
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
|
||||
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
|
||||
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
|
||||
- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
|
||||
- [X] [BERT](https://github.com/ggerganov/jarvis.cpp/pull/5423)
|
||||
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
|
||||
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
|
||||
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
|
||||
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
|
||||
- [X] [Starcoder models](https://github.com/ggerganov/jarvis.cpp/pull/3187)
|
||||
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
|
||||
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
|
||||
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
|
||||
- [X] [MPT](https://github.com/ggerganov/jarvis.cpp/pull/3417)
|
||||
- [X] [Bloom](https://github.com/ggerganov/jarvis.cpp/pull/3553)
|
||||
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
|
||||
- [X] [StableLM models](https://huggingface.co/stabilityai)
|
||||
- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
|
||||
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
|
||||
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
|
||||
- [x] [PLaMo-13B](https://github.com/ggerganov/jarvis.cpp/pull/3557)
|
||||
- [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
|
||||
- [x] [GPT-2](https://huggingface.co/gpt2)
|
||||
- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
|
||||
- [x] [Orion 14B](https://github.com/ggerganov/jarvis.cpp/pull/5118)
|
||||
- [x] [InternLM2](https://huggingface.co/models?search=internlm2)
|
||||
- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
|
||||
- [x] [Gemma](https://ai.google.dev/gemma)
|
||||
|
@ -111,36 +111,36 @@ Typically finetunes of the base models below are supported as well.
|
|||
|
||||
**Bindings:**
|
||||
|
||||
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
|
||||
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
|
||||
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
|
||||
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
|
||||
- Python: [abetlen/jarvis-cpp-python](https://github.com/abetlen/jarvis-cpp-python)
|
||||
- Go: [go-skynet/go-jarvis.cpp](https://github.com/go-skynet/go-jarvis.cpp)
|
||||
- Node.js: [withcatai/node-jarvis-cpp](https://github.com/withcatai/node-jarvis-cpp)
|
||||
- JS/TS (jarvis.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/jarviscpp)
|
||||
- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli)
|
||||
- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
|
||||
- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
|
||||
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
|
||||
- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
|
||||
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
|
||||
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
|
||||
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
|
||||
- JavaScript/Wasm (works in browser): [tangledgroup/jarvis-cpp-wasm](https://github.com/tangledgroup/jarvis-cpp-wasm)
|
||||
- Typescript/Wasm (nicer API, available on npm): [ngxson/wjarvis](https://github.com/ngxson/wjarvis)
|
||||
- Ruby: [yoshoku/jarvis_cpp.rb](https://github.com/yoshoku/jarvis_cpp.rb)
|
||||
- Rust (more features): [edgenai/jarvis_cpp-rs](https://github.com/edgenai/jarvis_cpp-rs)
|
||||
- Rust (nicer API): [mdrokz/rust-jarvis.cpp](https://github.com/mdrokz/rust-jarvis.cpp)
|
||||
- Rust (more direct bindings): [utilityai/jarvis-cpp-rs](https://github.com/utilityai/jarvis-cpp-rs)
|
||||
- C#/.NET: [SciSharp/JarvisSharp](https://github.com/SciSharp/JarvisSharp)
|
||||
- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
|
||||
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
|
||||
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
|
||||
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
|
||||
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
|
||||
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
|
||||
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
|
||||
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
|
||||
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
|
||||
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
|
||||
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
|
||||
- Clojure: [phronmophobic/jarvis.clj](https://github.com/phronmophobic/jarvis.clj)
|
||||
- React Native: [mybigday/jarvis.rn](https://github.com/mybigday/jarvis.rn)
|
||||
- Java: [kherud/java-jarvis.cpp](https://github.com/kherud/java-jarvis.cpp)
|
||||
- Zig: [deins/jarvis.cpp.zig](https://github.com/Deins/jarvis.cpp.zig)
|
||||
- Flutter/Dart: [netdur/jarvis_cpp_dart](https://github.com/netdur/jarvis_cpp_dart)
|
||||
- PHP (API bindings and features built on top of jarvis.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/jarvis.cpp/pull/6326)
|
||||
- Guile Scheme: [guile_jarvis_cpp](https://savannah.nongnu.org/projects/guile-jarvis-cpp)
|
||||
- Swift [srgtuszy/jarvis-cpp-swift](https://github.com/srgtuszy/jarvis-cpp-swift)
|
||||
- Swift [ShenghaiWang/SwiftJarvis](https://github.com/ShenghaiWang/SwiftJarvis)
|
||||
|
||||
**UI:**
|
||||
|
||||
Unless otherwise noted these projects are open-source with permissive licensing:
|
||||
|
||||
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
|
||||
- [iohub/collama](https://github.com/iohub/coLLaMA)
|
||||
- [iohub/cojarvis](https://github.com/iohub/coLLaMA)
|
||||
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
|
||||
- [nat/openplayground](https://github.com/nat/openplayground)
|
||||
- [Faraday](https://faraday.dev/) (proprietary)
|
||||
|
@ -149,9 +149,9 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
|||
- [ramalama](https://github.com/containers/ramalama) (MIT)
|
||||
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
|
||||
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
|
||||
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
|
||||
- [Mozilla-Ocho/jarvisfile](https://github.com/Mozilla-Ocho/jarvisfile)
|
||||
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
|
||||
- [ollama/ollama](https://github.com/ollama/ollama)
|
||||
- [ojarvis/ojarvis](https://github.com/ojarvis/ojarvis)
|
||||
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
|
||||
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
|
||||
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
|
||||
|
@ -173,24 +173,24 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
|||
- [AIKit](https://github.com/sozercan/aikit) (MIT)
|
||||
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
|
||||
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
|
||||
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
|
||||
- [Jarvis Assistant](https://github.com/vietanhdev/jarvis-assistant) (GPL)
|
||||
- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
|
||||
|
||||
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
||||
*(to have a project listed here, it should clearly state that it depends on `jarvis.cpp`)*
|
||||
|
||||
**Tools:**
|
||||
|
||||
- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
|
||||
- [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp
|
||||
- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
|
||||
- [akx/ojarvis-dl](https://github.com/akx/ojarvis-dl) – download models from the Ojarvis library to be used directly with jarvis.cpp
|
||||
- [crashr/gppm](https://github.com/crashr/gppm) – launch jarvis.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
|
||||
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
|
||||
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)
|
||||
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-jarvis-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)
|
||||
|
||||
**Infrastructure:**
|
||||
|
||||
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
|
||||
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for jarvis.cpp
|
||||
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
|
||||
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
|
||||
- [jarvis_cpp_canister](https://github.com/onicai/jarvis_cpp_canister) - jarvis.cpp as a smart contract on the Internet Computer, using WebAssembly
|
||||
|
||||
**Games:**
|
||||
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
|
||||
|
@ -201,8 +201,8 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
|||
<summary>Typical run using LLaMA v2 13B on M2 Ultra</summary>
|
||||
|
||||
```
|
||||
$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
|
||||
I llama.cpp build info:
|
||||
$ make -j && ./jarvis-cli -m models/jarvis-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
|
||||
I jarvis.cpp build info:
|
||||
I UNAME_S: Darwin
|
||||
I UNAME_P: arm
|
||||
I UNAME_M: arm64
|
||||
|
@ -215,12 +215,12 @@ I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
|
|||
make: Nothing to be done for `default'.
|
||||
main: build = 1041 (cf658ad)
|
||||
main: seed = 1692823051
|
||||
llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
|
||||
llama_model_loader: - type f32: 81 tensors
|
||||
llama_model_loader: - type q4_0: 281 tensors
|
||||
llama_model_loader: - type q6_K: 1 tensors
|
||||
jarvis_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/jarvis-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
|
||||
jarvis_model_loader: - type f32: 81 tensors
|
||||
jarvis_model_loader: - type q4_0: 281 tensors
|
||||
jarvis_model_loader: - type q6_K: 1 tensors
|
||||
llm_load_print_meta: format = GGUF V1 (latest)
|
||||
llm_load_print_meta: arch = llama
|
||||
llm_load_print_meta: arch = jarvis
|
||||
llm_load_print_meta: vocab type = SPM
|
||||
llm_load_print_meta: n_vocab = 32000
|
||||
llm_load_print_meta: n_merges = 0
|
||||
|
@ -248,8 +248,8 @@ llm_load_print_meta: LF token = 13 '<0x0A>'
|
|||
llm_load_tensors: ggml ctx size = 0.11 MB
|
||||
llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state)
|
||||
...................................................................................................
|
||||
llama_new_context_with_model: kv self size = 400.00 MB
|
||||
llama_new_context_with_model: compute buffer total size = 75.41 MB
|
||||
jarvis_new_context_with_model: kv self size = 400.00 MB
|
||||
jarvis_new_context_with_model: compute buffer total size = 75.41 MB
|
||||
|
||||
system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
|
||||
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
|
||||
|
@ -271,11 +271,11 @@ How does a Website Work?
|
|||
A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
|
||||
The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
|
||||
How to
|
||||
llama_print_timings: load time = 576.45 ms
|
||||
llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second)
|
||||
llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second)
|
||||
llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second)
|
||||
llama_print_timings: total time = 25431.49 ms
|
||||
jarvis_print_timings: load time = 576.45 ms
|
||||
jarvis_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second)
|
||||
jarvis_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second)
|
||||
jarvis_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second)
|
||||
jarvis_print_timings: total time = 25431.49 ms
|
||||
```
|
||||
|
||||
</details>
|
||||
|
@ -297,14 +297,14 @@ Here are the end-to-end binary build and model conversion steps for most support
|
|||
|
||||
Firstly, you need to get the binary. There are different methods that you can follow:
|
||||
- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md)
|
||||
- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md)
|
||||
- Method 2: If you are using MacOS or Linux, you can install jarvis.cpp via [brew, flox or nix](./docs/install.md)
|
||||
- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md)
|
||||
- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases)
|
||||
- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/jarvis.cpp/releases)
|
||||
|
||||
You can run a basic completion using this command:
|
||||
|
||||
```bash
|
||||
llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
|
||||
jarvis-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
|
||||
|
||||
# Output:
|
||||
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
|
||||
|
@ -317,7 +317,7 @@ See [this page](./examples/main/README.md) for a full list of parameters.
|
|||
If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter:
|
||||
|
||||
```bash
|
||||
llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
|
||||
jarvis-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
|
||||
|
||||
# Output:
|
||||
# > hi, who are you?
|
||||
|
@ -327,26 +327,26 @@ llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
|
|||
# Easy peasy! The answer to 1+1 is... 2!
|
||||
```
|
||||
|
||||
By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
|
||||
By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/jarvis.cpp/wiki/Templates-supported-by-jarvis_chat_apply_template)
|
||||
|
||||
```bash
|
||||
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
|
||||
./jarvis-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
|
||||
```
|
||||
|
||||
You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
|
||||
|
||||
```bash
|
||||
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
|
||||
./jarvis-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
|
||||
```
|
||||
|
||||
### Web server

[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
[jarvis.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.

Example usage:

```bash
./llama-server -m your_model.gguf --port 8080
./jarvis-server -m your_model.gguf --port 8080

# Basic web UI can be accessed via browser: http://localhost:8080
# Chat completion endpoint: http://localhost:8080/v1/chat/completions
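# As a quick check of the OpenAI-compatible endpoint above, a plain curl request works.
# This is a minimal sketch; the full set of accepted fields is documented in examples/server/README.md:
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{"messages": [{"role": "user", "content": "Hello!"}]}'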
|
@ -369,16 +369,16 @@ Here is an example of a few-shot interaction, invoked with the command
./examples/chat-13B.sh

# custom arguments using a 13B model
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
./jarvis-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
```

Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `jarvis-cli` example program.



### Persistent Interaction

The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
The prompt, user inputs, and model generations can be saved and resumed across calls to `./jarvis-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
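As a minimal sketch of the cache flag described above (model path and prompt are illustrative), a prompt cache can also be used directly with `jarvis-cli`; rerunning the same command reuses the cached prompt evaluation:

```bash
# First run fills chat.prompt.bin; later runs with the same flags resume from it (illustrative paths)
./jarvis-cli -m ./models/13B/ggml-model-q4_0.gguf \
    --prompt-cache chat.prompt.bin \
    -p "You are a helpful assistant" -n 64
```
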
```bash
|
||||
# Start a new chat
|
||||
|
@ -397,10 +397,10 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
|
|||
|
||||
### Constrained output with grammars

`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
`jarvis.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:

```bash
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
./jarvis-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
```

The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
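As a small sketch of what a hand-written rule can look like, a grammar can also be passed inline with `--grammar` (the rule below is purely illustrative):

```bash
# Constrain the answer to exactly "yes" or "no" (illustrative)
./jarvis-cli -m ./models/13B/ggml-model-q4_0.gguf -n 8 \
    --grammar 'root ::= "yes" | "no"' \
    -p 'Is the sky blue on a clear day? Answer:'
```
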
|
@ -409,7 +409,7 @@ For authoring more complex JSON grammars, you can also check out https://grammar
|
|||
|
||||
## Build

Please refer to [Build llama.cpp locally](./docs/build.md)
Please refer to [Build jarvis.cpp locally](./docs/build.md)
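For orientation, a typical CPU-only CMake build looks roughly like this (a sketch; backend-specific flags such as CUDA or Metal are covered in `docs/build.md`):

```bash
cmake -B build
cmake --build build --config Release -j $(nproc)
```
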
|
||||
## Supported backends
|
||||
|
||||
|
@ -430,11 +430,11 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
|
|||
### Prepare and Quantize

> [!NOTE]
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `jarvis.cpp` main every 6 hours.

To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-jarvis-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.

Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
Note: `convert.py` has been moved to `examples/convert_legacy_jarvis.py` and shouldn't be used for anything other than `Jarvis/Jarvis2/Mistral` models and their derivatives.
It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.

To learn more about quantizing model, [read this documentation](./examples/quantize/README.md)
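As a rough end-to-end sketch (the model directory is illustrative; `convert_hf_to_gguf.py` and the `jarvis-quantize` tool are the ones referenced above and exercised in `ci/run.sh` later in this change):

```bash
# Convert Hugging Face weights to an F16 GGUF, then quantize it to Q4_0 (illustrative paths)
python3 convert_hf_to_gguf.py models/my-model/ --outfile models/my-model/ggml-model-f16.gguf
./jarvis-quantize models/my-model/ggml-model-f16.gguf models/my-model/ggml-model-q4_0.gguf q4_0
```
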
|
@ -444,17 +444,17 @@ To learn more about quantizing model, [read this documentation](./examples/quant
|
|||
You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).

To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md)
To learn more how to measure perplexity using jarvis.cpp, [read this documentation](./examples/perplexity/README.md)
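A typical invocation looks roughly like the one used by `ci/run.sh` later in this change (model and dataset paths are illustrative):

```bash
# Perplexity of a quantized model over a raw text file (illustrative paths)
./jarvis-perplexity -m models/my-model/ggml-model-q4_0.gguf -f wikitext-2-raw/wiki.test.raw
```
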
|
||||
## Contributing

- Contributors can open PRs
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
- Collaborators can push to branches in the `jarvis.cpp` repo and merge PRs into the `master` branch
- Collaborators will be invited based on contributions
- Any help with managing issues, PRs and projects is very appreciated!
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- See [good first issues](https://github.com/ggerganov/jarvis.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/jarvis.cpp/discussions/205)
- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
|
||||
|
||||
## Other documentations
|
||||
|
@ -470,13 +470,13 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
|
|||
- [Running on Docker](./docs/docker.md)
|
||||
- [Build on Android](./docs/android.md)
|
||||
- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
|
||||
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
|
||||
- [GGML tips & tricks](https://github.com/ggerganov/jarvis.cpp/wiki/GGML-Tips-&-Tricks)
|
||||
|
||||
**Seminal papers and background on the models**
|
||||
|
||||
If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
|
||||
- LLaMA:
|
||||
- [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/)
|
||||
- [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-jarvis-meta-ai/)
|
||||
- [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
|
||||
- GPT-3
|
||||
- [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# Security Policy
|
||||
|
||||
- [**Using llama.cpp securely**](#using-llamacpp-securely)
|
||||
- [**Using jarvis.cpp securely**](#using-jarviscpp-securely)
|
||||
- [Untrusted models](#untrusted-models)
|
||||
- [Untrusted inputs](#untrusted-inputs)
|
||||
- [Data privacy](#data-privacy)
|
||||
|
@ -8,7 +8,7 @@
|
|||
- [Multi-Tenant environments](#multi-tenant-environments)
|
||||
- [**Reporting a vulnerability**](#reporting-a-vulnerability)
|
||||
|
||||
## Using llama.cpp securely
|
||||
## Using jarvis.cpp securely
|
||||
|
||||
### Untrusted models
|
||||
Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources.
|
||||
|
@ -57,11 +57,11 @@ If you intend to run multiple models in parallel with shared memory, it is your
|
|||
|
||||
## Reporting a vulnerability

Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
Beware that none of the topics under [Using jarvis.cpp securely](#using-jarviscpp-securely) are considered vulnerabilities of LLaMA C++.

<!-- normal version -->
However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.

Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).
Please disclose it as a private [security advisory](https://github.com/ggerganov/jarvis.cpp/security/advisories/new).

A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
# CI

In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
In addition to [Github Actions](https://github.com/ggerganov/jarvis.cpp/actions) `jarvis.cpp` uses a custom CI framework:

https://github.com/ggml-org/ci

It monitors the `master` branch for new commits and runs the
[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
[ci/run.sh](https://github.com/ggerganov/jarvis.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
to cover various hardware architectures, including GPU and Apple Silicon instances.
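The script can also be exercised locally; a rough sketch follows, assuming its two positional arguments are an output directory and a mount/cache directory (backend toggles follow the `GG_BUILD_*` variables checked inside `ci/run.sh` itself):

```bash
# Run the CI suite locally, writing logs to ./tmp/results and caching models in ./tmp/mnt (illustrative)
mkdir -p tmp
bash ./ci/run.sh ./tmp/results ./tmp/mnt
```
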
|
||||
|
|
ci/run.sh (268 additions, 268 deletions)
|
@ -36,7 +36,7 @@ sd=`dirname $0`
|
|||
cd $sd/../
|
||||
SRC=`pwd`
|
||||
|
||||
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
|
||||
CMAKE_EXTRA="-DJARVIS_FATAL_WARNINGS=ON"
|
||||
|
||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
||||
|
@ -217,7 +217,7 @@ function gg_sum_test_scripts_release {
|
|||
function gg_get_model {
|
||||
local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
|
||||
local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
|
||||
local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
||||
local gguf_2="$MNT/models/open-jarvis/7B-v2/ggml-model-f16.gguf"
|
||||
if [[ -s $gguf_0 ]]; then
|
||||
echo -n "$gguf_0"
|
||||
elif [[ -s $gguf_1 ]]; then
|
||||
|
@ -236,7 +236,7 @@ function gg_run_ctest_with_model_debug {
|
|||
local model; model=$(gg_get_model)
|
||||
cd build-ci-debug
|
||||
set -e
|
||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(JARVISCPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
set +e
|
||||
cd ..
|
||||
}
|
||||
|
@ -247,7 +247,7 @@ function gg_run_ctest_with_model_release {
|
|||
local model; model=$(gg_get_model)
|
||||
cd build-ci-release
|
||||
set -e
|
||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(JARVISCPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
set +e
|
||||
cd ..
|
||||
}
|
||||
|
@ -272,24 +272,24 @@ function gg_sum_ctest_with_model_release {
|
|||
gg_printf '```\n'
|
||||
}
|
||||
|
||||
# open_llama_7b_v2
|
||||
# open_jarvis_7b_v2
|
||||
|
||||
function gg_run_open_llama_7b_v2 {
|
||||
function gg_run_open_jarvis_7b_v2 {
|
||||
cd ${SRC}
|
||||
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
|
||||
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/config.json
|
||||
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/resolve/main/tokenizer.model
|
||||
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/tokenizer_config.json
|
||||
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/special_tokens_map.json
|
||||
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/pytorch_model.bin.index.json
|
||||
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
|
||||
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
|
||||
gg_wget models-mnt/open-jarvis/7B-v2/ https://huggingface.co/openlm-research/open_jarvis_7b_v2/raw/main/generation_config.json
|
||||
|
||||
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||
|
||||
path_models="../models-mnt/open-llama/7B-v2"
|
||||
path_models="../models-mnt/open-jarvis/7B-v2"
|
||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||
|
||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||
|
@ -299,7 +299,7 @@ function gg_run_open_llama_7b_v2 {
|
|||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
python3 ../examples/convert_legacy_jarvis.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
|
@ -315,47 +315,47 @@ function gg_run_open_llama_7b_v2 {
|
|||
|
||||
wiki_test="${path_wiki}/wiki.test.raw"
|
||||
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/jarvis-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/jarvis-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/jarvis-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/jarvis-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/jarvis-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/jarvis-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
(time ./bin/jarvis-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
@ -387,7 +387,7 @@ function gg_run_open_llama_7b_v2 {
|
|||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_open_llama_7b_v2 {
|
||||
function gg_sum_open_jarvis_7b_v2 {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'OpenLLaMA 7B-v2:\n'
|
||||
|
@ -449,45 +449,45 @@ function gg_run_pythia_1_4b {
|
|||
|
||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/jarvis-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/jarvis-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/jarvis-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/jarvis-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/jarvis-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/jarvis-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
(time ./bin/jarvis-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
@ -580,47 +580,47 @@ function gg_run_pythia_2_8b {
|
|||
|
||||
wiki_test="${path_wiki}/wiki.test.raw"
|
||||
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/jarvis-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/jarvis-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/jarvis-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/jarvis-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/jarvis-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/jarvis-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/jarvis-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/jarvis-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
(time ./bin/jarvis-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/jarvis-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
@ -704,10 +704,10 @@ function gg_run_embd_bge_small {
|
|||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/jarvis-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/jarvis-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/jarvis-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
@ -752,7 +752,7 @@ function gg_run_rerank_tiny {
|
|||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
|
||||
# for this model, the SEP token is "</s>"
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
|
||||
(time ./bin/jarvis-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
|
||||
|
||||
# sample output
|
||||
# rerank score 0: 0.029
|
||||
|
@ -804,11 +804,11 @@ function gg_check_build_requirements {
|
|||
|
||||
## main
|
||||
|
||||
export LLAMA_LOG_PREFIX=1
|
||||
export LLAMA_LOG_TIMESTAMPS=1
|
||||
export JARVIS_LOG_PREFIX=1
|
||||
export JARVIS_LOG_TIMESTAMPS=1
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
|
||||
# Create symlink: ./jarvis.cpp/models-mnt -> $MNT/models/models-mnt
|
||||
rm -rf ${SRC}/models-mnt
|
||||
mnt_models=${MNT}/models
|
||||
mkdir -p ${mnt_models}
|
||||
|
@ -841,7 +841,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
|||
test $ret -eq 0 && gg_run pythia_1_4b
|
||||
else
|
||||
test $ret -eq 0 && gg_run pythia_2_8b
|
||||
#test $ret -eq 0 && gg_run open_llama_7b_v2
|
||||
#test $ret -eq 0 && gg_run open_jarvis_7b_v2
|
||||
fi
|
||||
test $ret -eq 0 && gg_run ctest_with_model_debug
|
||||
test $ret -eq 0 && gg_run ctest_with_model_release
|
||||
|
@@ -1,7 +1,7 @@
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(JARVIS_VERSION @JARVIS_INSTALL_VERSION@)
set(JARVIS_BUILD_COMMIT @JARVIS_BUILD_COMMIT@)
set(JARVIS_BUILD_NUMBER @JARVIS_BUILD_NUMBER@)
set(JARVIS_SHARED_LIB @BUILD_SHARED_LIBS@)

set(GGML_BLAS @GGML_BLAS@)
set(GGML_CUDA @GGML_CUDA@)
@@ -18,9 +18,9 @@ set(GGML_OPENMP @GGML_OPENMP@)

@PACKAGE_INIT@

set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
set_and_check(JARVIS_INCLUDE_DIR "@PACKAGE_JARVIS_INCLUDE_INSTALL_DIR@")
set_and_check(JARVIS_LIB_DIR "@PACKAGE_JARVIS_LIB_INSTALL_DIR@")
set_and_check(JARVIS_BIN_DIR "@PACKAGE_JARVIS_BIN_INSTALL_DIR@")

# Ensure transient dependencies satisfied

@@ -66,25 +66,25 @@ endif()

find_library(ggml_LIBRARY ggml
REQUIRED
HINTS ${LLAMA_LIB_DIR})
HINTS ${JARVIS_LIB_DIR})

find_library(llama_LIBRARY llama
find_library(jarvis_LIBRARY jarvis
REQUIRED
HINTS ${LLAMA_LIB_DIR})
HINTS ${JARVIS_LIB_DIR})

set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
set(_jarvis_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
set(_jarvis_transient_defines "@GGML_TRANSIENT_DEFINES@")

add_library(llama UNKNOWN IMPORTED)
add_library(jarvis UNKNOWN IMPORTED)

set_target_properties(llama
set_target_properties(jarvis
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
INTERFACE_INCLUDE_DIRECTORIES "${JARVIS_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "${_jarvis_link_deps}"
INTERFACE_COMPILE_DEFINITIONS "${_jarvis_transient_defines}"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${llama_LIBRARY}"
IMPORTED_LOCATION "${jarvis_LIBRARY}"
INTERFACE_COMPILE_FEATURES cxx_std_11
POSITION_INDEPENDENT_CODE ON )

check_required_components(Llama)
check_required_components(Jarvis)
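The config template above exports an imported `jarvis` target whose include directory, link dependencies, and compile definitions come from the `JARVIS_*` variables it sets. As a rough illustration, a downstream project that calls `find_package(Jarvis)` and links the `jarvis` target could compile a file like the sketch below; the header name `jarvis.h` and the `jarvis_print_system_info()` call are taken from other hunks of this diff, everything else is an assumption rather than part of this commit.

```cpp
// main.cpp -- hypothetical consumer of the imported "jarvis" CMake target.
// Assumes the renamed public header is jarvis.h and that
// jarvis_print_system_info() returns a printable C string, as its use in
// common/common.cpp later in this diff suggests.
#include <cstdio>

#include "jarvis.h"

int main() {
    // The imported target carries JARVIS_INCLUDE_DIR and the link deps,
    // so the consumer only needs: target_link_libraries(app PRIVATE jarvis)
    std::printf("%s\n", jarvis_print_system_info());
    return 0;
}
```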
@@ -3,8 +3,8 @@ exec_prefix=${prefix}
libdir=${exec_prefix}/lib
includedir=${prefix}/include

Name: llama
Name: jarvis
Description: Port of Facebook's LLaMA model in C/C++
Version: @PROJECT_VERSION@
Libs: -L${libdir} -lllama
Libs: -L${libdir} -ljarvis
Cflags: -I${includedir}
@@ -74,17 +74,17 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()

set(LLAMA_COMMON_EXTRA_LIBS build_info)
set(JARVIS_COMMON_EXTRA_LIBS build_info)

# Use curl to download model url
if (LLAMA_CURL)
if (JARVIS_CURL)
find_package(CURL REQUIRED)
add_definitions(-DLLAMA_USE_CURL)
add_definitions(-DJARVIS_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
find_library(CURL_LIBRARY curl REQUIRED)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
set(JARVIS_COMMON_EXTRA_LIBS ${JARVIS_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
endif ()

target_include_directories(${TARGET} PUBLIC .)
target_compile_features (${TARGET} PUBLIC cxx_std_11)
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
target_link_libraries (${TARGET} PRIVATE ${JARVIS_COMMON_EXTRA_LIBS} PUBLIC jarvis Threads::Threads)
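The `JARVIS_CURL` option above only adds the `-DJARVIS_USE_CURL` definition and links libcurl into the `common` target; the C++ sources then branch on that macro, as the common/common.cpp hunks further down show. A minimal sketch of the same guard pattern (the helper name is made up for illustration):

```cpp
// Sketch of how the JARVIS_USE_CURL definition added by the CMake snippet
// above is consumed in C++ code. download_backend() is a hypothetical name.
#if defined(JARVIS_USE_CURL)
#include <curl/curl.h>

static const char * download_backend() {
    return "libcurl";
}
#else
static const char * download_backend() {
    return "none (built without JARVIS_CURL)";
}
#endif
```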
common/arg.cpp (382 changes): file diff suppressed because it is too large
common/arg.h (12 changes):
@@ -11,7 +11,7 @@
//

struct common_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::set<enum jarvis_example> examples = {JARVIS_EXAMPLE_COMMON};
std::vector<const char *> args;
const char * value_hint = nullptr; // help text or example for arg value
const char * value_hint_2 = nullptr; // for second arg value
@@ -52,17 +52,17 @@ struct common_arg {
void (*handler)(common_params & params, const std::string &, const std::string &)
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}

common_arg & set_examples(std::initializer_list<enum llama_example> examples);
common_arg & set_examples(std::initializer_list<enum jarvis_example> examples);
common_arg & set_env(const char * env);
common_arg & set_sparam();
bool in_example(enum llama_example ex);
bool in_example(enum jarvis_example ex);
bool get_value_from_env(std::string & output);
bool has_value_from_env();
std::string to_string();
};

struct common_params_context {
enum llama_example ex = LLAMA_EXAMPLE_COMMON;
enum jarvis_example ex = JARVIS_EXAMPLE_COMMON;
common_params & params;
std::vector<common_arg> options;
void(*print_usage)(int, char **) = nullptr;
@@ -71,7 +71,7 @@ struct common_params_context {

// parse input arguments from CLI
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
bool common_params_parse(int argc, char ** argv, common_params & params, jarvis_example ex, void(*print_usage)(int, char **) = nullptr);

// function to be used by test-arg-parser
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
common_params_context common_params_parser_init(common_params & params, jarvis_example ex, void(*print_usage)(int, char **) = nullptr);
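The renamed declarations above keep their previous shape: `common_params_parse` takes the argv pair, a `common_params` struct, a `jarvis_example` selector, and an optional usage callback. A rough sketch of a tool entry point built on them; the header names and the `common_init()` call come from other hunks of this diff, the rest is illustrative.

```cpp
// Hypothetical entry point using common_params_parse as declared above.
#include <cstdio>

#include "arg.h"     // assumed location of common_params_parse (common/arg.h)
#include "common.h"  // assumed location of common_params / common_init

static void print_usage(int /*argc*/, char ** argv) {
    std::printf("usage: %s [options]\n", argv[0]);
}

int main(int argc, char ** argv) {
    common_params params;
    // JARVIS_EXAMPLE_COMMON mirrors the default used in common_arg above.
    if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_COMMON, print_usage)) {
        return 1;  // an invalid value already printed the per-argument usage
    }
    common_init();  // defined in common/common.cpp below: sets up logging
    return 0;
}
```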
@@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
int JARVIS_BUILD_NUMBER = @BUILD_NUMBER@;
char const *JARVIS_COMMIT = "@BUILD_COMMIT@";
char const *JARVIS_COMPILER = "@BUILD_COMPILER@";
char const *JARVIS_BUILD_TARGET = "@BUILD_TARGET@";
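build-info.cpp.in only defines the four renamed globals; they are consumed by the `LOG_INF("build: ...")` line in `common_init()` further down. A small sketch of reading them from another translation unit (the extern block mirrors the definitions; the standalone `main()` is illustrative only):

```cpp
// Hypothetical sketch: printing the build metadata defined above from
// another translation unit, after the @BUILD_*@ placeholders are substituted.
#include <cstdio>

extern int          JARVIS_BUILD_NUMBER;
extern char const * JARVIS_COMMIT;
extern char const * JARVIS_COMPILER;
extern char const * JARVIS_BUILD_TARGET;

int main() {
    std::printf("build: %d (%s) with %s for %s\n",
                JARVIS_BUILD_NUMBER, JARVIS_COMMIT, JARVIS_COMPILER, JARVIS_BUILD_TARGET);
    return 0;
}
```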
@ -8,7 +8,7 @@
|
|||
#define JSON_ASSERT GGML_ASSERT
|
||||
#include "json.hpp"
|
||||
#include "json-schema-to-grammar.h"
|
||||
#include "llama.h"
|
||||
#include "jarvis.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cinttypes>
|
||||
|
@ -48,7 +48,7 @@
|
|||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#if defined(LLAMA_USE_CURL)
|
||||
#if defined(JARVIS_USE_CURL)
|
||||
#include <curl/curl.h>
|
||||
#include <curl/easy.h>
|
||||
#include <future>
|
||||
|
@ -58,7 +58,7 @@
|
|||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#if defined(LLAMA_USE_CURL)
|
||||
#if defined(JARVIS_USE_CURL)
|
||||
#ifdef __linux__
|
||||
#include <linux/limits.h>
|
||||
#elif defined(_WIN32)
|
||||
|
@ -66,8 +66,8 @@
|
|||
#else
|
||||
#include <sys/syslimits.h>
|
||||
#endif
|
||||
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
|
||||
#endif // LLAMA_USE_CURL
|
||||
#define JARVIS_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
|
||||
#endif // JARVIS_USE_CURL
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
|
@ -364,8 +364,8 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
|
|||
}
|
||||
|
||||
void common_init() {
|
||||
llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
|
||||
if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
|
||||
jarvis_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
|
||||
if (LOG_DEFAULT_JARVIS <= common_log_verbosity_thold) {
|
||||
common_log_add(common_log_main(), level, "%s", text);
|
||||
}
|
||||
}, NULL);
|
||||
|
@ -376,7 +376,7 @@ void common_init() {
|
|||
const char * build_type = " (debug)";
|
||||
#endif
|
||||
|
||||
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
|
||||
LOG_INF("build: %d (%s) with %s for %s%s\n", JARVIS_BUILD_NUMBER, JARVIS_COMMIT, JARVIS_COMPILER, JARVIS_BUILD_TARGET, build_type);
|
||||
}
|
||||
|
||||
std::string common_params_get_system_info(const common_params & params) {
|
||||
|
@ -389,9 +389,9 @@ std::string common_params_get_system_info(const common_params & params) {
|
|||
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
|
||||
// TODO: windows + arm64 + mingw64
|
||||
DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
|
||||
os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
|
||||
os << " / " << logicalProcessorCount << " | " << jarvis_print_system_info();
|
||||
#else
|
||||
os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
|
||||
os << " / " << std::thread::hardware_concurrency() << " | " << jarvis_print_system_info();
|
||||
#endif
|
||||
|
||||
return os.str();
|
||||
|
@ -483,7 +483,7 @@ std::string string_from(const std::vector<int> & values) {
|
|||
return buf.str();
|
||||
}
|
||||
|
||||
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
|
||||
std::string string_from(const struct jarvis_context * ctx, const std::vector<jarvis_token> & tokens) {
|
||||
std::stringstream buf;
|
||||
|
||||
buf << "[ ";
|
||||
|
@ -514,7 +514,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
|
|||
return buf.str();
|
||||
}
|
||||
|
||||
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
|
||||
std::string string_from(const struct jarvis_context * ctx, const struct jarvis_batch & batch) {
|
||||
std::stringstream buf;
|
||||
|
||||
buf << "[ ";
|
||||
|
@ -586,27 +586,27 @@ void string_process_escapes(std::string & input) {
|
|||
input.resize(output_idx);
|
||||
}
|
||||
|
||||
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
|
||||
bool string_parse_kv_override(const char * data, std::vector<jarvis_model_kv_override> & overrides) {
|
||||
const char * sep = strchr(data, '=');
|
||||
if (sep == nullptr || sep - data >= 128) {
|
||||
LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
|
||||
return false;
|
||||
}
|
||||
llama_model_kv_override kvo;
|
||||
jarvis_model_kv_override kvo;
|
||||
std::strncpy(kvo.key, data, sep - data);
|
||||
kvo.key[sep - data] = 0;
|
||||
sep++;
|
||||
if (strncmp(sep, "int:", 4) == 0) {
|
||||
sep += 4;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.tag = JARVIS_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.val_i64 = std::atol(sep);
|
||||
} else if (strncmp(sep, "float:", 6) == 0) {
|
||||
sep += 6;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
|
||||
kvo.tag = JARVIS_KV_OVERRIDE_TYPE_FLOAT;
|
||||
kvo.val_f64 = std::atof(sep);
|
||||
} else if (strncmp(sep, "bool:", 5) == 0) {
|
||||
sep += 5;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
|
||||
kvo.tag = JARVIS_KV_OVERRIDE_TYPE_BOOL;
|
||||
if (std::strcmp(sep, "true") == 0) {
|
||||
kvo.val_bool = true;
|
||||
} else if (std::strcmp(sep, "false") == 0) {
|
||||
|
@ -617,7 +617,7 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
|
|||
}
|
||||
} else if (strncmp(sep, "str:", 4) == 0) {
|
||||
sep += 4;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
||||
kvo.tag = JARVIS_KV_OVERRIDE_TYPE_STR;
|
||||
if (strlen(sep) > 127) {
|
||||
LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
|
||||
return false;
|
||||
|
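As the parser above shows, each override string has the form `<key>=<type>:<value>` with `<type>` one of `int`, `float`, `bool`, or `str`; keys and `str` values are limited to 127 characters. A usage sketch follows; the keys are placeholders and the header names are assumptions.

```cpp
// Hypothetical usage of string_parse_kv_override as implemented above.
#include <cstdio>
#include <vector>

#include "common.h"  // assumed to declare string_parse_kv_override
#include "jarvis.h"  // assumed to declare jarvis_model_kv_override

int main() {
    std::vector<jarvis_model_kv_override> kv_overrides;

    const char * samples[] = {
        "example.int.key=int:42",       // -> JARVIS_KV_OVERRIDE_TYPE_INT
        "example.float.key=float:0.5",  // -> JARVIS_KV_OVERRIDE_TYPE_FLOAT
        "example.bool.key=bool:true",   // -> JARVIS_KV_OVERRIDE_TYPE_BOOL
        "example.str.key=str:hello",    // -> JARVIS_KV_OVERRIDE_TYPE_STR
    };

    for (const char * s : samples) {
        if (!string_parse_kv_override(s, kv_overrides)) {
            std::fprintf(stderr, "rejected override: %s\n", s);
        }
    }

    std::printf("accepted %zu overrides\n", kv_overrides.size());
    return 0;
}
```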
@ -788,8 +788,8 @@ std::string fs_get_cache_directory() {
|
|||
}
|
||||
return p;
|
||||
};
|
||||
if (getenv("LLAMA_CACHE")) {
|
||||
cache_directory = std::getenv("LLAMA_CACHE");
|
||||
if (getenv("JARVIS_CACHE")) {
|
||||
cache_directory = std::getenv("JARVIS_CACHE");
|
||||
} else {
|
||||
#ifdef __linux__
|
||||
if (std::getenv("XDG_CACHE_HOME")) {
|
||||
|
@ -803,7 +803,7 @@ std::string fs_get_cache_directory() {
|
|||
cache_directory = std::getenv("LOCALAPPDATA");
|
||||
#endif // __linux__
|
||||
cache_directory = ensure_trailing_slash(cache_directory);
|
||||
cache_directory += "llama.cpp";
|
||||
cache_directory += "jarvis.cpp";
|
||||
}
|
||||
return ensure_trailing_slash(cache_directory);
|
||||
}
|
||||
|
@ -824,16 +824,16 @@ std::string fs_get_cache_file(const std::string & filename) {
|
|||
//
|
||||
struct common_init_result common_init_from_params(common_params & params) {
|
||||
common_init_result iparams;
|
||||
auto mparams = common_model_params_to_llama(params);
|
||||
auto mparams = common_model_params_to_jarvis(params);
|
||||
|
||||
llama_model * model = nullptr;
|
||||
jarvis_model * model = nullptr;
|
||||
|
||||
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
|
||||
model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
||||
} else if (!params.model_url.empty()) {
|
||||
model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
||||
} else {
|
||||
model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||
model = jarvis_load_model_from_file(params.model.c_str(), mparams);
|
||||
}
|
||||
|
||||
if (model == NULL) {
|
||||
|
@ -844,58 +844,58 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
if (params.reranking) {
|
||||
bool ok = true;
|
||||
|
||||
if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
|
||||
if (jarvis_token_bos(model) == JARVIS_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
||||
if (jarvis_token_eos(model) == JARVIS_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
|
||||
if (jarvis_token_sep(model) == JARVIS_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
llama_free_model(model);
|
||||
jarvis_free_model(model);
|
||||
|
||||
return iparams;
|
||||
}
|
||||
}
|
||||
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
auto cparams = common_context_params_to_jarvis(params);
|
||||
|
||||
llama_context * lctx = llama_new_context_with_model(model, cparams);
|
||||
jarvis_context * lctx = jarvis_new_context_with_model(model, cparams);
|
||||
if (lctx == NULL) {
|
||||
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
||||
llama_free_model(model);
|
||||
jarvis_free_model(model);
|
||||
return iparams;
|
||||
}
|
||||
|
||||
if (!params.control_vectors.empty()) {
|
||||
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
|
||||
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
|
||||
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = jarvis_n_layer(model);
|
||||
|
||||
const auto cvec = common_control_vector_load(params.control_vectors);
|
||||
if (cvec.n_embd == -1) {
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
jarvis_free(lctx);
|
||||
jarvis_free_model(model);
|
||||
|
||||
return iparams;
|
||||
}
|
||||
|
||||
int err = llama_control_vector_apply(lctx,
|
||||
int err = jarvis_control_vector_apply(lctx,
|
||||
cvec.data.data(),
|
||||
cvec.data.size(),
|
||||
cvec.n_embd,
|
||||
params.control_vector_layer_start,
|
||||
params.control_vector_layer_end);
|
||||
if (err) {
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
jarvis_free(lctx);
|
||||
jarvis_free_model(model);
|
||||
|
||||
return iparams;
|
||||
}
|
||||
|
@ -906,11 +906,11 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
common_lora_adapter_container loaded_la;
|
||||
loaded_la.path = la.path;
|
||||
loaded_la.scale = la.scale;
|
||||
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
|
||||
loaded_la.adapter = jarvis_lora_adapter_init(model, la.path.c_str());
|
||||
if (loaded_la.adapter == nullptr) {
|
||||
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
jarvis_free(lctx);
|
||||
jarvis_free_model(model);
|
||||
return iparams;
|
||||
}
|
||||
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
|
||||
|
@ -919,7 +919,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
common_lora_adapters_apply(lctx, iparams.lora_adapters);
|
||||
}
|
||||
|
||||
if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
||||
if (params.sparams.ignore_eos && jarvis_token_eos(model) == JARVIS_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
||||
params.sparams.ignore_eos = false;
|
||||
}
|
||||
|
@ -927,35 +927,35 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
if (params.warmup) {
|
||||
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
||||
|
||||
std::vector<llama_token> tmp;
|
||||
llama_token bos = llama_token_bos(model);
|
||||
llama_token eos = llama_token_eos(model);
|
||||
std::vector<jarvis_token> tmp;
|
||||
jarvis_token bos = jarvis_token_bos(model);
|
||||
jarvis_token eos = jarvis_token_eos(model);
|
||||
// some models (e.g. T5) don't have a BOS token
|
||||
if (bos != LLAMA_TOKEN_NULL) {
|
||||
if (bos != JARVIS_TOKEN_NULL) {
|
||||
tmp.push_back(bos);
|
||||
}
|
||||
if (eos != LLAMA_TOKEN_NULL) {
|
||||
if (eos != JARVIS_TOKEN_NULL) {
|
||||
tmp.push_back(eos);
|
||||
}
|
||||
if (tmp.empty()) {
|
||||
tmp.push_back(0);
|
||||
}
|
||||
|
||||
if (llama_model_has_encoder(model)) {
|
||||
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
|
||||
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
|
||||
if (jarvis_model_has_encoder(model)) {
|
||||
jarvis_encode(lctx, jarvis_batch_get_one(tmp.data(), tmp.size()));
|
||||
jarvis_token decoder_start_token_id = jarvis_model_decoder_start_token(model);
|
||||
if (decoder_start_token_id == -1) {
|
||||
decoder_start_token_id = bos;
|
||||
}
|
||||
tmp.clear();
|
||||
tmp.push_back(decoder_start_token_id);
|
||||
}
|
||||
if (llama_model_has_decoder(model)) {
|
||||
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
|
||||
if (jarvis_model_has_decoder(model)) {
|
||||
jarvis_decode(lctx, jarvis_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
|
||||
}
|
||||
llama_kv_cache_clear(lctx);
|
||||
llama_synchronize(lctx);
|
||||
llama_perf_context_reset(lctx);
|
||||
jarvis_kv_cache_clear(lctx);
|
||||
jarvis_synchronize(lctx);
|
||||
jarvis_perf_context_reset(lctx);
|
||||
}
|
||||
|
||||
iparams.model = model;
|
||||
|
@ -964,17 +964,17 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
return iparams;
|
||||
}
|
||||
|
||||
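Taken together, `common_init_from_params` above opens the model (HF repo, URL, or local file), creates the context, applies control vectors and LoRA adapters, and optionally runs the warmup pass before returning its result. A rough caller-side sketch; only the `.model` field is visible in this diff, and the header name is an assumption.

```cpp
// Hypothetical caller of common_init_from_params as defined above.
#include <cstdio>

#include "common.h"  // assumed to declare common_params / common_init_from_params

int run(common_params & params) {
    common_init_result init = common_init_from_params(params);
    if (init.model == nullptr) {  // .model is filled in near the end of the function above
        std::fprintf(stderr, "failed to load model '%s'\n", params.model.c_str());
        return 1;
    }
    // ... generation / embedding work would go here; the result also owns
    // the jarvis_context created above, which a real caller must release.
    return 0;
}
```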
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
|
||||
llama_lora_adapter_clear(ctx);
|
||||
void common_lora_adapters_apply(struct jarvis_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
|
||||
jarvis_lora_adapter_clear(ctx);
|
||||
for (auto & la : lora_adapters) {
|
||||
if (la.scale != 0.0f) {
|
||||
llama_lora_adapter_set(ctx, la.adapter, la.scale);
|
||||
jarvis_lora_adapter_set(ctx, la.adapter, la.scale);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct llama_model_params common_model_params_to_llama(const common_params & params) {
|
||||
auto mparams = llama_model_default_params();
|
||||
struct jarvis_model_params common_model_params_to_jarvis(const common_params & params) {
|
||||
auto mparams = jarvis_model_default_params();
|
||||
|
||||
if (params.n_gpu_layers != -1) {
|
||||
mparams.n_gpu_layers = params.n_gpu_layers;
|
||||
|
@ -1025,8 +1025,8 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
|
|||
throw std::runtime_error("Unsupported cache type: " + s);
|
||||
}
|
||||
|
||||
struct llama_context_params common_context_params_to_llama(const common_params & params) {
|
||||
auto cparams = llama_context_default_params();
|
||||
struct jarvis_context_params common_context_params_to_jarvis(const common_params & params) {
|
||||
auto cparams = jarvis_context_default_params();
|
||||
|
||||
cparams.n_ctx = params.n_ctx;
|
||||
cparams.n_seq_max = params.n_parallel;
|
||||
|
@ -1056,7 +1056,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
|||
|
||||
if (params.reranking) {
|
||||
cparams.embeddings = true;
|
||||
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
|
||||
cparams.pooling_type = JARVIS_POOLING_TYPE_RANK;
|
||||
}
|
||||
|
||||
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
|
||||
|
@ -1081,7 +1081,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
|
|||
return tpp;
|
||||
}
|
||||
|
||||
#ifdef LLAMA_USE_CURL
|
||||
#ifdef JARVIS_USE_CURL
|
||||
|
||||
#define CURL_MAX_RETRY 3
|
||||
#define CURL_RETRY_DELAY_SECONDS 2
|
||||
|
@ -1279,7 +1279,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
|
|||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
|
||||
|
||||
// helper function to hide password in URL
|
||||
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
|
||||
auto jarvis_download_hide_password_in_url = [](const std::string & url) -> std::string {
|
||||
std::size_t protocol_pos = url.find("://");
|
||||
if (protocol_pos == std::string::npos) {
|
||||
return url; // Malformed URL
|
||||
|
@ -1295,7 +1295,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
|
|||
|
||||
// start the download
|
||||
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
||||
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
||||
jarvis_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
||||
if (!was_perform_successful) {
|
||||
return false;
|
||||
|
@ -1329,11 +1329,11 @@ static bool common_download_file(const std::string & url, const std::string & pa
|
|||
return true;
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_url(
|
||||
struct jarvis_model * common_load_model_from_url(
|
||||
const char * model_url,
|
||||
const char * path_model,
|
||||
const char * hf_token,
|
||||
const struct llama_model_params & params) {
|
||||
const struct jarvis_model_params & params) {
|
||||
// Basic validation of the model_url
|
||||
if (!model_url || strlen(model_url) == 0) {
|
||||
LOG_ERR("%s: invalid model_url\n", __func__);
|
||||
|
@ -1367,17 +1367,17 @@ struct llama_model * common_load_model_from_url(
|
|||
|
||||
if (n_split > 1) {
|
||||
char split_prefix[PATH_MAX] = {0};
|
||||
char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
||||
char split_url_prefix[JARVIS_CURL_MAX_URL_LENGTH] = {0};
|
||||
|
||||
// Verify the first split file format
|
||||
// and extract split URL and PATH prefixes
|
||||
{
|
||||
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
|
||||
if (!jarvis_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
|
||||
if (!jarvis_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
|
||||
return NULL;
|
||||
}
|
||||
|
@ -1388,10 +1388,10 @@ struct llama_model * common_load_model_from_url(
|
|||
for (int idx = 1; idx < n_split; idx++) {
|
||||
futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
|
||||
char split_path[PATH_MAX] = {0};
|
||||
llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
|
||||
jarvis_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
|
||||
|
||||
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
||||
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
|
||||
char split_url[JARVIS_CURL_MAX_URL_LENGTH] = {0};
|
||||
jarvis_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
|
||||
|
||||
return common_download_file(split_url, split_path, hf_token);
|
||||
}, idx));
|
||||
|
@ -1405,19 +1405,19 @@ struct llama_model * common_load_model_from_url(
|
|||
}
|
||||
}
|
||||
|
||||
return llama_load_model_from_file(path_model, params);
|
||||
return jarvis_load_model_from_file(path_model, params);
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
struct jarvis_model * common_load_model_from_hf(
|
||||
const char * repo,
|
||||
const char * model,
|
||||
const char * path_model,
|
||||
const char * hf_token,
|
||||
const struct llama_model_params & params) {
|
||||
const struct jarvis_model_params & params) {
|
||||
// construct hugging face model url:
|
||||
//
|
||||
// --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
|
||||
// https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
|
||||
// --repo ggml-org/models --file tinyjarvis-1.1b/ggml-model-f16.gguf
|
||||
// https://huggingface.co/ggml-org/models/resolve/main/tinyjarvis-1.1b/ggml-model-f16.gguf
|
||||
//
|
||||
// --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
|
||||
// https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
|
||||
|
@ -1433,42 +1433,42 @@ struct llama_model * common_load_model_from_hf(
|
|||
|
||||
#else
|
||||
|
||||
struct llama_model * common_load_model_from_url(
|
||||
struct jarvis_model * common_load_model_from_url(
|
||||
const char * /*model_url*/,
|
||||
const char * /*path_model*/,
|
||||
const char * /*hf_token*/,
|
||||
const struct llama_model_params & /*params*/) {
|
||||
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
|
||||
const struct jarvis_model_params & /*params*/) {
|
||||
LOG_WRN("%s: jarvis.cpp built without libcurl, downloading from an url not supported.\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
struct jarvis_model * common_load_model_from_hf(
|
||||
const char * /*repo*/,
|
||||
const char * /*model*/,
|
||||
const char * /*path_model*/,
|
||||
const char * /*hf_token*/,
|
||||
const struct llama_model_params & /*params*/) {
|
||||
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
|
||||
const struct jarvis_model_params & /*params*/) {
|
||||
LOG_WRN("%s: jarvis.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
#endif // LLAMA_USE_CURL
|
||||
#endif // JARVIS_USE_CURL
|
||||
|
||||
//
|
||||
// Batch utils
|
||||
//
|
||||
|
||||
void common_batch_clear(struct llama_batch & batch) {
|
||||
void common_batch_clear(struct jarvis_batch & batch) {
|
||||
batch.n_tokens = 0;
|
||||
}
|
||||
|
||||
void common_batch_add(
|
||||
struct llama_batch & batch,
|
||||
llama_token id,
|
||||
llama_pos pos,
|
||||
const std::vector<llama_seq_id> & seq_ids,
|
||||
struct jarvis_batch & batch,
|
||||
jarvis_token id,
|
||||
jarvis_pos pos,
|
||||
const std::vector<jarvis_seq_id> & seq_ids,
|
||||
bool logits) {
|
||||
GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
|
||||
GGML_ASSERT(batch.seq_id[batch.n_tokens] && "jarvis_batch size exceeded");
|
||||
|
||||
batch.token [batch.n_tokens] = id;
|
||||
batch.pos [batch.n_tokens] = pos;
|
||||
|
@ -1485,26 +1485,26 @@ void common_batch_add(
|
|||
// Vocab utils
|
||||
//
|
||||
|
||||
std::vector<llama_token> common_tokenize(
|
||||
const struct llama_context * ctx,
|
||||
std::vector<jarvis_token> common_tokenize(
|
||||
const struct jarvis_context * ctx,
|
||||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special) {
|
||||
return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
|
||||
return common_tokenize(jarvis_get_model(ctx), text, add_special, parse_special);
|
||||
}
|
||||
|
||||
std::vector<llama_token> common_tokenize(
|
||||
const struct llama_model * model,
|
||||
std::vector<jarvis_token> common_tokenize(
|
||||
const struct jarvis_model * model,
|
||||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special) {
|
||||
// upper limit for the number of tokens
|
||||
int n_tokens = text.length() + 2 * add_special;
|
||||
std::vector<llama_token> result(n_tokens);
|
||||
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
std::vector<jarvis_token> result(n_tokens);
|
||||
n_tokens = jarvis_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
int check = jarvis_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
|
@ -1512,13 +1512,13 @@ std::vector<llama_token> common_tokenize(
|
|||
return result;
|
||||
}
|
||||
|
||||
std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
|
||||
std::string common_token_to_piece(const struct jarvis_context * ctx, jarvis_token token, bool special) {
|
||||
std::string piece;
|
||||
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\0'
|
||||
const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
|
||||
const int n_chars = jarvis_token_to_piece(jarvis_get_model(ctx), token, &piece[0], piece.size(), 0, special);
|
||||
if (n_chars < 0) {
|
||||
piece.resize(-n_chars);
|
||||
int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
|
||||
int check = jarvis_token_to_piece(jarvis_get_model(ctx), token, &piece[0], piece.size(), 0, special);
|
||||
GGML_ASSERT(check == -n_chars);
|
||||
}
|
||||
else {
|
||||
|
@ -1528,13 +1528,13 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
|
|||
return piece;
|
||||
}
|
||||
|
||||
std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
|
||||
std::string common_detokenize(jarvis_context * ctx, const std::vector<jarvis_token> & tokens, bool special) {
|
||||
std::string text;
|
||||
text.resize(std::max(text.capacity(), tokens.size()));
|
||||
int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
|
||||
int32_t n_chars = jarvis_detokenize(jarvis_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
|
||||
if (n_chars < 0) {
|
||||
text.resize(-n_chars);
|
||||
n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
|
||||
n_chars = jarvis_detokenize(jarvis_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
|
||||
GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
|
||||
}
|
||||
|
||||
|
@ -1549,18 +1549,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
|
|||
//
|
||||
|
||||
bool common_chat_verify_template(const std::string & tmpl) {
|
||||
llama_chat_message chat[] = {{"user", "test"}};
|
||||
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
|
||||
jarvis_chat_message chat[] = {{"user", "test"}};
|
||||
int res = jarvis_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
|
||||
return res >= 0;
|
||||
}
|
||||
|
||||
std::string common_chat_apply_template(const struct llama_model * model,
|
||||
std::string common_chat_apply_template(const struct jarvis_model * model,
|
||||
const std::string & tmpl,
|
||||
const std::vector<common_chat_msg> & msgs,
|
||||
bool add_ass) {
|
||||
int alloc_size = 0;
|
||||
bool fallback = false; // indicates whether we must fall back to the default chatml template
|
||||
std::vector<llama_chat_message> chat;
|
||||
std::vector<jarvis_chat_message> chat;
|
||||
for (auto & msg : msgs) {
|
||||
chat.push_back({msg.role.c_str(), msg.content.c_str()});
|
||||
alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
|
||||
|
@ -1570,17 +1570,17 @@ std::string common_chat_apply_template(const struct llama_model * model,
|
|||
std::vector<char> buf(alloc_size);
|
||||
|
||||
// run the first time to get the total output length
|
||||
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
int32_t res = jarvis_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
|
||||
// error: chat template is not supported
|
||||
if (res < 0) {
|
||||
if (ptr_tmpl != nullptr) {
|
||||
// if the custom "tmpl" is not supported, we throw an error
|
||||
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
|
||||
// this is a bit redundant (for good), since we're not sure if user validated the custom template with jarvis_chat_verify_template()
|
||||
throw std::runtime_error("this custom template is not supported");
|
||||
} else {
|
||||
// If the built-in template is not supported, we default to chatml
|
||||
res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
res = jarvis_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
fallback = true;
|
||||
}
|
||||
}
|
||||
|
@ -1588,7 +1588,7 @@ std::string common_chat_apply_template(const struct llama_model * model,
|
|||
// if it turns out that our buffer is too small, we resize it
|
||||
if ((size_t) res > buf.size()) {
|
||||
buf.resize(res);
|
||||
res = llama_chat_apply_template(
|
||||
res = jarvis_chat_apply_template(
|
||||
fallback ? nullptr : model,
|
||||
fallback ? "chatml" : ptr_tmpl,
|
||||
chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
|
@ -1598,7 +1598,7 @@ std::string common_chat_apply_template(const struct llama_model * model,
|
|||
return formatted_chat;
|
||||
}
|
||||
|
||||
std::string common_chat_format_single(const struct llama_model * model,
|
||||
std::string common_chat_format_single(const struct jarvis_model * model,
|
||||
const std::string & tmpl,
|
||||
const std::vector<common_chat_msg> & past_msg,
|
||||
const common_chat_msg & new_msg,
|
||||
|
@ -1618,7 +1618,7 @@ std::string common_chat_format_single(const struct llama_model * model,
|
|||
return ss.str();
|
||||
}
|
||||
|
||||
std::string common_chat_format_example(const struct llama_model * model,
|
||||
std::string common_chat_format_example(const struct jarvis_model * model,
|
||||
const std::string & tmpl) {
|
||||
std::vector<common_chat_msg> msgs = {
|
||||
{"system", "You are a helpful assistant"},
|
||||
|
@ -1633,14 +1633,14 @@ std::string common_chat_format_example(const struct llama_model * model,
|
|||
// KV cache utils
|
||||
//
|
||||
|
||||
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
|
||||
void common_kv_cache_dump_view(const jarvis_kv_cache_view & view, int row_size) {
|
||||
static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
|
||||
|
||||
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
|
||||
view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
|
||||
|
||||
llama_kv_cache_view_cell * c_curr = view.cells;
|
||||
llama_seq_id * cs_curr = view.cells_sequences;
|
||||
jarvis_kv_cache_view_cell * c_curr = view.cells;
|
||||
jarvis_seq_id * cs_curr = view.cells_sequences;
|
||||
|
||||
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
|
||||
if (i % row_size == 0) {
|
||||
|
@ -1656,15 +1656,15 @@ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
|
|||
printf("\n=== Done dumping\n");
|
||||
}
|
||||
|
||||
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
|
||||
void common_kv_cache_dump_view_seqs(const jarvis_kv_cache_view & view, int row_size) {
|
||||
static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
|
||||
|
||||
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
|
||||
view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
|
||||
|
||||
std::unordered_map<llama_seq_id, size_t> seqs;
|
||||
llama_kv_cache_view_cell * c_curr = view.cells;
|
||||
llama_seq_id * cs_curr = view.cells_sequences;
|
||||
std::unordered_map<jarvis_seq_id, size_t> seqs;
|
||||
jarvis_kv_cache_view_cell * c_curr = view.cells;
|
||||
jarvis_seq_id * cs_curr = view.cells_sequences;
|
||||
|
||||
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
|
||||
for (int j = 0; j < view.n_seq_max; j++) {
|
||||
|
@ -1949,12 +1949,12 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
|
|||
}
|
||||
}
|
||||
|
||||
void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
|
||||
void yaml_dump_non_result_info(FILE * stream, const common_params & params, const jarvis_context * lctx,
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
||||
const auto & sparams = params.sparams;
|
||||
|
||||
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
|
||||
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
|
||||
fprintf(stream, "build_commit: %s\n", JARVIS_COMMIT);
|
||||
fprintf(stream, "build_number: %d\n", JARVIS_BUILD_NUMBER);
|
||||
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
|
||||
|
@ -1985,7 +1985,7 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
|
|||
#endif // NDEBUG
|
||||
|
||||
fprintf(stream, "model_desc: %s\n", model_desc);
|
||||
fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
|
||||
fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", jarvis_n_vocab(jarvis_get_model(lctx)));
|
||||
|
||||
#ifdef __OPTIMIZE__
|
||||
fprintf(stream, "optimize: true\n");
|
||||
|
@ -2087,7 +2087,7 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
|
|||
fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
|
||||
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
|
||||
|
||||
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
|
||||
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + jarvis_max_devices());
|
||||
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
|
||||
|
||||
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
|
||||
|
|
138 common/common.h
|
@ -2,7 +2,7 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
#include "jarvis.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
@ -18,8 +18,8 @@
|
|||
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
|
||||
|
||||
#define print_build_info() do { \
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, JARVIS_BUILD_NUMBER, JARVIS_COMMIT); \
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, JARVIS_COMPILER, JARVIS_BUILD_TARGET); \
|
||||
} while(0)
|
||||
|
||||
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
|
||||
|
@ -30,14 +30,14 @@ struct common_lora_adapter_info {
|
|||
};
|
||||
|
||||
struct common_lora_adapter_container : common_lora_adapter_info {
|
||||
struct llama_lora_adapter * adapter;
|
||||
struct jarvis_lora_adapter * adapter;
|
||||
};
|
||||
|
||||
// build info
|
||||
extern int LLAMA_BUILD_NUMBER;
|
||||
extern char const * LLAMA_COMMIT;
|
||||
extern char const * LLAMA_COMPILER;
|
||||
extern char const * LLAMA_BUILD_TARGET;
|
||||
extern int JARVIS_BUILD_NUMBER;
|
||||
extern char const * JARVIS_COMMIT;
|
||||
extern char const * JARVIS_COMPILER;
|
||||
extern char const * JARVIS_BUILD_TARGET;
|
||||
|
||||
struct common_control_vector_load_info;
|
||||
|
||||
|
@ -61,25 +61,25 @@ int32_t cpu_get_num_math();
|
|||
// Common params
|
||||
//
|
||||
|
||||
enum llama_example {
|
||||
LLAMA_EXAMPLE_COMMON,
|
||||
LLAMA_EXAMPLE_SPECULATIVE,
|
||||
LLAMA_EXAMPLE_MAIN,
|
||||
LLAMA_EXAMPLE_INFILL,
|
||||
LLAMA_EXAMPLE_EMBEDDING,
|
||||
LLAMA_EXAMPLE_PERPLEXITY,
|
||||
LLAMA_EXAMPLE_RETRIEVAL,
|
||||
LLAMA_EXAMPLE_PASSKEY,
|
||||
LLAMA_EXAMPLE_IMATRIX,
|
||||
LLAMA_EXAMPLE_BENCH,
|
||||
LLAMA_EXAMPLE_SERVER,
|
||||
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
|
||||
LLAMA_EXAMPLE_EXPORT_LORA,
|
||||
LLAMA_EXAMPLE_LLAVA,
|
||||
LLAMA_EXAMPLE_LOOKUP,
|
||||
LLAMA_EXAMPLE_PARALLEL,
|
||||
enum jarvis_example {
|
||||
JARVIS_EXAMPLE_COMMON,
|
||||
JARVIS_EXAMPLE_SPECULATIVE,
|
||||
JARVIS_EXAMPLE_MAIN,
|
||||
JARVIS_EXAMPLE_INFILL,
|
||||
JARVIS_EXAMPLE_EMBEDDING,
|
||||
JARVIS_EXAMPLE_PERPLEXITY,
|
||||
JARVIS_EXAMPLE_RETRIEVAL,
|
||||
JARVIS_EXAMPLE_PASSKEY,
|
||||
JARVIS_EXAMPLE_IMATRIX,
|
||||
JARVIS_EXAMPLE_BENCH,
|
||||
JARVIS_EXAMPLE_SERVER,
|
||||
JARVIS_EXAMPLE_CVECTOR_GENERATOR,
|
||||
JARVIS_EXAMPLE_EXPORT_LORA,
|
||||
JARVIS_EXAMPLE_LLAVA,
|
||||
JARVIS_EXAMPLE_LOOKUP,
|
||||
JARVIS_EXAMPLE_PARALLEL,
|
||||
|
||||
LLAMA_EXAMPLE_COUNT,
|
||||
JARVIS_EXAMPLE_COUNT,
|
||||
};
|
||||
|
||||
enum common_sampler_type {
|
||||
|
@ -103,7 +103,7 @@ enum dimre_method {
|
|||
|
||||
// sampler parameters
|
||||
struct common_sampler_params {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
||||
uint32_t seed = JARVIS_DEFAULT_SEED; // the seed used to initialize jarvis_sampler
|
||||
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
|
@ -149,7 +149,7 @@ struct common_sampler_params {
|
|||
|
||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||
|
||||
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
||||
std::vector<jarvis_logit_bias> logit_bias; // logit biases to apply
|
||||
|
||||
// print the parameters into a string
|
||||
std::string print() const;
|
||||
|
@ -192,10 +192,10 @@ struct common_params {
|
|||
|
||||
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
|
||||
|
||||
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
||||
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
||||
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
|
||||
enum jarvis_split_mode split_mode = JARVIS_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||
enum jarvis_rope_scaling_type rope_scaling_type = JARVIS_ROPE_SCALING_TYPE_UNSPECIFIED;
|
||||
enum jarvis_pooling_type pooling_type = JARVIS_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
||||
enum jarvis_attention_type attention_type = JARVIS_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
|
||||
|
||||
struct common_sampler_params sparams;
|
||||
|
||||
|
@ -219,9 +219,9 @@ struct common_params {
|
|||
|
||||
std::vector<std::string> in_files; // all input files
|
||||
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
||||
std::vector<llama_model_kv_override> kv_overrides;
|
||||
std::vector<jarvis_model_kv_override> kv_overrides;
|
||||
|
||||
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
|
||||
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using jarvis_lora_adapter_apply)
|
||||
std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
|
||||
|
||||
std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
|
||||
|
@ -377,15 +377,15 @@ bool set_process_priority(enum ggml_sched_priority prio);
|
|||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
#define JARVIS_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
#else
|
||||
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
#define JARVIS_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
#endif
|
||||
#else
|
||||
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
|
||||
#define JARVIS_COMMON_ATTRIBUTE_FORMAT(...)
|
||||
#endif
|
||||
|
||||
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
|
||||
JARVIS_COMMON_ATTRIBUTE_FORMAT(1, 2)
|
||||
std::string string_format(const char * fmt, ...);
|
||||
|
||||
std::string string_strip(const std::string & str);
|
||||
|
@ -424,13 +424,13 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
|
|||
return parts;
|
||||
}
|
||||
|
||||
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
||||
bool string_parse_kv_override(const char * data, std::vector<jarvis_model_kv_override> & overrides);
|
||||
void string_process_escapes(std::string & input);
|
||||
|
||||
std::string string_from(bool value);
|
||||
std::string string_from(const std::vector<int> & values);
|
||||
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
|
||||
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
|
||||
std::string string_from(const struct jarvis_context * ctx, const std::vector<jarvis_token> & tokens);
|
||||
std::string string_from(const struct jarvis_context * ctx, const struct jarvis_batch & batch);
|
||||
|
||||
//
|
||||
// Filesystem utils
|
||||
|
@ -447,32 +447,32 @@ std::string fs_get_cache_file(const std::string & filename);
|
|||
//
|
||||
|
||||
struct common_init_result {
|
||||
struct llama_model * model = nullptr;
|
||||
struct llama_context * context = nullptr;
|
||||
struct jarvis_model * model = nullptr;
|
||||
struct jarvis_context * context = nullptr;
|
||||
std::vector<common_lora_adapter_container> lora_adapters;
|
||||
};
|
||||
|
||||
struct common_init_result common_init_from_params(common_params & params);
|
||||
|
||||
struct llama_model_params common_model_params_to_llama (const common_params & params);
|
||||
struct llama_context_params common_context_params_to_llama(const common_params & params);
|
||||
struct jarvis_model_params common_model_params_to_jarvis (const common_params & params);
|
||||
struct jarvis_context_params common_context_params_to_jarvis(const common_params & params);
|
||||
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
|
||||
|
||||
struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
||||
struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
||||
struct jarvis_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct jarvis_model_params & params);
|
||||
struct jarvis_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct jarvis_model_params & params);
|
||||
|
||||
// clear LoRA adapters from context, then apply new list of adapters
|
||||
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
|
||||
void common_lora_adapters_apply(struct jarvis_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
|
||||
|
||||
// Batch utils
|
||||
|
||||
void common_batch_clear(struct llama_batch & batch);
|
||||
void common_batch_clear(struct jarvis_batch & batch);
|
||||
|
||||
void common_batch_add(
|
||||
struct llama_batch & batch,
|
||||
llama_token id,
|
||||
llama_pos pos,
|
||||
const std::vector<llama_seq_id> & seq_ids,
|
||||
struct jarvis_batch & batch,
|
||||
jarvis_token id,
|
||||
jarvis_pos pos,
|
||||
const std::vector<jarvis_seq_id> & seq_ids,
|
||||
bool logits);
|
||||
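// A hedged sketch of how the two batch helpers above are typically combined:
// clear the batch, then append a prompt token by token, requesting logits only
// for the last position. The jarvis_batch itself is assumed to be created by the
// core jarvis batch API (the renamed counterpart of upstream llama_batch_init);
// the function name is illustrative and not part of this header.
static void example_fill_batch(struct jarvis_batch & batch, const std::vector<jarvis_token> & prompt) {
    common_batch_clear(batch);
    for (size_t i = 0; i < prompt.size(); ++i) {
        const bool need_logits = (i == prompt.size() - 1); // only the last token needs logits
        common_batch_add(batch, prompt[i], (jarvis_pos) i, { 0 }, need_logits);
    }
}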
|
||||
//
|
||||
|
@ -481,14 +481,14 @@ void common_batch_add(
|
|||
|
||||
// tokenizes a string into a vector of tokens
|
||||
// should work similarly to Python's `tokenizer.encode`
|
||||
std::vector<llama_token> common_tokenize(
|
||||
const struct llama_context * ctx,
|
||||
std::vector<jarvis_token> common_tokenize(
|
||||
const struct jarvis_context * ctx,
|
||||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special = false);
|
||||
|
||||
std::vector<llama_token> common_tokenize(
|
||||
const struct llama_model * model,
|
||||
std::vector<jarvis_token> common_tokenize(
|
||||
const struct jarvis_model * model,
|
||||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special = false);
|
||||
|
@ -496,23 +496,23 @@ std::vector<llama_token> common_tokenize(
|
|||
// converts a single token into a text piece, optionally rendering special/control tokens
|
||||
// should work similarly to Python's `tokenizer.id_to_piece`
|
||||
std::string common_token_to_piece(
|
||||
const struct llama_context * ctx,
|
||||
llama_token token,
|
||||
const struct jarvis_context * ctx,
|
||||
jarvis_token token,
|
||||
bool special = true);
|
||||
|
||||
// detokenizes a vector of tokens into a string
|
||||
// should work similarly to Python's `tokenizer.decode`
|
||||
// optionally renders special/control tokens
|
||||
std::string common_detokenize(
|
||||
llama_context * ctx,
|
||||
const std::vector<llama_token> & tokens,
|
||||
jarvis_context * ctx,
|
||||
const std::vector<jarvis_token> & tokens,
|
||||
bool special = true);
|
||||
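// A brief usage sketch for the helpers above (ctx is assumed to be an
// already-initialized jarvis_context; this is an illustrative fragment, not a
// complete program):
std::vector<jarvis_token> toks = common_tokenize(ctx, "Hello world", /*add_special*/ true);
for (jarvis_token t : toks) {
    printf("%d -> '%s'\n", t, common_token_to_piece(ctx, t).c_str());
}
std::string round_trip = common_detokenize(ctx, toks, /*special*/ false);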
|
||||
//
|
||||
// Chat template utils
|
||||
//
|
||||
|
||||
// same with llama_chat_message, but uses std::string
|
||||
// same with jarvis_chat_message, but uses std::string
|
||||
struct common_chat_msg {
|
||||
std::string role;
|
||||
std::string content;
|
||||
|
@ -521,23 +521,23 @@ struct common_chat_msg {
|
|||
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
||||
bool common_chat_verify_template(const std::string & tmpl);
|
||||
|
||||
// CPP wrapper for llama_chat_apply_template
|
||||
// CPP wrapper for jarvis_chat_apply_template
|
||||
// If the built-in template is not supported, we default to chatml
|
||||
// If the custom "tmpl" is not supported, we throw an error
|
||||
std::string common_chat_apply_template(const struct llama_model * model,
|
||||
std::string common_chat_apply_template(const struct jarvis_model * model,
|
||||
const std::string & tmpl,
|
||||
const std::vector<common_chat_msg> & chat,
|
||||
bool add_ass);
|
||||
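// A hedged usage sketch for the wrapper above (model is assumed to be a loaded
// jarvis_model; as in upstream usage, an empty tmpl is assumed to select the
// model's built-in template, with the chatml fallback described above):
std::vector<common_chat_msg> msgs = {
    {"system", "You are a helpful assistant"},
    {"user",   "Hello"},
};
std::string prompt = common_chat_apply_template(model, /*tmpl*/ "", msgs, /*add_ass*/ true);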
|
||||
// Format single message, while taking into account the position of that message in chat history
|
||||
std::string common_chat_format_single(const struct llama_model * model,
|
||||
std::string common_chat_format_single(const struct jarvis_model * model,
|
||||
const std::string & tmpl,
|
||||
const std::vector<common_chat_msg> & past_msg,
|
||||
const common_chat_msg & new_msg,
|
||||
bool add_ass);
|
||||
|
||||
// Returns an example of formatted chat
|
||||
std::string common_chat_format_example(const struct llama_model * model,
|
||||
std::string common_chat_format_example(const struct jarvis_model * model,
|
||||
const std::string & tmpl);
|
||||
|
||||
//
|
||||
|
@ -545,10 +545,10 @@ std::string common_chat_format_example(const struct llama_model * model,
|
|||
//
|
||||
|
||||
// Dump the KV cache view with the number of sequences per cell.
|
||||
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
|
||||
void common_kv_cache_dump_view(const jarvis_kv_cache_view & view, int row_size = 80);
|
||||
|
||||
// Dump the KV cache view showing individual sequences in each cell (long output).
|
||||
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
|
||||
void common_kv_cache_dump_view_seqs(const jarvis_kv_cache_view & view, int row_size = 40);
|
||||
|
||||
//
|
||||
// Embedding utils
|
||||
|
@ -596,5 +596,5 @@ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std
|
|||
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
|
||||
|
||||
void yaml_dump_non_result_info(
|
||||
FILE * stream, const common_params & params, const llama_context * lctx,
|
||||
FILE * stream, const common_params & params, const jarvis_context * lctx,
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
|
||||
|
|
|
@ -435,7 +435,7 @@ namespace console {
|
|||
fputc('\n', out);
|
||||
has_more = !has_more;
|
||||
} else {
|
||||
// llama will just eat the single space, it won't act as a space
|
||||
// jarvis will just eat the single space, it won't act as a space
|
||||
if (line.length() == 1 && line.back() == ' ') {
|
||||
line.clear();
|
||||
pop_cursor();
|
||||
|
|
|
@ -5336,7 +5336,7 @@ template<typename IteratorType> class iteration_proxy
|
|||
};
|
||||
|
||||
// Structured Bindings Support
|
||||
// For further reference see https://blog.tartanllama.xyz/structured-bindings/
|
||||
// For further reference see https://blog.tartanjarvis.xyz/structured-bindings/
|
||||
// And see https://github.com/nlohmann/json/pull/1391
|
||||
template<std::size_t N, typename IteratorType, enable_if_t<N == 0, int> = 0>
|
||||
auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decltype(i.key())
|
||||
|
@ -5344,7 +5344,7 @@ auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decl
|
|||
return i.key();
|
||||
}
|
||||
// Structured Bindings Support
|
||||
// For further reference see https://blog.tartanllama.xyz/structured-bindings/
|
||||
// For further reference see https://blog.tartanjarvis.xyz/structured-bindings/
|
||||
// And see https://github.com/nlohmann/json/pull/1391
|
||||
template<std::size_t N, typename IteratorType, enable_if_t<N == 1, int> = 0>
|
||||
auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decltype(i.value())
|
||||
|
@ -5357,7 +5357,7 @@ NLOHMANN_JSON_NAMESPACE_END
|
|||
|
||||
// The Addition to the STD Namespace is required to add
|
||||
// Structured Bindings Support to the iteration_proxy_value class
|
||||
// For further reference see https://blog.tartanllama.xyz/structured-bindings/
|
||||
// For further reference see https://blog.tartanjarvis.xyz/structured-bindings/
|
||||
// And see https://github.com/nlohmann/json/pull/1391
|
||||
namespace std
|
||||
{
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
|
||||
int common_log_verbosity_thold = LOG_DEFAULT_JARVIS;
|
||||
|
||||
void common_log_set_verbosity_thold(int verbosity) {
|
||||
common_log_verbosity_thold = verbosity;
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
#endif
|
||||
|
||||
#define LOG_DEFAULT_DEBUG 1
|
||||
#define LOG_DEFAULT_LLAMA 0
|
||||
#define LOG_DEFAULT_JARVIS 0
|
||||
|
||||
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
|
||||
// set via common_log_set_verbosity()
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
#include <thread>
|
||||
|
||||
void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
|
||||
std::vector<llama_token> & inp, int nnew, bool print_progress) {
|
||||
std::vector<jarvis_token> & inp, int nnew, bool print_progress) {
|
||||
const int64_t t_start_ms = ggml_time_ms();
|
||||
const int64_t inp_size = inp.size();
|
||||
|
||||
|
@ -21,7 +21,7 @@ void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min,
|
|||
for (int64_t i = i_start; i < inp_size; ++i) {
|
||||
const int64_t ngram_start = i - ngram_size;
|
||||
common_ngram ngram(&inp[ngram_start], ngram_size);
|
||||
const llama_token token = inp[i];
|
||||
const jarvis_token token = inp[i];
|
||||
|
||||
common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
|
||||
if (part_it == ngram_cache.end()) {
|
||||
|
@ -51,18 +51,18 @@ void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min,
|
|||
}
|
||||
|
||||
// Helper function to get a token from the combined, speculative sequence of inp and draft.
|
||||
static llama_token get_token(const std::vector<llama_token> & inp, const std::vector<llama_token> & draft, const size_t i) {
|
||||
static jarvis_token get_token(const std::vector<jarvis_token> & inp, const std::vector<jarvis_token> & draft, const size_t i) {
|
||||
return i < inp.size() ? inp[i] : draft[1 + i - inp.size()];
|
||||
}
|
||||
|
||||
// If sample size or percentage are below these thresholds the draft is aborted early:
|
||||
constexpr int draft_min_sample_size_lax[LLAMA_NGRAM_MAX] = { 2, 2, 1, 1};
|
||||
constexpr int draft_min_percent_lax[LLAMA_NGRAM_MAX] = {66, 50, 50, 50};
|
||||
constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2};
|
||||
constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
|
||||
constexpr int draft_min_sample_size_lax[JARVIS_NGRAM_MAX] = { 2, 2, 1, 1};
|
||||
constexpr int draft_min_percent_lax[JARVIS_NGRAM_MAX] = {66, 50, 50, 50};
|
||||
constexpr int draft_min_sample_size_strict[JARVIS_NGRAM_MAX] = { 4, 3, 2, 2};
|
||||
constexpr int draft_min_percent_strict[JARVIS_NGRAM_MAX] = {75, 66, 66, 66};
|
||||
|
||||
// Helper function that tries to draft a token from only the static ngram cache:
|
||||
static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
|
||||
static jarvis_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
|
||||
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
|
||||
if (part_static_it == nc_static.end()) {
|
||||
return -1;
|
||||
|
@ -71,10 +71,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
|
|||
|
||||
int max_count_static = 0;
|
||||
int sum_count_static = 0;
|
||||
llama_token max_token = -1;
|
||||
jarvis_token max_token = -1;
|
||||
|
||||
for (std::pair<llama_token, int> token_count_static : part_static) {
|
||||
const llama_token token = token_count_static.first;
|
||||
for (std::pair<jarvis_token, int> token_count_static : part_static) {
|
||||
const jarvis_token token = token_count_static.first;
|
||||
const int32_t count_static = token_count_static.second;
|
||||
|
||||
if (count_static > max_count_static) {
|
||||
|
@ -84,21 +84,21 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
|
|||
sum_count_static += count_static;
|
||||
}
|
||||
|
||||
if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
|
||||
if (sum_count_static < draft_min_sample_size_lax[JARVIS_NGRAM_STATIC-1]) {
|
||||
return -1;
|
||||
}
|
||||
if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
|
||||
if (100*max_count_static < draft_min_percent_lax[JARVIS_NGRAM_STATIC-1]*sum_count_static) {
|
||||
return -1;
|
||||
}
|
||||
return max_token;
|
||||
}
|
||||
|
||||
// Try to draft a token from primary cache (context/dynamic), validate with static cache:
|
||||
static llama_token try_draft(
|
||||
static jarvis_token try_draft(
|
||||
common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
|
||||
const int * min_sample_size, const int * min_percent) {
|
||||
|
||||
llama_token drafted_token = -1;
|
||||
jarvis_token drafted_token = -1;
|
||||
|
||||
for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
|
||||
const common_ngram ngram_primary = ngrams_primary[i];
|
||||
|
@ -112,10 +112,10 @@ static llama_token try_draft(
|
|||
int max_count_primary = 0;
|
||||
int max_count_static = 0;
|
||||
int sum_count_primary = 0;
|
||||
llama_token max_token = -1;
|
||||
jarvis_token max_token = -1;
|
||||
|
||||
for (std::pair<llama_token, int> token_count_primary : part_primary) {
|
||||
const llama_token token = token_count_primary.first;
|
||||
for (std::pair<jarvis_token, int> token_count_primary : part_primary) {
|
||||
const jarvis_token token = token_count_primary.first;
|
||||
|
||||
common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
|
||||
|
||||
|
@ -143,22 +143,22 @@ static llama_token try_draft(
|
|||
}
|
||||
|
||||
void common_ngram_cache_draft(
|
||||
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
|
||||
std::vector<jarvis_token> & inp, std::vector<jarvis_token> & draft, int n_draft, int ngram_min, int ngram_max,
|
||||
common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
|
||||
) {
|
||||
GGML_ASSERT(draft.size() == 1);
|
||||
const int inp_size = inp.size();
|
||||
|
||||
if (inp_size < LLAMA_NGRAM_STATIC) {
|
||||
if (inp_size < JARVIS_NGRAM_STATIC) {
|
||||
return;
|
||||
}
|
||||
|
||||
while ((int) draft.size()-1 < n_draft) {
|
||||
llama_token drafted_token = -1;
|
||||
jarvis_token drafted_token = -1;
|
||||
|
||||
const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
|
||||
const int ngram_start_static = inp_size-JARVIS_NGRAM_STATIC + draft.size()-1;
|
||||
common_ngram ngram_static;
|
||||
for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
|
||||
for (int j = ngram_start_static; j < ngram_start_static + JARVIS_NGRAM_STATIC; ++j) {
|
||||
ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
|
||||
}
|
||||
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
|
||||
|
@ -207,12 +207,12 @@ void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & fil
|
|||
|
||||
file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(common_ngram));
|
||||
file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
|
||||
for (std::pair<llama_token, int32_t> item2 : token_counts) {
|
||||
const llama_token token = item2.first;
|
||||
for (std::pair<jarvis_token, int32_t> item2 : token_counts) {
|
||||
const jarvis_token token = item2.first;
|
||||
const int32_t count = item2.second;
|
||||
GGML_ASSERT(count > 0);
|
||||
|
||||
file_out.write(reinterpret_cast<const char *>(&token), sizeof(llama_token));
|
||||
file_out.write(reinterpret_cast<const char *>(&token), sizeof(jarvis_token));
|
||||
file_out.write(reinterpret_cast<const char *>(&count), sizeof(int32_t));
|
||||
}
|
||||
}
|
||||
|
@ -228,7 +228,7 @@ common_ngram_cache common_ngram_cache_load(std::string & filename) {
|
|||
|
||||
common_ngram ngram;
|
||||
int32_t ntokens;
|
||||
llama_token token;
|
||||
jarvis_token token;
|
||||
int32_t count;
|
||||
|
||||
char * ngramc = reinterpret_cast<char*>(&ngram);
|
||||
|
@ -243,7 +243,7 @@ common_ngram_cache common_ngram_cache_load(std::string & filename) {
|
|||
|
||||
for (int i = 0; i < ntokens; ++i) {
|
||||
GGML_ASSERT(!hashmap_file.eof());
|
||||
GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token)));
|
||||
GGML_ASSERT(hashmap_file.read(tokenc, sizeof(jarvis_token)));
|
||||
GGML_ASSERT(!hashmap_file.eof());
|
||||
GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t)));
|
||||
GGML_ASSERT(count > 0);
|
||||
|
@ -268,8 +268,8 @@ void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ng
|
|||
continue;
|
||||
}
|
||||
|
||||
for (std::pair<llama_token, int32_t> token_count : part) {
|
||||
const llama_token token = token_count.first;
|
||||
for (std::pair<jarvis_token, int32_t> token_count : part) {
|
||||
const jarvis_token token = token_count.first;
|
||||
const int32_t count = token_count.second;
|
||||
GGML_ASSERT(count > 0);
|
||||
|
||||
|
|
|
@ -1,34 +1,34 @@
|
|||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
#include "jarvis.h"
|
||||
|
||||
#include <unordered_map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#define LLAMA_NGRAM_MIN 1
|
||||
#define LLAMA_NGRAM_MAX 4
|
||||
#define LLAMA_NGRAM_STATIC 2
|
||||
#define JARVIS_NGRAM_MIN 1
|
||||
#define JARVIS_NGRAM_MAX 4
|
||||
#define JARVIS_NGRAM_STATIC 2
|
||||
|
||||
// Data structures to map n-grams to empirical token probabilities:
|
||||
|
||||
struct common_ngram {
|
||||
llama_token tokens[LLAMA_NGRAM_MAX];
|
||||
jarvis_token tokens[JARVIS_NGRAM_MAX];
|
||||
|
||||
common_ngram() {
|
||||
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
||||
for (int i = 0; i < JARVIS_NGRAM_MAX; ++i) {
|
||||
tokens[i] = -1;
|
||||
}
|
||||
}
|
||||
|
||||
common_ngram(const llama_token * input, const int ngram_size) {
|
||||
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
||||
common_ngram(const jarvis_token * input, const int ngram_size) {
|
||||
for (int i = 0; i < JARVIS_NGRAM_MAX; ++i) {
|
||||
tokens[i] = i < ngram_size ? input[i] : -1;
|
||||
}
|
||||
}
|
||||
|
||||
bool operator==(const common_ngram & other) const {
|
||||
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
||||
for (int i = 0; i < JARVIS_NGRAM_MAX; ++i) {
|
||||
if (tokens[i] != other.tokens[i]) {
|
||||
return false;
|
||||
}
|
||||
|
@ -38,7 +38,7 @@ struct common_ngram {
|
|||
};
|
||||
|
||||
struct common_token_hash_function {
|
||||
size_t operator()(const llama_token token) const {
|
||||
size_t operator()(const jarvis_token token) const {
|
||||
// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
|
||||
return token * 11400714819323198485llu;
|
||||
}
|
||||
|
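// Note on the multiplier used above: 11400714819323198485 is the 64-bit
// Fibonacci hashing constant, i.e. roughly 2^64 / phi (the golden ratio,
// ~1.6180339887), which spreads consecutive token ids evenly across the hash
// space; see the linked article for the derivation.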
@ -47,7 +47,7 @@ struct common_token_hash_function {
|
|||
struct common_ngram_hash_function {
|
||||
size_t operator()(const common_ngram & ngram) const {
|
||||
size_t hash = common_token_hash_function{}(ngram.tokens[0]);
|
||||
for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
|
||||
for (int i = 1; i < JARVIS_NGRAM_MAX; ++i) {
|
||||
hash ^= common_token_hash_function{}(ngram.tokens[i]);
|
||||
}
|
||||
return hash;
|
||||
|
@ -55,7 +55,7 @@ struct common_ngram_hash_function {
|
|||
};
|
||||
|
||||
// token -> number of times token has been seen
|
||||
typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;
|
||||
typedef std::unordered_map<jarvis_token, int32_t> common_ngram_cache_part;
|
||||
|
||||
// n-gram -> empirical distribution of following tokens
|
||||
typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;
|
||||
|
@ -71,7 +71,7 @@ typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_h
|
|||
// In order to get correct results inp_data can ONLY BE APPENDED TO.
|
||||
// Changes in the middle need a complete rebuild.
|
||||
void common_ngram_cache_update(
|
||||
common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
|
||||
common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<jarvis_token> & inp_data, int nnew, bool print_progress);
|
||||
|
||||
// Try to draft tokens from ngram caches.
|
||||
// inp: the tokens generated so far.
|
||||
|
@ -82,7 +82,7 @@ void common_ngram_cache_update(
|
|||
// nc_dynamic: ngram cache based on previous user generations.
|
||||
// nc_static: ngram cache generated from a large text corpus, used for validation.
|
||||
void common_ngram_cache_draft(
|
||||
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
|
||||
std::vector<jarvis_token> & inp, std::vector<jarvis_token> & draft, int n_draft, int ngram_min, int ngram_max,
|
||||
common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);
|
||||
|
||||
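// A hedged end-to-end sketch of the two entry points above; the function name is
// illustrative and not part of this header. nc_dynamic/nc_static are assumed to
// be caches obtained elsewhere (e.g. via common_ngram_cache_load), and draft is
// seeded with exactly one token, matching the GGML_ASSERT in
// common_ngram_cache_draft.
static void example_ngram_draft(std::vector<jarvis_token> & inp,
                                common_ngram_cache & nc_dynamic,
                                common_ngram_cache & nc_static) {
    common_ngram_cache nc_context;
    common_ngram_cache_update(nc_context, JARVIS_NGRAM_MIN, JARVIS_NGRAM_MAX, inp, (int) inp.size(), /*print_progress*/ false);

    std::vector<jarvis_token> draft = { inp.back() };
    common_ngram_cache_draft(inp, draft, /*n_draft*/ 8, JARVIS_NGRAM_MIN, JARVIS_NGRAM_MAX, nc_context, nc_dynamic, nc_static);
}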
// Save an ngram cache to a file.
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
#include <unordered_map>
|
||||
|
||||
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
||||
// TODO: deduplicate with llama-impl.h
|
||||
// TODO: deduplicate with jarvis-impl.h
|
||||
template<typename T>
|
||||
struct ring_buffer {
|
||||
ring_buffer(size_t cap) : capacity(cap), data(cap) {}
|
||||
|
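// A brief usage sketch, assuming the interface exercised elsewhere in this file
// (push_back appends, rat(0) returns the most recently pushed element); token_id
// is a placeholder for some previously sampled token:
ring_buffer<jarvis_token> recent(64); // keep the last 64 tokens
recent.push_back(token_id);
jarvis_token last = recent.rat(0);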
@ -101,24 +101,24 @@ struct ring_buffer {
|
|||
struct common_sampler {
|
||||
common_sampler_params params;
|
||||
|
||||
struct llama_sampler * grmr;
|
||||
struct llama_sampler * chain;
|
||||
struct jarvis_sampler * grmr;
|
||||
struct jarvis_sampler * chain;
|
||||
|
||||
ring_buffer<llama_token> prev;
|
||||
ring_buffer<jarvis_token> prev;
|
||||
|
||||
std::vector<llama_token_data> cur;
|
||||
std::vector<jarvis_token_data> cur;
|
||||
|
||||
llama_token_data_array cur_p;
|
||||
jarvis_token_data_array cur_p;
|
||||
|
||||
void set_logits(struct llama_context * ctx, int idx) {
|
||||
const auto * logits = llama_get_logits_ith(ctx, idx);
|
||||
void set_logits(struct jarvis_context * ctx, int idx) {
|
||||
const auto * logits = jarvis_get_logits_ith(ctx, idx);
|
||||
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
const int n_vocab = jarvis_n_vocab(jarvis_get_model(ctx));
|
||||
|
||||
cur.resize(n_vocab);
|
||||
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
|
||||
for (jarvis_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
cur[token_id] = jarvis_token_data{token_id, logits[token_id], 0.0f};
|
||||
}
|
||||
|
||||
cur_p = { cur.data(), cur.size(), -1, false };
|
||||
|
@ -141,31 +141,31 @@ std::string common_sampler_params::print() const {
|
|||
return std::string(result);
|
||||
}
|
||||
|
||||
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
|
||||
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
|
||||
struct common_sampler * common_sampler_init(const struct jarvis_model * model, const struct common_sampler_params & params) {
|
||||
jarvis_sampler_chain_params lparams = jarvis_sampler_chain_default_params();
|
||||
|
||||
lparams.no_perf = params.no_perf;
|
||||
|
||||
auto * result = new common_sampler {
|
||||
/* .params = */ params,
|
||||
/* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
|
||||
/* .chain = */ llama_sampler_chain_init(lparams),
|
||||
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
|
||||
/* .grmr = */ jarvis_sampler_init_grammar(model, params.grammar.c_str(), "root"),
|
||||
/* .chain = */ jarvis_sampler_chain_init(lparams),
|
||||
/* .prev = */ ring_buffer<jarvis_token>(std::max(32, params.n_prev)),
|
||||
/* .cur = */ {},
|
||||
/* .cur_p = */ {},
|
||||
};
|
||||
|
||||
llama_sampler_chain_add(result->chain,
|
||||
llama_sampler_init_logit_bias(
|
||||
llama_n_vocab(model),
|
||||
jarvis_sampler_chain_add(result->chain,
|
||||
jarvis_sampler_init_logit_bias(
|
||||
jarvis_n_vocab(model),
|
||||
params.logit_bias.size(),
|
||||
params.logit_bias.data()));
|
||||
|
||||
llama_sampler_chain_add(result->chain,
|
||||
llama_sampler_init_penalties(
|
||||
llama_n_vocab (model),
|
||||
llama_token_eos(model),
|
||||
llama_token_nl (model),
|
||||
jarvis_sampler_chain_add(result->chain,
|
||||
jarvis_sampler_init_penalties(
|
||||
jarvis_n_vocab (model),
|
||||
jarvis_token_eos(model),
|
||||
jarvis_token_nl (model),
|
||||
params.penalty_last_n,
|
||||
params.penalty_repeat,
|
||||
params.penalty_freq,
|
||||
|
@ -184,44 +184,44 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
|||
c_breakers.push_back(str.c_str());
|
||||
}
|
||||
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
}
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_K:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_top_k (params.top_k));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_top_p (params.top_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_MIN_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_XTC:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TFS_Z:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_tail_free(params.tfs_z, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_typical (params.typ_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_INFILL:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_infill (model));
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown sampler type");
|
||||
}
|
||||
}
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_dist(params.seed));
|
||||
} else if (params.mirostat == 1) {
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_temp(params.temp));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_mirostat(jarvis_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
|
||||
} else if (params.mirostat == 2) {
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_temp(params.temp));
|
||||
jarvis_sampler_chain_add(result->chain, jarvis_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
|
||||
} else {
|
||||
GGML_ASSERT(false && "unknown mirostat version");
|
||||
}
|
||||
|
@ -231,53 +231,53 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
|||
|
||||
void common_sampler_free(struct common_sampler * gsmpl) {
|
||||
if (gsmpl) {
|
||||
llama_sampler_free(gsmpl->grmr);
|
||||
jarvis_sampler_free(gsmpl->grmr);
|
||||
|
||||
llama_sampler_free(gsmpl->chain);
|
||||
jarvis_sampler_free(gsmpl->chain);
|
||||
|
||||
delete gsmpl;
|
||||
}
|
||||
}
|
||||
|
||||
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
|
||||
void common_sampler_accept(struct common_sampler * gsmpl, jarvis_token token, bool accept_grammar) {
|
||||
if (accept_grammar) {
|
||||
llama_sampler_accept(gsmpl->grmr, token);
|
||||
jarvis_sampler_accept(gsmpl->grmr, token);
|
||||
}
|
||||
|
||||
llama_sampler_accept(gsmpl->chain, token);
|
||||
jarvis_sampler_accept(gsmpl->chain, token);
|
||||
|
||||
gsmpl->prev.push_back(token);
|
||||
}
|
||||
|
||||
void common_sampler_reset(struct common_sampler * gsmpl) {
|
||||
llama_sampler_reset(gsmpl->grmr);
|
||||
jarvis_sampler_reset(gsmpl->grmr);
|
||||
|
||||
llama_sampler_reset(gsmpl->chain);
|
||||
jarvis_sampler_reset(gsmpl->chain);
|
||||
}
|
||||
|
||||
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
|
||||
return new common_sampler {
|
||||
/* .params = */ gsmpl->params,
|
||||
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
|
||||
/* .chain = */ llama_sampler_clone(gsmpl->chain),
|
||||
/* .grmr = */ jarvis_sampler_clone(gsmpl->grmr),
|
||||
/* .chain = */ jarvis_sampler_clone(gsmpl->chain),
|
||||
/* .prev = */ gsmpl->prev,
|
||||
/* .cur = */ gsmpl->cur,
|
||||
/* .cur_p = */ gsmpl->cur_p,
|
||||
};
|
||||
}
|
||||
|
||||
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
|
||||
void common_perf_print(const struct jarvis_context * ctx, const struct common_sampler * gsmpl) {
|
||||
// TODO: measure grammar performance
|
||||
|
||||
if (gsmpl) {
|
||||
llama_perf_sampler_print(gsmpl->chain);
|
||||
jarvis_perf_sampler_print(gsmpl->chain);
|
||||
}
|
||||
if (ctx) {
|
||||
llama_perf_context_print(ctx);
|
||||
jarvis_perf_context_print(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
|
||||
jarvis_token common_sampler_sample(struct common_sampler * gsmpl, struct jarvis_context * ctx, int idx, bool grammar_first) {
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
auto & grmr = gsmpl->grmr;
|
||||
|
@ -285,14 +285,14 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
|||
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
|
||||
|
||||
if (grammar_first) {
|
||||
llama_sampler_apply(grmr, &cur_p);
|
||||
jarvis_sampler_apply(grmr, &cur_p);
|
||||
}
|
||||
|
||||
llama_sampler_apply(chain, &cur_p);
|
||||
jarvis_sampler_apply(chain, &cur_p);
|
||||
|
||||
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
|
||||
|
||||
const llama_token id = cur_p.data[cur_p.selected].id;
|
||||
const jarvis_token id = cur_p.data[cur_p.selected].id;
|
||||
|
||||
if (grammar_first) {
|
||||
return id;
|
||||
|
@ -300,10 +300,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
|||
|
||||
// check if the sampled token fits the grammar
|
||||
{
|
||||
llama_token_data single_token_data = { id, 1.0f, 0.0f };
|
||||
llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
|
||||
jarvis_token_data single_token_data = { id, 1.0f, 0.0f };
|
||||
jarvis_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
|
||||
|
||||
llama_sampler_apply(grmr, &single_token_data_array);
|
||||
jarvis_sampler_apply(grmr, &single_token_data_array);
|
||||
|
||||
const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
|
||||
if (is_valid) {
|
||||
|
@ -315,8 +315,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
|||
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
llama_sampler_apply(grmr, &cur_p);
|
||||
llama_sampler_apply(chain, &cur_p);
|
||||
jarvis_sampler_apply(grmr, &cur_p);
|
||||
jarvis_sampler_apply(chain, &cur_p);
|
||||
|
||||
GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
|
||||
|
||||
|
@ -324,31 +324,31 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
|||
}
|
||||
|
||||
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
|
||||
return llama_sampler_get_seed(gsmpl->chain);
|
||||
return jarvis_sampler_get_seed(gsmpl->chain);
|
||||
}
|
||||
|
||||
// helpers
|
||||
|
||||
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
|
||||
jarvis_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
|
||||
return &gsmpl->cur_p;
|
||||
}
|
||||
|
||||
llama_token common_sampler_last(const struct common_sampler * gsmpl) {
|
||||
jarvis_token common_sampler_last(const struct common_sampler * gsmpl) {
|
||||
return gsmpl->prev.rat(0);
|
||||
}
|
||||
|
||||
std::string common_sampler_print(const struct common_sampler * gsmpl) {
|
||||
std::string result = "logits ";
|
||||
|
||||
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
|
||||
const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
|
||||
result += std::string("-> ") + llama_sampler_name(smpl) + " ";
|
||||
for (int i = 0; i < jarvis_sampler_chain_n(gsmpl->chain); i++) {
|
||||
const auto * smpl = jarvis_sampler_chain_get(gsmpl->chain, i);
|
||||
result += std::string("-> ") + jarvis_sampler_name(smpl) + " ";
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
|
||||
std::string common_sampler_prev_str(common_sampler * gsmpl, jarvis_context * ctx_main, int n) {
|
||||
n = std::min(n, (int) gsmpl->prev.size());
|
||||
|
||||
if (n <= 0) {
|
||||
|
@ -359,9 +359,9 @@ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_
|
|||
result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab
|
||||
|
||||
for (int i = n - 1; i >= 0; i--) {
|
||||
const llama_token id = gsmpl->prev.rat(i);
|
||||
const jarvis_token id = gsmpl->prev.rat(i);
|
||||
|
||||
GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
|
||||
GGML_ASSERT(id != JARVIS_TOKEN_NULL && "null token in the sampling history - should not happen");
|
||||
|
||||
result += common_token_to_piece(ctx_main, id);
|
||||
}
|
||||
|
|
|
@ -1,13 +1,13 @@
#pragma once

#include "llama.h"
#include "jarvis.h"

#include "common.h"

#include <string>
#include <vector>

// common_sampler extends llama_sampler with additional functionality:
// common_sampler extends jarvis_sampler with additional functionality:
//
// - grammar support
// - custom sampler logic based on the parameters

@ -24,7 +24,7 @@
// grammar constraints are applied to the full vocabulary and the token is resampled.
//
// The common_sampler also maintains a container with the last accepted tokens. In the future, this can
// be moved into the core llama library.
// be moved into the core jarvis library.
//
// For convenience, the common_sampler also maintains a container with the current candidate tokens.
// This can be used to access the probabilities of the rest of the non-sampled tokens.

@ -34,19 +34,19 @@

struct common_sampler;

// llama_sampler API overloads
// jarvis_sampler API overloads

struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
struct common_sampler * common_sampler_init(const struct jarvis_model * model, const struct common_sampler_params & params);

void common_sampler_free(struct common_sampler * gsmpl);

// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
void common_sampler_accept(struct common_sampler * gsmpl, jarvis_token token, bool accept_grammar);
void common_sampler_reset (struct common_sampler * gsmpl);
struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);

// arguments can be nullptr to skip printing
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
void common_perf_print(const struct jarvis_context * ctx, const struct common_sampler * gsmpl);

// extended sampling implementation:
//

@ -58,23 +58,23 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
jarvis_token common_sampler_sample(struct common_sampler * gsmpl, struct jarvis_context * ctx, int idx, bool grammar_first = false);

uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);

// helpers

// access the internal list of current candidate tokens
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
jarvis_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);

// get the last accepted token
llama_token common_sampler_last(const struct common_sampler * gsmpl);
jarvis_token common_sampler_last(const struct common_sampler * gsmpl);

// print the sampler chain into a string
std::string common_sampler_print(const struct common_sampler * gsmpl);

// get a string representation of the last accepted tokens
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
std::string common_sampler_prev_str(common_sampler * gsmpl, jarvis_context * ctx, int n);

char common_sampler_type_to_chr(enum common_sampler_type cnstr);
std::string common_sampler_type_to_str(enum common_sampler_type cnstr);

@ -34,7 +34,7 @@ struct train_state * init_train_state() {
|
|||
state->opt = new struct ggml_opt_context;
|
||||
state->opt->ctx = NULL;
|
||||
state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
|
||||
state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
|
||||
state->opt->params.graph_size = JARVIS_TRAIN_MAX_NODES;
|
||||
state->opt->loss_after = 0.0f;
|
||||
|
||||
return state;
|
||||
|
@ -213,7 +213,7 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
|
|||
}
|
||||
|
||||
int64_t get_example_targets_batch(
|
||||
struct llama_context * lctx,
|
||||
struct jarvis_context * lctx,
|
||||
struct ggml_tensor * tokens_input,
|
||||
struct ggml_tensor * target_probs,
|
||||
int64_t example_id,
|
||||
|
@ -221,7 +221,7 @@ int64_t get_example_targets_batch(
|
|||
const size_t * samples_begin,
|
||||
const size_t * samples_size,
|
||||
size_t samples_count,
|
||||
const llama_token * train_data,
|
||||
const jarvis_token * train_data,
|
||||
size_t n_train_data,
|
||||
bool separate_with_eos,
|
||||
bool separate_with_bos,
|
||||
|
@ -241,8 +241,8 @@ int64_t get_example_targets_batch(
|
|||
int64_t used_samples = 0;
|
||||
|
||||
ggml_set_f32(target_probs, 0.0f);
|
||||
llama_token bos = llama_token_bos(llama_get_model(lctx));
|
||||
llama_token eos = llama_token_eos(llama_get_model(lctx));
|
||||
jarvis_token bos = jarvis_token_bos(jarvis_get_model(lctx));
|
||||
jarvis_token eos = jarvis_token_eos(jarvis_get_model(lctx));
|
||||
// printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
|
||||
for (int k=0; k<n_batch; ++k) {
|
||||
// printf("%s: batch %d\n", __func__, k);
|
||||
|
@ -259,7 +259,7 @@ int64_t get_example_targets_batch(
|
|||
bool sample_separation_eos = !separate_with_eos;
|
||||
bool sample_separation_bos = !separate_with_bos;
|
||||
for (int64_t i=0; i<n_tokens; ++i) {
|
||||
llama_token token = eos;
|
||||
jarvis_token token = eos;
|
||||
if (sample_offs >= sample_size && fill_with_next_samples) {
|
||||
if (!sample_separation_eos) {
|
||||
// insert eos token to separate samples
|
||||
|
@ -281,7 +281,7 @@ int64_t get_example_targets_batch(
|
|||
}
|
||||
// note: no else-if here
|
||||
if (sample_offs < sample_size) {
|
||||
token = clamp(train_data[sample_begin+sample_offs], 0, (llama_token) (n_vocab - 1));
|
||||
token = clamp(train_data[sample_begin+sample_offs], 0, (jarvis_token) (n_vocab - 1));
|
||||
++sample_offs;
|
||||
}
|
||||
ggml_set_f32_nd(target_probs, token, (int) i, (int) k, 0, +1.0f);
|
||||
|
@ -712,12 +712,12 @@ void save_train_state_gguf(struct gguf_context * fctx, struct train_state * trai
|
|||
}
|
||||
|
||||
|
||||
struct llama_file {
|
||||
struct jarvis_file {
|
||||
// use FILE * so we don't have to re-open the file to mmap
|
||||
FILE * fp;
|
||||
size_t size;
|
||||
|
||||
llama_file(const char * fname, const char * mode) {
|
||||
jarvis_file(const char * fname, const char * mode) {
|
||||
fp = std::fopen(fname, mode);
|
||||
if (fp == NULL) {
|
||||
size = 0;
|
||||
|
@ -788,7 +788,7 @@ struct llama_file {
|
|||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
~llama_file() {
|
||||
~jarvis_file() {
|
||||
if (fp) {
|
||||
std::fclose(fp);
|
||||
}
|
||||
|
@ -823,16 +823,16 @@ static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nu
|
|||
}
|
||||
|
||||
size_t tokenize_file(
|
||||
struct llama_context * lctx,
|
||||
struct jarvis_context * lctx,
|
||||
const char * filename,
|
||||
const std::string & sample_start,
|
||||
bool include_sample_start,
|
||||
bool overlapping_samples,
|
||||
unsigned context_length,
|
||||
std::vector<llama_token> & out_tokens,
|
||||
std::vector<jarvis_token> & out_tokens,
|
||||
std::vector<size_t> & out_samples_begin,
|
||||
std::vector<size_t> & out_samples_size) {
|
||||
struct llama_file f(filename, "rb");
|
||||
struct jarvis_file f(filename, "rb");
|
||||
|
||||
if (f.size == 0) {
|
||||
out_tokens.clear();
|
||||
|
@ -844,7 +844,7 @@ size_t tokenize_file(
|
|||
}
|
||||
|
||||
// account for possible leading whitespace that will be added by tokenizer
|
||||
// e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
|
||||
// e.g. '\t' will be tokenized by jarvis spm tokenizer to [29871, 12]
|
||||
const int n_max_tokens_overhead = 1;
|
||||
|
||||
std::vector<char> buf;
|
||||
|
@ -862,8 +862,8 @@ size_t tokenize_file(
|
|||
// tokenize all data at once
|
||||
out_tokens.resize(buf.size() + n_max_tokens_overhead);
|
||||
|
||||
int n_tokens = llama_tokenize(
|
||||
llama_get_model(lctx),
|
||||
int n_tokens = jarvis_tokenize(
|
||||
jarvis_get_model(lctx),
|
||||
buf.data(),
|
||||
(int) buf.size(),
|
||||
out_tokens.data(),
|
||||
|
@ -871,8 +871,8 @@ size_t tokenize_file(
|
|||
false, false);
|
||||
if (n_tokens < 0) {
|
||||
out_tokens.resize(-n_tokens);
|
||||
n_tokens = llama_tokenize(
|
||||
llama_get_model(lctx),
|
||||
n_tokens = jarvis_tokenize(
|
||||
jarvis_get_model(lctx),
|
||||
buf.data(),
|
||||
(int) buf.size(),
|
||||
out_tokens.data(),
|
||||
|
@ -915,7 +915,7 @@ size_t tokenize_file(
|
|||
out_samples_size.resize(out_samples_begin.size(), 0);
|
||||
|
||||
std::vector<char> buf_sample;
|
||||
std::vector<llama_token> tok_sample;
|
||||
std::vector<jarvis_token> tok_sample;
|
||||
|
||||
const size_t sample_begin_offset = (include_sample_start ? 0 : sample_start.size());
|
||||
size_t found_too_big_sample = 0;
|
||||
|
@ -925,11 +925,11 @@ size_t tokenize_file(
|
|||
size_t found_max_sample_size = 0;
|
||||
|
||||
size_t max_token_text_size = 0;
|
||||
int n_vocab = llama_n_vocab(llama_get_model(lctx));
|
||||
for (llama_token token=0; token < n_vocab; ++token) {
|
||||
int n_vocab = jarvis_n_vocab(jarvis_get_model(lctx));
|
||||
for (jarvis_token token=0; token < n_vocab; ++token) {
|
||||
max_token_text_size = std::max(
|
||||
max_token_text_size,
|
||||
strlen(llama_token_get_text(llama_get_model(lctx), token)));
|
||||
strlen(jarvis_token_get_text(jarvis_get_model(lctx), token)));
|
||||
}
|
||||
|
||||
// upper bound of context byte length.
|
||||
|
@ -957,7 +957,7 @@ size_t tokenize_file(
|
|||
}
|
||||
|
||||
if (sample_size > 0) {
|
||||
// llama_tokenize expects zero terminated string,
|
||||
// jarvis_tokenize expects zero terminated string,
|
||||
// copy sample into buffer and zero terminate it.
|
||||
buf_sample.resize(sample_size);
|
||||
memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size);
|
||||
|
@ -966,7 +966,7 @@ size_t tokenize_file(
|
|||
|
||||
// tokenize the sample
|
||||
tok_sample.resize(buf_sample.size() + n_max_tokens_overhead);
|
||||
int n_tokens = llama_tokenize(llama_get_model(lctx),
|
||||
int n_tokens = jarvis_tokenize(jarvis_get_model(lctx),
|
||||
buf_sample.data(),
|
||||
(int) buf_sample.size(),
|
||||
tok_sample.data(),
|
||||
|
@ -974,7 +974,7 @@ size_t tokenize_file(
|
|||
false, false);
|
||||
if (n_tokens < 0) {
|
||||
tok_sample.resize(-n_tokens);
|
||||
n_tokens = llama_tokenize(llama_get_model(lctx),
|
||||
n_tokens = jarvis_tokenize(jarvis_get_model(lctx),
|
||||
buf_sample.data(),
|
||||
(int) buf_sample.size(),
|
||||
tok_sample.data(),
|
||||
|
@ -1365,7 +1365,7 @@ bool consume_common_train_arg(
|
|||
*invalid_param = true;
|
||||
return true;
|
||||
}
|
||||
if (llama_supports_gpu_offload()) {
|
||||
if (jarvis_supports_gpu_offload()) {
|
||||
params->n_gpu_layers = std::stoi(argv[i]);
|
||||
} else {
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
||||
|
|
|
@ -7,9 +7,9 @@
|
|||
#include <vector>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "jarvis.h"
|
||||
|
||||
#define LLAMA_TRAIN_MAX_NODES 16384
|
||||
#define JARVIS_TRAIN_MAX_NODES 16384
|
||||
|
||||
typedef std::string mt19937_state;
|
||||
|
||||
|
@ -92,9 +92,9 @@ struct train_opt_callback_data {
|
|||
struct train_state * train;
|
||||
save_train_files_callback save_cb;
|
||||
void * save_data;
|
||||
struct llama_context * lctx;
|
||||
struct jarvis_context * lctx;
|
||||
int last_save_iter;
|
||||
llama_token * tokens_data;
|
||||
jarvis_token * tokens_data;
|
||||
size_t tokens_size;
|
||||
size_t * samples_begin;
|
||||
size_t * samples_size;
|
||||
|
@ -146,18 +146,18 @@ void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
|
|||
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
|
||||
|
||||
size_t tokenize_file(
|
||||
struct llama_context * lctx,
|
||||
struct jarvis_context * lctx,
|
||||
const char * filename,
|
||||
const std::string & sample_start,
|
||||
bool include_sample_start,
|
||||
bool overlapping_samples,
|
||||
unsigned context_length,
|
||||
std::vector<llama_token> & out_tokens,
|
||||
std::vector<jarvis_token> & out_tokens,
|
||||
std::vector<size_t> & out_samples_begin,
|
||||
std::vector<size_t> & out_samples_size);
|
||||
|
||||
int64_t get_example_targets_batch(
|
||||
struct llama_context * lctx,
|
||||
struct jarvis_context * lctx,
|
||||
struct ggml_tensor * tokens_input,
|
||||
struct ggml_tensor * target_probs,
|
||||
int64_t example_id,
|
||||
|
@ -165,7 +165,7 @@ int64_t get_example_targets_batch(
|
|||
const size_t * samples_begin,
|
||||
const size_t * samples_size,
|
||||
size_t samples_count,
|
||||
const llama_token * train_data,
|
||||
const jarvis_token * train_data,
|
||||
size_t n_train_data,
|
||||
bool separate_with_eos,
|
||||
bool separate_with_bos,
|
||||
|
|
|
@ -49,7 +49,7 @@ class Model:
|
|||
_model_classes: dict[str, type[Model]] = {}
|
||||
|
||||
dir_model: Path
|
||||
ftype: gguf.LlamaFileType
|
||||
ftype: gguf.JarvisFileType
|
||||
fname_out: Path
|
||||
is_big_endian: bool
|
||||
endianess: gguf.GGUFEndian
|
||||
|
@ -69,7 +69,7 @@ class Model:
|
|||
# subclasses should define this!
|
||||
model_arch: gguf.MODEL_ARCH
|
||||
|
||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
|
||||
def __init__(self, dir_model: Path, ftype: gguf.JarvisFileType, fname_out: Path, is_big_endian: bool = False,
|
||||
use_temp_file: bool = False, eager: bool = False,
|
||||
metadata_override: Path | None = None, model_name: str | None = None,
|
||||
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
|
||||
|
@ -96,15 +96,15 @@ class Model:
|
|||
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
|
||||
|
||||
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
|
||||
if self.ftype == gguf.LlamaFileType.GUESSED:
|
||||
if self.ftype == gguf.JarvisFileType.GUESSED:
|
||||
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
|
||||
_, first_tensor = next(self.get_tensors())
|
||||
if first_tensor.dtype == torch.float16:
|
||||
logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
|
||||
self.ftype = gguf.LlamaFileType.MOSTLY_F16
|
||||
self.ftype = gguf.JarvisFileType.MOSTLY_F16
|
||||
else:
|
||||
logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
|
||||
self.ftype = gguf.LlamaFileType.MOSTLY_BF16
|
||||
self.ftype = gguf.JarvisFileType.MOSTLY_BF16
|
||||
|
||||
# Configure GGUF Writer
|
||||
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
|
||||
|
@ -308,7 +308,7 @@ class Model:
|
|||
if n_dims <= 1 or new_name.endswith("_norm.weight"):
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
|
||||
# Conditions should closely match those in llama_model_quantize_internal in llama.cpp
|
||||
# Conditions should closely match those in jarvis_model_quantize_internal in jarvis.cpp
|
||||
# Some tensor types are always in float32
|
||||
if data_qtype is False and (
|
||||
any(
|
||||
|
@ -337,25 +337,25 @@ class Model:
|
|||
)
|
||||
):
|
||||
if self.ftype in (
|
||||
gguf.LlamaFileType.MOSTLY_TQ1_0,
|
||||
gguf.LlamaFileType.MOSTLY_TQ2_0,
|
||||
gguf.JarvisFileType.MOSTLY_TQ1_0,
|
||||
gguf.JarvisFileType.MOSTLY_TQ2_0,
|
||||
):
|
||||
# TODO: use Q4_K and Q6_K
|
||||
data_qtype = gguf.GGMLQuantizationType.F16
|
||||
|
||||
# No override (data_qtype is False), or wants to be quantized (data_qtype is True)
|
||||
if isinstance(data_qtype, bool):
|
||||
if self.ftype == gguf.LlamaFileType.ALL_F32:
|
||||
if self.ftype == gguf.JarvisFileType.ALL_F32:
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
|
||||
elif self.ftype == gguf.JarvisFileType.MOSTLY_F16:
|
||||
data_qtype = gguf.GGMLQuantizationType.F16
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
|
||||
elif self.ftype == gguf.JarvisFileType.MOSTLY_BF16:
|
||||
data_qtype = gguf.GGMLQuantizationType.BF16
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
|
||||
elif self.ftype == gguf.JarvisFileType.MOSTLY_Q8_0:
|
||||
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
|
||||
elif self.ftype == gguf.JarvisFileType.MOSTLY_TQ1_0:
|
||||
data_qtype = gguf.GGMLQuantizationType.TQ1_0
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
|
||||
elif self.ftype == gguf.JarvisFileType.MOSTLY_TQ2_0:
|
||||
data_qtype = gguf.GGMLQuantizationType.TQ2_0
|
||||
else:
|
||||
raise ValueError(f"Unknown file type: {self.ftype.name}")
|
||||
|
@ -394,7 +394,7 @@ class Model:
|
|||
if self.metadata.size_label is None and total_params > 0:
|
||||
self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
|
||||
|
||||
# Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0'
|
||||
# Extract the encoding scheme from the file type name. e.g. 'gguf.JarvisFileType.MOSTLY_Q8_0' --> 'Q8_0'
|
||||
output_type: str = self.ftype.name.partition("_")[2]
|
||||
|
||||
# Filename Output
|
||||
|
@ -537,13 +537,13 @@ class Model:
|
|||
|
||||
# NOTE: this function is generated by convert_hf_to_gguf_update.py
|
||||
# do not modify it manually!
|
||||
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
|
||||
# ref: https://github.com/ggerganov/jarvis.cpp/pull/6920
|
||||
# Marker: Start get_vocab_base_pre
|
||||
def get_vocab_base_pre(self, tokenizer) -> str:
|
||||
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
||||
# is specific for the BPE pre-tokenizer used by the model
|
||||
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
|
||||
# use in llama.cpp to implement the same pre-tokenizer
|
||||
# use in jarvis.cpp to implement the same pre-tokenizer
|
||||
|
||||
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
||||
|
||||
|
@ -559,8 +559,8 @@ class Model:
|
|||
# or pull the latest version of the model from Huggingface
|
||||
# don't edit the hashes manually!
|
||||
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
|
||||
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||
res = "llama-bpe"
|
||||
# ref: https://huggingface.co/meta-jarvis/Meta-Jarvis-3-8B
|
||||
res = "jarvis-bpe"
|
||||
if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
|
||||
# ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
|
||||
res = "deepseek-llm"
|
||||
|
@ -616,7 +616,7 @@ class Model:
|
|||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
|
||||
res = "jina-v2-de"
|
||||
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
|
||||
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
|
||||
# ref: https://huggingface.co/abacusai/Smaug-Jarvis-3-70B-Instruct
|
||||
res = "smaug-bpe"
|
||||
if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
|
||||
# ref: https://huggingface.co/LumiOpen/Poro-34B-chat
|
||||
|
@ -666,7 +666,7 @@ class Model:
|
|||
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
|
||||
logger.warning("** - the pre-tokenization config has changed upstream")
|
||||
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
|
||||
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
|
||||
logger.warning("** ref: https://github.com/ggerganov/jarvis.cpp/pull/6920")
|
||||
logger.warning("**")
|
||||
logger.warning(f"** chkhsh: {chkhsh}")
|
||||
logger.warning("**************************************************************************************")
|
||||
|
@ -746,7 +746,7 @@ class Model:
|
|||
def _set_vocab_sentencepiece(self, add_to_gguf=True):
|
||||
tokens, scores, toktypes = self._create_vocab_sentencepiece()
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
|
@ -835,8 +835,8 @@ class Model:
|
|||
|
||||
return tokens, scores, toktypes
|
||||
|
||||
def _set_vocab_llama_hf(self):
|
||||
vocab = gguf.LlamaHfVocab(self.dir_model)
|
||||
def _set_vocab_jarvis_hf(self):
|
||||
vocab = gguf.JarvisHfVocab(self.dir_model)
|
||||
tokens = []
|
||||
scores = []
|
||||
toktypes = []
|
||||
|
@ -848,7 +848,7 @@ class Model:
|
|||
|
||||
assert len(tokens) == vocab.vocab_size
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
|
@ -857,7 +857,7 @@ class Model:
|
|||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
|
||||
def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "jarvis-spm"], vocab_size: int):
|
||||
tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
|
||||
logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
|
||||
vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
|
||||
|
@ -875,7 +875,7 @@ class Model:
|
|||
assert field # token list
|
||||
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
|
||||
|
||||
if model_name == "llama-spm":
|
||||
if model_name == "jarvis-spm":
|
||||
field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
|
||||
assert field # token scores
|
||||
self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
|
||||
|
@ -884,7 +884,7 @@ class Model:
|
|||
assert field # token types
|
||||
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
|
||||
|
||||
if model_name != "llama-spm":
|
||||
if model_name != "jarvis-spm":
|
||||
field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
|
||||
assert field # token merges
|
||||
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
|
||||
|
@ -1226,7 +1226,7 @@ class XverseModel(Model):
|
|||
tokens.append(token_text)
|
||||
toktypes.append(toktype)
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
@ -1515,21 +1515,21 @@ class StableLMModel(Model):
|
|||
raise ValueError(f"Unprocessed norms: {norms}")
|
||||
|
||||
|
||||
@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
|
||||
class LlamaModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.LLAMA
|
||||
@Model.register("LLaMAForCausalLM", "JarvisForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
|
||||
class JarvisModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.JARVIS
|
||||
|
||||
def set_vocab(self):
|
||||
try:
|
||||
self._set_vocab_sentencepiece()
|
||||
except FileNotFoundError:
|
||||
try:
|
||||
self._set_vocab_llama_hf()
|
||||
self._set_vocab_jarvis_hf()
|
||||
except (FileNotFoundError, TypeError):
|
||||
# Llama 3
|
||||
# Jarvis 3
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
# Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
|
||||
# Apply to CodeJarvis only (and ignore for Jarvis 3 with a vocab size of 128256)
|
||||
if self.hparams.get("vocab_size", 32000) == 32016:
|
||||
special_vocab = gguf.SpecialVocab(
|
||||
self.dir_model, load_merges=False,
|
||||
|
@ -1583,9 +1583,9 @@ class LlamaModel(Model):
|
|||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
|
||||
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||
data_torch = JarvisModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
||||
data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head)
|
||||
|
||||
# process the experts separately
|
||||
if name.find("block_sparse_moe.experts") != -1:
|
||||
|
@ -1625,7 +1625,7 @@ class LlamaModel(Model):
|
|||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
||||
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
||||
if rope_scaling.get("rope_type", '').lower() == "jarvis3":
|
||||
base = self.hparams.get("rope_theta", 10000.0)
|
||||
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
||||
|
@ -1793,7 +1793,7 @@ class DbrxModel(Model):
|
|||
|
||||
# Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
|
||||
# original implementation expects (n_expert, n_ff, n_embd) for all experts weights
|
||||
# But llama.cpp moe graph works differently
|
||||
# But jarvis.cpp moe graph works differently
|
||||
# AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
|
||||
# so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
|
||||
exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
|
||||
|
@ -1842,7 +1842,7 @@ class MiniCPMModel(Model):
|
|||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_llama_hf()
|
||||
self._set_vocab_jarvis_hf()
|
||||
|
||||
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
|
||||
if n_kv_head is not None and n_head != n_kv_head:
|
||||
|
@ -2188,7 +2188,7 @@ class Phi3MiniModel(Model):
|
|||
if foken_data.get("special"):
|
||||
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
|
@ -2456,7 +2456,7 @@ class InternLM2Model(Model):
|
|||
if foken_data.get("special"):
|
||||
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
|
@ -2468,7 +2468,7 @@ class InternLM2Model(Model):
|
|||
if chat_eos_token_id is not None:
|
||||
# For the chat model, we replace the eos with '<|im_end|>'.
|
||||
# TODO: this is a hack, should be fixed
|
||||
# https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
|
||||
# https://github.com/ggerganov/jarvis.cpp/pull/6745#issuecomment-2067687048
|
||||
special_vocab.special_token_ids["eos"] = chat_eos_token_id
|
||||
logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
|
||||
" in chat mode so that the conversation can end normally.")
|
||||
|
@ -2505,8 +2505,8 @@ class InternLM2Model(Model):
|
|||
q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1]
|
||||
|
||||
# The model weights of q and k equire additional reshape.
|
||||
q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
|
||||
k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
|
||||
q = JarvisModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads)
|
||||
k = JarvisModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads)
|
||||
v = v.reshape((-1, v.shape[-1]))
|
||||
|
||||
return [
|
||||
|
@ -2769,7 +2769,7 @@ class GemmaModel(Model):
|
|||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
# lm_head is not used in llama.cpp, while autoawq will include this tensor in model
|
||||
# lm_head is not used in jarvis.cpp, while autoawq will include this tensor in model
|
||||
# To prevent errors, skip loading lm_head.weight.
|
||||
if name == "lm_head.weight":
|
||||
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
|
||||
|
@ -2816,7 +2816,7 @@ class Gemma2Model(Model):
|
|||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
# lm_head is not used in llama.cpp, while autoawq will include this tensor in model
|
||||
# lm_head is not used in jarvis.cpp, while autoawq will include this tensor in model
|
||||
# To prevent errors, skip loading lm_head.weight.
|
||||
if name == "lm_head.weight":
|
||||
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
|
||||
|
@ -2894,7 +2894,7 @@ class Rwkv6Model(Model):
|
|||
self.gguf_writer.add_feed_forward_length(intermediate_size)
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
# required by llama.cpp, unused
|
||||
# required by jarvis.cpp, unused
|
||||
self.gguf_writer.add_head_count(0)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
|
@ -3024,7 +3024,7 @@ class OlmoModel(Model):
|
|||
self.gguf_writer.add_clamp_kqv(clip_qkv)
|
||||
|
||||
# Same as super class, but permuting q_proj, k_proj
|
||||
# Copied from: LlamaModel
|
||||
# Copied from: JarvisModel
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
|
@ -3032,9 +3032,9 @@ class OlmoModel(Model):
|
|||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
|
||||
if name.endswith("q_proj.weight"):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||
data_torch = JarvisModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith("k_proj.weight"):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
||||
data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head)
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
@ -3174,12 +3174,12 @@ class OpenELMModel(Model):
|
|||
assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
|
||||
assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
|
||||
|
||||
# Uses the tokenizer from meta-llama/Llama-2-7b-hf
|
||||
# Uses the tokenizer from meta-jarvis/Jarvis-2-7b-hf
|
||||
def set_vocab(self):
|
||||
try:
|
||||
self._set_vocab_sentencepiece()
|
||||
except FileNotFoundError:
|
||||
self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"])
|
||||
self._set_vocab_builtin("jarvis-spm", self.hparams["vocab_size"])
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
n_embd = self._n_embd
|
||||
|
@ -3300,7 +3300,7 @@ class ArcticModel(Model):
|
|||
toktypes[token_id] = token_type
|
||||
scores[token_id] = token_score
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
|
@ -3322,9 +3322,9 @@ class ArcticModel(Model):
|
|||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
|
||||
if name.endswith("q_proj.weight"):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||
data_torch = JarvisModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith("k_proj.weight"):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
||||
data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head)
|
||||
|
||||
# process the experts separately
|
||||
if name.find("block_sparse_moe.experts") != -1:
|
||||
|
@ -3882,7 +3882,7 @@ class ChatGLMModel(Model):
|
|||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_model("jarvis")
|
||||
# glm3 needs prefix and suffix formatted as:
|
||||
# prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
|
||||
self.gguf_writer.add_tokenizer_pre("chatglm-spm")
|
||||
|
@ -4087,7 +4087,7 @@ class ExaoneModel(Model):
|
|||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
||||
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
||||
if rope_scaling.get("rope_type", '').lower() == "jarvis3":
|
||||
base = self.hparams.get("rope_theta", 10000.0)
|
||||
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
||||
|
@ -4116,12 +4116,12 @@ class ExaoneModel(Model):
|
|||
|
||||
|
||||
@Model.register("GraniteForCausalLM")
|
||||
class GraniteModel(LlamaModel):
|
||||
class GraniteModel(JarvisModel):
|
||||
"""Conversion for IBM's GraniteForCausalLM"""
|
||||
model_arch = gguf.MODEL_ARCH.GRANITE
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
"""Granite uses standard llama parameters with the following differences:
|
||||
"""Granite uses standard jarvis parameters with the following differences:
|
||||
|
||||
- No head_dim support
|
||||
- New multiplier params:
|
||||
|
@ -4196,9 +4196,9 @@ class ChameleonModel(Model):
|
|||
hidden_dim = self.hparams.get("hidden_size")
|
||||
|
||||
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||
data_torch = JarvisModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
||||
data_torch = JarvisModel.permute(data_torch, n_head, n_kv_head)
|
||||
if name.endswith(("q_norm.weight", "q_norm.bias")):
|
||||
data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
|
||||
if name.endswith(("k_norm.weight", "k_norm.bias")):
|
||||
|
@ -4379,14 +4379,14 @@ def main() -> None:
|
|||
logger.error(f'Error: {args.model} is not a directory')
|
||||
sys.exit(1)
|
||||
|
||||
ftype_map: dict[str, gguf.LlamaFileType] = {
|
||||
"f32": gguf.LlamaFileType.ALL_F32,
|
||||
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
||||
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
||||
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
||||
"tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
|
||||
"tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
|
||||
"auto": gguf.LlamaFileType.GUESSED,
|
||||
ftype_map: dict[str, gguf.JarvisFileType] = {
|
||||
"f32": gguf.JarvisFileType.ALL_F32,
|
||||
"f16": gguf.JarvisFileType.MOSTLY_F16,
|
||||
"bf16": gguf.JarvisFileType.MOSTLY_BF16,
|
||||
"q8_0": gguf.JarvisFileType.MOSTLY_Q8_0,
|
||||
"tq1_0": gguf.JarvisFileType.MOSTLY_TQ1_0,
|
||||
"tq2_0": gguf.JarvisFileType.MOSTLY_TQ2_0,
|
||||
"auto": gguf.JarvisFileType.GUESSED,
|
||||
}
|
||||
|
||||
is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
|
||||
|
|
|
@ -5,10 +5,10 @@
|
|||
# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
|
||||
#
|
||||
# This is necessary in order to analyze the type of pre-tokenizer used by the model and
|
||||
# provide the necessary information to llama.cpp via the GGUF header in order to implement
|
||||
# provide the necessary information to jarvis.cpp via the GGUF header in order to implement
|
||||
# the same pre-tokenizer.
|
||||
#
|
||||
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
|
||||
# ref: https://github.com/ggerganov/jarvis.cpp/pull/6920
|
||||
#
|
||||
# Instructions:
|
||||
#
|
||||
|
@ -18,9 +18,9 @@
|
|||
# python3 convert_hf_to_gguf_update.py <huggingface_token>
|
||||
#
|
||||
# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
|
||||
# - Update llama.cpp with the new pre-tokenizer if necessary
|
||||
# - Update jarvis.cpp with the new pre-tokenizer if necessary
|
||||
#
|
||||
# TODO: generate tokenizer tests for llama.cpp
|
||||
# TODO: generate tokenizer tests for jarvis.cpp
|
||||
#
|
||||
|
||||
import logging
|
||||
|
@ -65,8 +65,8 @@ else:
|
|||
|
||||
# TODO: add models here, base models preferred
|
||||
models = [
|
||||
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
|
||||
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
|
||||
{"name": "jarvis-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-jarvis/Jarvis-2-7b-hf", },
|
||||
{"name": "jarvis-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-jarvis/Meta-Jarvis-3-8B", },
|
||||
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
|
||||
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
|
||||
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
|
||||
|
@ -86,7 +86,7 @@ models = [
|
|||
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
||||
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
|
||||
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Jarvis-3-70B-Instruct", },
|
||||
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
|
||||
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
|
||||
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
|
||||
|
@ -215,7 +215,7 @@ src_func = f"""
|
|||
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
||||
# is specific for the BPE pre-tokenizer used by the model
|
||||
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
|
||||
# use in llama.cpp to implement the same pre-tokenizer
|
||||
# use in jarvis.cpp to implement the same pre-tokenizer
|
||||
|
||||
chktxt = {repr(CHK_TXT)}
|
||||
|
||||
|
@ -239,7 +239,7 @@ src_func = f"""
|
|||
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
|
||||
logger.warning("** - the pre-tokenization config has changed upstream")
|
||||
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
|
||||
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
|
||||
logger.warning("** ref: https://github.com/ggerganov/jarvis.cpp/pull/6920")
|
||||
logger.warning("**")
|
||||
logger.warning(f"** chkhsh: {{chkhsh}}")
|
||||
logger.warning("**************************************************************************************")
|
||||
|
@ -311,7 +311,7 @@ tests = [
|
|||
"3333333",
|
||||
"33333333",
|
||||
"333333333",
|
||||
"Cửa Việt", # llama-bpe fails on this
|
||||
"Cửa Việt", # jarvis-bpe fails on this
|
||||
" discards",
|
||||
CHK_TXT,
|
||||
]
|
||||
|
|
|
@ -223,13 +223,13 @@ class GGMLToGGUF:
|
|||
assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
|
||||
logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
|
||||
self.n_kv_head = n_kv_head
|
||||
self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
|
||||
self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.JARVIS, ggml_model.hyperparameters.n_layer)
|
||||
|
||||
def save(self):
|
||||
logger.info('* Preparing to save GGUF file')
|
||||
gguf_writer = gguf.GGUFWriter(
|
||||
self.cfg.output,
|
||||
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
|
||||
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.JARVIS],
|
||||
use_temp_file = False)
|
||||
self.add_params(gguf_writer)
|
||||
self.add_vocab(gguf_writer)
|
||||
|
@ -286,7 +286,7 @@ class GGMLToGGUF:
|
|||
|
||||
def add_vocab(self, gguf_writer):
|
||||
hp = self.model.hyperparameters
|
||||
gguf_writer.add_tokenizer_model('llama')
|
||||
gguf_writer.add_tokenizer_model('jarvis')
|
||||
gguf_writer.add_tokenizer_pre('default')
|
||||
tokens = []
|
||||
scores = []
|
||||
|
@ -358,7 +358,7 @@ class GGMLToGGUF:
|
|||
|
||||
|
||||
def handle_metadata(cfg, hp):
|
||||
import examples.convert_legacy_llama as convert
|
||||
import examples.convert_legacy_jarvis as convert
|
||||
|
||||
assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
|
||||
hf_config_path = cfg.model_metadata_dir / "config.json"
|
||||
|
|
|
@ -271,12 +271,12 @@ if __name__ == '__main__':
|
|||
args = parse_args()
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
|
||||
ftype_map: dict[str, gguf.LlamaFileType] = {
|
||||
"f32": gguf.LlamaFileType.ALL_F32,
|
||||
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
||||
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
||||
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
||||
"auto": gguf.LlamaFileType.GUESSED,
|
||||
ftype_map: dict[str, gguf.JarvisFileType] = {
|
||||
"f32": gguf.JarvisFileType.ALL_F32,
|
||||
"f16": gguf.JarvisFileType.MOSTLY_F16,
|
||||
"bf16": gguf.JarvisFileType.MOSTLY_BF16,
|
||||
"q8_0": gguf.JarvisFileType.MOSTLY_Q8_0,
|
||||
"auto": gguf.JarvisFileType.GUESSED,
|
||||
}
|
||||
|
||||
ftype = ftype_map[args.outtype]
|
||||
|
@ -372,9 +372,9 @@ if __name__ == '__main__':
|
|||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
dest = list(super().modify_tensors(data_torch, name, bid))
|
||||
# some archs may have the same tensor for lm_head and output (tie word embeddings)
|
||||
# in this case, adapters targeting lm_head will fail when using llama-export-lora
|
||||
# in this case, adapters targeting lm_head will fail when using jarvis-export-lora
|
||||
# therefore, we ignore them for now
|
||||
# see: https://github.com/ggerganov/llama.cpp/issues/9065
|
||||
# see: https://github.com/ggerganov/jarvis.cpp/issues/9065
|
||||
if name == "lm_head.weight" and len(dest) == 0:
|
||||
raise ValueError("lm_head is present in adapter, but is ignored in base model")
|
||||
for dest_name, dest_data in dest:
|
||||
|
|
|
@ -5,14 +5,14 @@

[Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid.

With Termux, you can install and run `llama.cpp` as if the environment were Linux. Once in the Termux shell:
With Termux, you can install and run `jarvis.cpp` as if the environment were Linux. Once in the Termux shell:

```
$ apt update && apt upgrade -y
$ apt install git cmake
```

Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake.
Then, follow the [build instructions](https://github.com/ggerganov/jarvis.cpp/blob/master/docs/build.md), specifically for CMake.

Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance:

@ -20,22 +20,22 @@ Once the binaries are built, download your model of choice (e.g., from Hugging F
$ curl -L {model-url} -o ~/{model}.gguf
```

Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:
Then, if you are not already in the repo directory, `cd` into `jarvis.cpp` and:

```
$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
$ ./build/bin/jarvis-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
```

Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
Here, we show `jarvis-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.

To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:

https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4

## Cross-compile using Android NDK
It's possible to build `llama.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.)
It's possible to build `jarvis.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.)

Once you're ready and have cloned `llama.cpp`, invoke the following in the project directory:
Once you're ready and have cloned `jarvis.cpp`, invoke the following in the project directory:

```
$ cmake \

@ -45,15 +45,15 @@ $ cmake \
-DCMAKE_C_FLAGS="-march=armv8.7a" \
-DCMAKE_CXX_FLAGS="-march=armv8.7a" \
-DGGML_OPENMP=OFF \
-DGGML_LLAMAFILE=OFF \
-DGGML_JARVISFILE=OFF \
-B build-android
```

Notes:
- While later versions of Android NDK ship with OpenMP, it must still be installed by CMake as a dependency, which is not supported at this time
- `llamafile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/llamafile/issues/325)
- `jarvisfile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/jarvisfile/issues/325)

The above command should configure `llama.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `llama.cpp` includes runtime checks for available CPU features it can use.
The above command should configure `jarvis.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `jarvis.cpp` includes runtime checks for available CPU features it can use.

Feel free to adjust the Android ABI for your target. Once the project is configured:

@ -65,17 +65,17 @@ $ cmake --install build-android --prefix {install-dir} --config Release
After installing, go ahead and download the model of your choice to your host system. Then:

```
$ adb shell "mkdir /data/local/tmp/llama.cpp"
$ adb push {install-dir} /data/local/tmp/llama.cpp/
$ adb push {model}.gguf /data/local/tmp/llama.cpp/
$ adb shell "mkdir /data/local/tmp/jarvis.cpp"
$ adb push {install-dir} /data/local/tmp/jarvis.cpp/
$ adb push {model}.gguf /data/local/tmp/jarvis.cpp/
$ adb shell
```

In the `adb shell`:

```
$ cd /data/local/tmp/llama.cpp
$ LD_LIBRARY_PATH=lib ./bin/llama-simple -m {model}.gguf -c {context-size} -p "{your-prompt}"
$ cd /data/local/tmp/jarvis.cpp
$ LD_LIBRARY_PATH=lib ./bin/jarvis-simple -m {model}.gguf -c {context-size} -p "{your-prompt}"
```

That's it!

@ -25,13 +25,13 @@ sudo make install

We recommend using openmp since it's easier to modify the cores being used.

### llama.cpp compilation
### jarvis.cpp compilation

Makefile:

```bash
make GGML_BLIS=1 -j
# make GGML_BLIS=1 llama-benchmark-matmult
# make GGML_BLIS=1 jarvis-benchmark-matmult
```

CMake:

@ -43,7 +43,7 @@ cmake -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME ..
make -j
```

### llama.cpp execution
### jarvis.cpp execution

According to the BLIS documentation, we could set the following
environment variables to modify the behavior of openmp:

@ -1,4 +1,4 @@
|
|||
# llama.cpp for CANN
|
||||
# jarvis.cpp for CANN
|
||||
|
||||
- [Background](#background)
|
||||
- [News](#news)
|
||||
|
@ -17,9 +17,9 @@
|
|||
|
||||
**CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform.
|
||||
|
||||
**Llama.cpp + CANN**
|
||||
**Jarvis.cpp + CANN**
|
||||
|
||||
The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are intergrated to CANN Toolkit and kernels to using Ascend NPU directly.
|
||||
The jarvis.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are intergrated to CANN Toolkit and kernels to using Ascend NPU directly.
|
||||
|
||||
## News
|
||||
|
||||
|
@ -78,11 +78,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
|||
| GritLM-7B | √ | √ | √ |
|
||||
| internlm2_5-7b-chat | √ | √ | √ |
|
||||
| koala-7B-HF | √ | √ | √ |
|
||||
| Llama-2-7b-chat-hf | √ | √ | √ |
|
||||
| Llama-3-Smaug-8B | √ | √ | √ |
|
||||
| Llama2-Chinese-7b-Chat | √ | √ | √ |
|
||||
| Llama3-8B | √ | √ | √ |
|
||||
| Llama3-8b-chinese | √ | √ | √ |
|
||||
| Jarvis-2-7b-chat-hf | √ | √ | √ |
|
||||
| Jarvis-3-Smaug-8B | √ | √ | √ |
|
||||
| Jarvis2-Chinese-7b-Chat | √ | √ | √ |
|
||||
| Jarvis3-8B | √ | √ | √ |
|
||||
| Jarvis3-8b-chinese | √ | √ | √ |
|
||||
| mamba-130m-hf | √ | √ | √ |
|
||||
| Mistral-7B-Instruct-v0.2 | √ | √ | √ |
|
||||
| Mixtral-8x7B-Instruct-v0.1 | x | √ | √ |
|
||||
|
@ -120,9 +120,9 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
|
|||
## Docker
|
||||
|
||||
### Build Images
|
||||
You can get a image with llama.cpp in one command.
|
||||
You can get a image with jarvis.cpp in one command.
|
||||
```sh
|
||||
docker build -t llama-cpp-cann -f .devops/llama-cli-cann.Dockerfile .
|
||||
docker build -t jarvis-cpp-cann -f .devops/jarvis-cli-cann.Dockerfile .
|
||||
```
|
||||
|
||||
### Run container
|
||||
|
@ -133,7 +133,7 @@ npu-smi info
|
|||
|
||||
# Select the cards that you want to use, make sure these cards are not used by someone.
|
||||
# Following using cards of device0.
|
||||
docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
|
||||
docker run --name jarviscpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it jarvis-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
|
||||
```
|
||||
|
||||
*Notes:*
|
||||
|
@ -208,7 +208,7 @@ docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager
|
|||
|
||||
Upon a successful installation, CANN is enabled for the available Ascend devices.
|
||||
|
||||
### II. Build llama.cpp
|
||||
### II. Build jarvis.cpp
|
||||
|
||||
```sh
|
||||
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
|
||||
|
@ -242,13 +242,13 @@ cmake --build build --config release
|
|||
- Use device 0:
|
||||
|
||||
```sh
|
||||
./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||
./build/bin/jarvis-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||
```
|
||||
|
||||
- Use multiple devices:
|
||||
|
||||
```sh
|
||||
./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||
./build/bin/jarvis-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||
```
|
||||
|
||||
### **GitHub contribution**:
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# llama.cpp for SYCL
|
||||
# jarvis.cpp for SYCL
|
||||
|
||||
- [Background](#background)
|
||||
- [Recommended Release](#recommended-release)
|
||||
|
@ -24,9 +24,9 @@
|
|||
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
|
||||
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
|
||||
|
||||
### Llama.cpp + SYCL
|
||||
### Jarvis.cpp + SYCL
|
||||
|
||||
The llama.cpp SYCL backend is designed to support **Intel GPU** first and foremost. Thanks to the cross-platform nature of SYCL, it also supports GPUs from other vendors: Nvidia and AMD.
|
||||
The jarvis.cpp SYCL backend is designed to support **Intel GPU** first and foremost. Thanks to the cross-platform nature of SYCL, it also supports GPUs from other vendors: Nvidia and AMD.
|
||||
|
||||
## Recommended Release
|
||||
|
||||
|
@ -36,7 +36,7 @@ The following release is verified with good quality:
|
|||
|
||||
|Commit ID|Tag|Release|Verified Platform|
|
||||
|-|-|-|-|
|
||||
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
|
||||
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[jarvis-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/jarvis.cpp/releases/download/b3038/jarvis-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
|
||||
|
||||
|
||||
## News
|
||||
|
@ -46,7 +46,7 @@ The following release is verified with good quality:
|
|||
- Use oneDNN as the default GEMM library, improving compatibility with new Intel GPUs.
|
||||
|
||||
- 2024.5
|
||||
- Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
|
||||
- Performance is increased: 34 -> 37 tokens/s of jarvis-2-7b.Q4_0 on Arc770.
|
||||
- Arch Linux is verified successfully.
|
||||
|
||||
- 2024.4
|
||||
|
@ -54,8 +54,8 @@ The following release is verified with good quality:
|
|||
|
||||
- 2024.3
|
||||
- Release binary files for Windows.
|
||||
- A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
|
||||
- New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
|
||||
- A blog is published: **Run LLM on all Intel GPUs Using jarvis.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-jarvis-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-jarvis-cpp-fd2e2dcbd9bd).
|
||||
- New base line is ready: [tag b2437](https://github.com/ggerganov/jarvis.cpp/tree/b2437).
|
||||
- Support multiple cards: **--split-mode**: [none|layer]; [row] is not supported yet and is still under development.
|
||||
- Support assigning the main GPU with **--main-gpu**, replacing $GGML_SYCL_DEVICE.
|
||||
- Support detecting all level-zero GPUs that share the same top **Max compute units**.
|
||||
|
@ -100,9 +100,9 @@ SYCL backend supports Intel GPU Family:
|
|||
*Notes:*
|
||||
|
||||
- **Memory**
|
||||
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
|
||||
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/jarvis-cli`.
|
||||
|
||||
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For example, *llama-2-7b.Q4_0* requires at least 8.0GB for an integrated GPU and 4.0GB for a discrete GPU.
|
||||
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For example, *jarvis-2-7b.Q4_0* requires at least 8.0GB for an integrated GPU and 4.0GB for a discrete GPU.
|
||||
|
||||
- **Execution Unit (EU)**
|
||||
- If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use.
|
||||
|
@ -130,14 +130,14 @@ The docker build option is currently limited to *intel GPU* targets.
|
|||
### Build image
|
||||
```sh
|
||||
# Using FP16
|
||||
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
|
||||
docker build -t jarvis-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/jarvis-cli-intel.Dockerfile .
|
||||
```
|
||||
|
||||
*Notes*:
|
||||
|
||||
To build with the default FP32 precision *(slower than the FP16 alternative)*, remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command, as sketched below.
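A minimal sketch of that FP32 build, assuming nothing else changes in the command:

```sh
# Default FP32 build: same as above, just without the GGML_SYCL_F16 build argument
docker build -t jarvis-cpp-sycl -f .devops/jarvis-cli-intel.Dockerfile .
```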
|
||||
|
||||
You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
|
||||
You can also use the `.devops/jarvis-server-intel.Dockerfile`, which builds the *"server"* alternative.
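For example, a sketch of the corresponding build command (the `jarvis-server-sycl` tag is illustrative; the FP16 build argument is assumed to behave the same way for this Dockerfile):

```sh
# Build the server variant of the SYCL image
docker build -t jarvis-server-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/jarvis-server-intel.Dockerfile .
```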
|
||||
|
||||
### Run container
|
||||
|
||||
|
@ -145,7 +145,7 @@ You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *
|
|||
# First, find all the DRI cards
|
||||
ls -la /dev/dri
|
||||
# Then, pick the card that you want to use (here, for example, /dev/dri/card1).
|
||||
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 jarvis-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
```
|
||||
|
||||
*Notes:*
|
||||
|
@ -276,7 +276,7 @@ For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:
|
|||
[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9]
|
||||
```
|
||||
|
||||
### II. Build llama.cpp
|
||||
### II. Build jarvis.cpp
|
||||
|
||||
#### Intel GPU
|
||||
|
||||
|
@ -309,7 +309,7 @@ export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
|
|||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
|
||||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
|
||||
|
||||
# Build LLAMA with Nvidia BLAS acceleration through SYCL
|
||||
# Build JARVIS with Nvidia BLAS acceleration through SYCL
|
||||
|
||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
@ -329,7 +329,7 @@ export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
|
|||
export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
|
||||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
|
||||
|
||||
# Build LLAMA with rocBLAS acceleration through SYCL
|
||||
# Build JARVIS with rocBLAS acceleration through SYCL
|
||||
|
||||
## AMD
|
||||
# Use FP32, FP16 is not supported
|
||||
|
@ -344,7 +344,7 @@ cmake --build build --config Release -j -v
|
|||
|
||||
#### Retrieve and prepare model
|
||||
|
||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as an example.
|
||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [jarvis-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Jarvis-2-7B-GGUF/blob/main/jarvis-2-7b.Q4_0.gguf) model as an example.
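For instance, a hedged sketch of the download from the command line (the URL mirrors the link above, using Hugging Face's `resolve` path for a direct file download; adjust the destination folder as needed):

```sh
# Illustrative: fetch the example model into ./models
wget -P models https://huggingface.co/TheBloke/Jarvis-2-7B-GGUF/resolve/main/jarvis-2-7b.Q4_0.gguf
```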
|
||||
|
||||
##### Check device
|
||||
|
||||
|
@ -359,7 +359,7 @@ source /opt/intel/oneapi/setvars.sh
|
|||
Similar to the native `sycl-ls`, available SYCL devices can be queried as follows:
|
||||
|
||||
```sh
|
||||
./build/bin/llama-ls-sycl-device
|
||||
./build/bin/jarvis-ls-sycl-device
|
||||
```
|
||||
|
||||
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *Intel GPUs* it would look like the following:
|
||||
|
@ -390,12 +390,12 @@ Choose one of following methods to run.
|
|||
- Use device 0:
|
||||
|
||||
```sh
|
||||
./examples/sycl/run-llama2.sh 0
|
||||
./examples/sycl/run-jarvis2.sh 0
|
||||
```
|
||||
- Use multiple devices:
|
||||
|
||||
```sh
|
||||
./examples/sycl/run-llama2.sh
|
||||
./examples/sycl/run-jarvis2.sh
|
||||
```
|
||||
|
||||
2. Command line
|
||||
|
@ -418,13 +418,13 @@ Examples:
|
|||
- Use device 0:
|
||||
|
||||
```sh
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/jarvis-cli -m models/jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||
```
|
||||
|
||||
- Use multiple devices:
|
||||
|
||||
```sh
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/jarvis-cli -m models/jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||
```
|
||||
|
||||
*Notes:*
|
||||
|
@ -492,7 +492,7 @@ a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can
|
|||
b. The new Visual Studio will install Ninja by default. (If not, please install it manually: https://ninja-build.org/)
|
||||
|
||||
|
||||
### II. Build llama.cpp
|
||||
### II. Build jarvis.cpp
|
||||
|
||||
You can download the release package for Windows directly; it includes the binary files and the required oneAPI DLL files.
|
||||
|
||||
|
@ -506,7 +506,7 @@ Choose one of following methods to build from source code.
|
|||
|
||||
2. CMake
|
||||
|
||||
On the oneAPI command line window, step into the llama.cpp main directory and run the following:
|
||||
On the oneAPI command line window, step into the jarvis.cpp main directory and run the following:
|
||||
|
||||
```
|
||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
|
@ -524,34 +524,34 @@ Or, use CMake presets to build:
|
|||
|
||||
```sh
|
||||
cmake --preset x64-windows-sycl-release
|
||||
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
||||
cmake --build build-x64-windows-sycl-release -j --target jarvis-cli
|
||||
|
||||
cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
|
||||
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
||||
cmake --build build-x64-windows-sycl-release -j --target jarvis-cli
|
||||
|
||||
cmake --preset x64-windows-sycl-debug
|
||||
cmake --build build-x64-windows-sycl-debug -j --target llama-cli
|
||||
cmake --build build-x64-windows-sycl-debug -j --target jarvis-cli
|
||||
```
|
||||
|
||||
3. Visual Studio
|
||||
|
||||
You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
|
||||
You can use Visual Studio to open jarvis.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
|
||||
|
||||
*Notes:*
|
||||
|
||||
- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.
|
||||
- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target jarvis-cli`.
|
||||
|
||||
### III. Run the inference
|
||||
|
||||
#### Retrieve and prepare model
|
||||
|
||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as an example.
|
||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [jarvis-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Jarvis-2-7B-GGUF/blob/main/jarvis-2-7b.Q4_0.gguf) model as an example.
|
||||
|
||||
##### Check device
|
||||
|
||||
1. Enable oneAPI running environment
|
||||
|
||||
On the oneAPI command line window, run the following and step into the llama.cpp directory:
|
||||
On the oneAPI command line window, run the following and step into the jarvis.cpp directory:
|
||||
```
|
||||
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
|
||||
```
|
||||
|
@ -561,7 +561,7 @@ On the oneAPI command line window, run the following and step into the llama.cpp
|
|||
Similar to the native `sycl-ls`, available SYCL devices can be queried as follows:
|
||||
|
||||
```
|
||||
build\bin\llama-ls-sycl-device.exe
|
||||
build\bin\jarvis-ls-sycl-device.exe
|
||||
```
|
||||
|
||||
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *Intel GPUs* it would look like the following:
|
||||
|
@ -589,7 +589,7 @@ Choose one of following methods to run.
|
|||
1. Script
|
||||
|
||||
```
|
||||
examples\sycl\win-run-llama2.bat
|
||||
examples\sycl\win-run-jarvis2.bat
|
||||
```
|
||||
|
||||
2. Command line
|
||||
|
@ -613,13 +613,13 @@ Examples:
|
|||
- Use device 0:
|
||||
|
||||
```
|
||||
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
|
||||
build\bin\jarvis-cli.exe -m models\jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
|
||||
```
|
||||
|
||||
- Use multiple devices:
|
||||
|
||||
```
|
||||
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
|
||||
build\bin\jarvis-cli.exe -m models\jarvis-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
|
||||
```
|
||||
|
||||
|
||||
|
@ -682,13 +682,13 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
|||
```
|
||||
Otherwise, please double-check the GPU driver installation steps.
|
||||
|
||||
- Can I report an Ollama issue on Intel GPU to the llama.cpp SYCL backend?
|
||||
- Can I report an Ojarvis issue on Intel GPU to the jarvis.cpp SYCL backend?
|
||||
|
||||
No. We can't support Ollama issues directly, because we aren't familiar with Ollama.
|
||||
No. We can't support Ojarvis issues directly, because we aren't familiar with Ojarvis.
|
||||
|
||||
We suggest reproducing the problem on llama.cpp and reporting a similar issue to llama.cpp. We will support it.
|
||||
We suggest reproducing the problem on jarvis.cpp and reporting a similar issue to jarvis.cpp. We will support it.
|
||||
|
||||
The same applies to other projects, including the llama.cpp SYCL backend.
|
||||
The same applies to other projects, including the jarvis.cpp SYCL backend.
|
||||
|
||||
- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`
|
||||
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
# Build llama.cpp locally
|
||||
# Build jarvis.cpp locally
|
||||
|
||||
**To get the Code:**
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
git clone https://github.com/ggerganov/jarvis.cpp
|
||||
cd jarvis.cpp
|
||||
```
|
||||
|
||||
In order to build llama.cpp you have four different options.
|
||||
In order to build jarvis.cpp you have four different options.
|
||||
|
||||
- Using `make`:
|
||||
- On Linux or MacOS:
|
||||
|
@ -21,17 +21,17 @@ In order to build llama.cpp you have four different options.
|
|||
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
|
||||
2. Extract `w64devkit` on your pc.
|
||||
3. Run `w64devkit.exe`.
|
||||
4. Use the `cd` command to reach the `llama.cpp` folder.
|
||||
4. Use the `cd` command to reach the `jarvis.cpp` folder.
|
||||
5. From here you can run:
|
||||
```bash
|
||||
make
|
||||
```
|
||||
|
||||
- Notes:
|
||||
- For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
|
||||
- For `Q4_0_4_4` quantization type build, add the `GGML_NO_JARVISFILE=1` flag. For example, use `make GGML_NO_JARVISFILE=1`.
|
||||
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
|
||||
- For faster repeated compilation, install [ccache](https://ccache.dev/).
|
||||
- For debug builds, run `make LLAMA_DEBUG=1`
|
||||
- For debug builds, run `make JARVIS_DEBUG=1`
|
||||
|
||||
- Using `CMake`:
|
||||
|
||||
|
@ -42,7 +42,7 @@ In order to build llama.cpp you have four different options.
|
|||
|
||||
**Notes**:
|
||||
|
||||
- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
|
||||
- For `Q4_0_4_4` quantization type build, add the `-DGGML_JARVISFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_JARVISFILE=OFF`.
|
||||
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
|
||||
- For faster repeated compilation, install [ccache](https://ccache.dev/).
|
||||
- For debug builds, there are two cases:
|
||||
|
@ -118,7 +118,7 @@ This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS i
|
|||
4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
|
||||
5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
|
||||
6. Run `w64devkit.exe`.
|
||||
7. Use the `cd` command to reach the `llama.cpp` folder.
|
||||
7. Use the `cd` command to reach the `jarvis.cpp` folder.
|
||||
8. From here you can run:
|
||||
|
||||
```bash
|
||||
|
@ -140,13 +140,13 @@ Check [BLIS.md](./backend/BLIS.md) for more information.
|
|||
|
||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
|
||||
|
||||
llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
|
||||
jarvis.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
|
||||
|
||||
For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
|
||||
For detailed info, please refer to [jarvis.cpp for SYCL](./backend/SYCL.md).
|
||||
|
||||
### Intel oneMKL
|
||||
|
||||
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
|
||||
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [jarvis.cpp for SYCL](./backend/SYCL.md).
|
||||
|
||||
- Using manual oneAPI installation:
|
||||
By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you have already sourced the Intel environment script and pass `-DGGML_BLAS=ON` to cmake, the MKL version of BLAS will automatically be selected. Otherwise, please install oneAPI and follow the steps below:
|
||||
|
@ -159,7 +159,7 @@ Building through oneAPI compilers will make avx_vnni instruction set available f
|
|||
- Using oneAPI docker image:
|
||||
If you do not want to source the environment variables and install oneAPI manually, you can also build the code using the Intel docker container [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit), as sketched below. Then, you can use the commands given above.
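A rough sketch of that workflow, assuming `cmake` is available inside the container and the repository is mounted at `/app` (image tag and paths are illustrative):

```sh
# Illustrative: configure and build with BLAS enabled inside the oneAPI base-kit container
docker run -it --rm -v "$(pwd):/app" -w /app intel/oneapi-basekit:latest \
    bash -c "cmake -B build -DGGML_BLAS=ON && cmake --build build --config Release"
```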
|
||||
|
||||
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
|
||||
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-jarvis2-on-intel-cpu.html) for more information.
|
||||
|
||||
### CUDA
|
||||
|
||||
|
@ -300,7 +300,7 @@ Libs: -lvulkan-1
|
|||
EOF
|
||||
|
||||
```
|
||||
Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
|
||||
Switch into the `jarvis.cpp` directory and run `make GGML_VULKAN=1`.
|
||||
|
||||
#### MSYS2
|
||||
Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
|
||||
|
@ -311,7 +311,7 @@ Install [MSYS2](https://www.msys2.org/) and then run the following commands in a
|
|||
mingw-w64-ucrt-x86_64-vulkan-devel \
|
||||
mingw-w64-ucrt-x86_64-shaderc
|
||||
```
|
||||
Switch into `llama.cpp` directory and build using CMake.
|
||||
Switch into `jarvis.cpp` directory and build using CMake.
|
||||
```sh
|
||||
cmake -B build -DGGML_VULKAN=ON
|
||||
cmake --build build --config Release
|
||||
|
@ -323,10 +323,10 @@ You don't need to install Vulkan SDK. It will be installed inside the container.
|
|||
|
||||
```sh
|
||||
# Build the image
|
||||
docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
|
||||
docker build -t jarvis-cpp-vulkan -f .devops/jarvis-cli-vulkan.Dockerfile .
|
||||
|
||||
# Then, use it:
|
||||
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 jarvis-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
```
|
||||
|
||||
**Without docker**:
|
||||
|
@ -348,13 +348,13 @@ Alternatively your package manager might be able to provide the appropriate libr
|
|||
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
|
||||
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
|
||||
|
||||
Then, build llama.cpp using the cmake command below:
|
||||
Then, build jarvis.cpp using the cmake command below:
|
||||
|
||||
```bash
|
||||
cmake -B build -DGGML_VULKAN=1
|
||||
cmake --build build --config Release
|
||||
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
|
||||
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
||||
./bin/jarvis-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
||||
|
||||
# You should see in the output, ggml_vulkan detected your GPU. For example:
|
||||
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
|
||||
|
@ -367,7 +367,7 @@ For more information about Ascend NPU in [Ascend Community](https://www.hiascend
|
|||
|
||||
Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
|
||||
|
||||
Go to `llama.cpp` directory and build using CMake.
|
||||
Go to `jarvis.cpp` directory and build using CMake.
|
||||
```bash
|
||||
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
|
||||
cmake --build build --config release
|
||||
|
@ -375,15 +375,15 @@ cmake --build build --config release
|
|||
|
||||
You can test with:
|
||||
|
||||
`./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
|
||||
`./build/jarvis-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
|
||||
|
||||
If the following info is output on screen, you are using the `llama.cpp` CANN backend:
|
||||
If the following info is output on screen, you are using the `jarvis.cpp` CANN backend:
|
||||
```bash
|
||||
llm_load_tensors: CANN buffer size = 13313.00 MiB
|
||||
llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
|
||||
jarvis_new_context_with_model: CANN compute buffer size = 1260.81 MiB
|
||||
```
|
||||
|
||||
For detailed info, such as supported models/devices and CANN installation, please refer to [llama.cpp for CANN](./backend/CANN.md).
|
||||
For detailed info, such as supported models/devices and CANN installation, please refer to [jarvis.cpp for CANN](./backend/CANN.md).
|
||||
|
||||
### Android
|
||||
|
||||
|
@ -391,6 +391,6 @@ To read documentation for how to build on Android, [click here](./android.md)
|
|||
|
||||
### Arm CPU optimized mulmat kernels
|
||||
|
||||
Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
|
||||
Jarvis.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
|
||||
|
||||
To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
|
||||
To support `Q4_0_4_4`, you must build with `GGML_NO_JARVISFILE=1` (`make`) or `-DGGML_JARVISFILE=OFF` (`cmake`).
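Putting those two notes together, a sketch of such a build (flags taken verbatim from the text above; adjust the `-march` string to match your CPU):

```bash
# Arm-optimized build with the Q4_0_4_4 path enabled
cmake -B build -DGGML_JARVISFILE=OFF -DCMAKE_C_FLAGS="-march=armv8.2a+i8mm+sve"
cmake --build build --config Release
```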
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
# Add a new model architecture to `llama.cpp`
|
||||
# Add a new model architecture to `jarvis.cpp`
|
||||
|
||||
Adding a model requires a few steps:
|
||||
|
||||
1. Convert the model to GGUF
|
||||
2. Define the model architecture in `llama.cpp`
|
||||
2. Define the model architecture in `jarvis.cpp`
|
||||
3. Build the GGML graph implementation
|
||||
|
||||
After following these steps, you can open a PR.
|
||||
|
@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
|
|||
### 1. Convert the model to GGUF
|
||||
|
||||
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
|
||||
Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
|
||||
Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_jarvis.py](/examples/convert_legacy_jarvis.py) (for `jarvis/jarvis2` models in `.pth` format).
|
||||
|
||||
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
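As a rough illustration only (script flags can differ between versions), converting a Hugging Face model directory might look like:

```sh
# Hypothetical invocation: convert an HF model directory to a GGUF file
python convert_hf_to_gguf.py path/to/hf-model --outfile path/to/model.gguf
```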
|
||||
|
||||
|
@ -81,26 +81,26 @@ Depending on the model configuration, tokenizer, code and tensors layout, you wi
|
|||
|
||||
NOTE: Tensor names must end with the `.weight` suffix; that is the convention, and several tools like `quantize` expect this to precede the weights.
|
||||
|
||||
### 2. Define the model architecture in `llama.cpp`
|
||||
### 2. Define the model architecture in `jarvis.cpp`
|
||||
|
||||
The model params and tensors layout must be defined in `llama.cpp`:
|
||||
The model params and tensors layout must be defined in `jarvis.cpp`:
|
||||
1. Define a new `llm_arch`
|
||||
2. Define the tensors layout in `LLM_TENSOR_NAMES`
|
||||
3. Add any non standard metadata in `llm_load_hparams`
|
||||
4. Create the tensors for inference in `llm_load_tensors`
|
||||
5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
|
||||
5. If the model has a RoPE operation, add the rope type in `jarvis_rope_type`
|
||||
|
||||
NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
|
||||
|
||||
### 3. Build the GGML graph implementation
|
||||
|
||||
This is the most fun part: you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
|
||||
This is the most fun part: you have to provide the inference graph implementation of the new model architecture in `jarvis_build_graph`.
|
||||
|
||||
Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
|
||||
Have a look at existing implementations like `build_jarvis`, `build_dbrx` or `build_bert`.
|
||||
|
||||
When implementing a new graph, please note that the underlying `ggml` backends might not support all of its operations; support for missing backend operations can be added in another PR.
|
||||
|
||||
Note: to debug the inference graph, you can use [llama-eval-callback](/examples/eval-callback/).
|
||||
Note: to debug the inference graph, you can use [jarvis-eval-callback](/examples/eval-callback/).
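A minimal sketch, assuming the tool accepts the common `-m`/`-p`/`-n` options used by the other examples:

```sh
# Assumed common CLI options; prints intermediate tensor data while evaluating the prompt
./build/bin/jarvis-eval-callback -m path_to_model -p "Hello" -n 16
```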
|
||||
|
||||
## GGUF specification
|
||||
|
||||
|
@ -108,12 +108,12 @@ https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
|
|||
|
||||
## Resources
|
||||
|
||||
- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
|
||||
- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009
|
||||
- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
|
||||
- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
|
||||
- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
|
||||
- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
|
||||
- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
|
||||
- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
|
||||
- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948
|
||||
- YaRN RoPE scaling https://github.com/ggerganov/jarvis.cpp/pull/2268
|
||||
- support Baichuan serial models https://github.com/ggerganov/jarvis.cpp/pull/3009
|
||||
- support attention bias https://github.com/ggerganov/jarvis.cpp/pull/4283
|
||||
- Mixtral support https://github.com/ggerganov/jarvis.cpp/pull/4406
|
||||
- BERT embeddings https://github.com/ggerganov/jarvis.cpp/pull/5423
|
||||
- Grok-1 support https://github.com/ggerganov/jarvis.cpp/pull/6204
|
||||
- Command R Plus support https://github.com/ggerganov/jarvis.cpp/pull/6491
|
||||
- support arch DBRX https://github.com/ggerganov/jarvis.cpp/pull/6515
|
||||
- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/jarvis.cpp/discussions/2948
|
||||
|
|
|
@ -51,7 +51,7 @@ rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
|
|||
Set up and trigger a build in debug mode. You may adapt the arguments as needed, but these are sane defaults.
|
||||
|
||||
```bash
|
||||
cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
|
||||
cmake -DCMAKE_BUILD_TYPE=Debug -DJARVIS_CUDA=1 -DJARVIS_FATAL_WARNINGS=ON ..
|
||||
make -j
|
||||
```
|
||||
|
||||
|
@ -71,12 +71,12 @@ This may return output similar to below (focusing on key lines to pay attention
|
|||
|
||||
```bash
|
||||
...
|
||||
1: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
|
||||
1: Test command: ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf"
|
||||
1: Working Directory: .
|
||||
Labels: main
|
||||
Test #1: test-tokenizer-0-llama-spm
|
||||
Test #1: test-tokenizer-0-jarvis-spm
|
||||
...
|
||||
4: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-falcon.gguf"
|
||||
4: Test command: ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-falcon.gguf"
|
||||
4: Working Directory: .
|
||||
Labels: main
|
||||
Test #4: test-tokenizer-0-falcon
|
||||
|
@ -86,8 +86,8 @@ Labels: main
|
|||
#### Step 4: Identify Test Command for Debugging
|
||||
|
||||
So for test #1 above, we can identify these two pieces of relevant information:
|
||||
* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
|
||||
* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`
|
||||
* Test Binary: `~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0`
|
||||
* Test GGUF Model: `~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf`
|
||||
|
||||
#### Step 5: Run GDB on test command
|
||||
|
||||
|
@ -100,5 +100,5 @@ gdb --args ${Test Binary} ${Test GGUF Model}
|
|||
Example:
|
||||
|
||||
```bash
|
||||
gdb --args ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
|
||||
gdb --args ~/jarvis.cpp/build-ci-debug/bin/test-tokenizer-0 "~/jarvis.cpp/tests/../models/ggml-vocab-jarvis-spm.gguf"
|
||||
```
|
||||
|
|
|
@ -1,23 +1,23 @@
|
|||
# Token generation performance troubleshooting
|
||||
|
||||
## Verifying that the model is running on the GPU with CUDA
|
||||
Make sure you compiled llama with the correct env variables according to [this guide](/docs/build.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
|
||||
Make sure you compiled jarvis with the correct env variables according to [this guide](/docs/build.md#cuda), so that jarvis accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running jarvis, you may configure `N` to be very large, and jarvis will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
|
||||
```shell
|
||||
./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
|
||||
./jarvis-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
|
||||
```
|
||||
|
||||
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
|
||||
When running jarvis, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
|
||||
```shell
|
||||
llama_model_load_internal: [cublas] offloading 60 layers to GPU
|
||||
llama_model_load_internal: [cublas] offloading output layer to GPU
|
||||
llama_model_load_internal: [cublas] total VRAM used: 17223 MB
|
||||
jarvis_model_load_internal: [cublas] offloading 60 layers to GPU
|
||||
jarvis_model_load_internal: [cublas] offloading output layer to GPU
|
||||
jarvis_model_load_internal: [cublas] total VRAM used: 17223 MB
|
||||
... rest of inference
|
||||
```
|
||||
|
||||
If you see these lines, then the GPU is being used.
|
||||
|
||||
## Verifying that the CPU is not oversaturated
|
||||
llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
|
||||
jarvis accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
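For example, a quick way to probe this (model path and thread counts are illustrative):

```shell
# Compare tokens/s at increasing thread counts; stop increasing once it no longer helps
./jarvis-cli -m "path/to/model.gguf" -p "Hello" -n 64 -t 1
./jarvis-cli -m "path/to/model.gguf" -p "Hello" -n 64 -t 2
./jarvis-cli -m "path/to/model.gguf" -p "Hello" -n 64 -t 4
```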
|
||||
|
||||
# Example of runtime flags effect on inference speed benchmark
|
||||
These runs were tested on the following machine:
|
||||
|
@ -27,7 +27,7 @@ RAM: 32GB
|
|||
|
||||
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
|
||||
|
||||
Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
|
||||
Run command: `./jarvis-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
|
||||
|
||||
Result:
|
||||
|
||||
|
|
|
@ -2,26 +2,26 @@
|
|||
|
||||
## Prerequisites
|
||||
* Docker must be installed and running on your system.
|
||||
* Create a folder to store big models & intermediate files (e.g. /llama/models)
|
||||
* Create a folder to store big models & intermediate files (e.g. /jarvis/models)
|
||||
|
||||
## Images
|
||||
We have three Docker images available for this project:
|
||||
|
||||
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
1. `ghcr.io/ggerganov/jarvis.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
2. `ghcr.io/ggerganov/jarvis.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
3. `ghcr.io/ggerganov/jarvis.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
|
||||
Additionally, there are the following images, similar to the above:
|
||||
|
||||
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/jarvis.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/jarvis.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/jarvis.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/jarvis.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/jarvis.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/jarvis.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/jarvis.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/jarvis.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/jarvis.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
|
||||
The GPU-enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library), you'll need to build the images locally for now.
|
||||
|
||||
|
@ -32,25 +32,25 @@ The easiest way to download the models, convert them to ggml and optimize them i
|
|||
Replace `/path/to/models` below with the actual path where you downloaded the models.
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/jarvis.cpp:full --all-in-one "/models/" 7B
|
||||
```
|
||||
|
||||
On completion, you are ready to play!
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/jarvis.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
```
|
||||
|
||||
or with a light image:
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/jarvis.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
```
|
||||
|
||||
or with a server image:
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
|
||||
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/jarvis.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
|
||||
```
|
||||
|
||||
## Docker With CUDA
|
||||
|
@ -60,9 +60,9 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
|
|||
## Building Docker locally
|
||||
|
||||
```bash
|
||||
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
|
||||
docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
|
||||
docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
|
||||
docker build -t local/jarvis.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
|
||||
docker build -t local/jarvis.cpp:light-cuda -f .devops/jarvis-cli-cuda.Dockerfile .
|
||||
docker build -t local/jarvis.cpp:server-cuda -f .devops/jarvis-server-cuda.Dockerfile .
|
||||
```
|
||||
|
||||
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
|
||||
|
@ -74,18 +74,18 @@ The defaults are:
|
|||
|
||||
The resulting images are essentially the same as the non-CUDA images:
|
||||
|
||||
1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
|
||||
2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
|
||||
3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
|
||||
1. `local/jarvis.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
|
||||
2. `local/jarvis.cpp:light-cuda`: This image only includes the main executable file.
|
||||
3. `local/jarvis.cpp:server-cuda`: This image only includes the server executable file.
|
||||
|
||||
## Usage
|
||||
|
||||
After building locally, usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
|
||||
|
||||
```bash
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/jarvis.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/jarvis.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/jarvis.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
||||
```
|
||||
|
||||
## Docker With MUSA
|
||||
|
@ -95,9 +95,9 @@ Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/
|
|||
## Building Docker locally
|
||||
|
||||
```bash
|
||||
docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile .
|
||||
docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile .
|
||||
docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile .
|
||||
docker build -t local/jarvis.cpp:full-musa -f .devops/full-musa.Dockerfile .
|
||||
docker build -t local/jarvis.cpp:light-musa -f .devops/jarvis-cli-musa.Dockerfile .
|
||||
docker build -t local/jarvis.cpp:server-musa -f .devops/jarvis-server-musa.Dockerfile .
|
||||
```
|
||||
|
||||
You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture.
|
||||
|
@ -108,16 +108,16 @@ The defaults are:
|
|||
|
||||
The resulting images are essentially the same as the non-MUSA images:
|
||||
|
||||
1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
|
||||
2. `local/llama.cpp:light-musa`: This image only includes the main executable file.
|
||||
3. `local/llama.cpp:server-musa`: This image only includes the server executable file.
|
||||
1. `local/jarvis.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
|
||||
2. `local/jarvis.cpp:light-musa`: This image only includes the main executable file.
|
||||
3. `local/jarvis.cpp:server-musa`: This image only includes the server executable file.
|
||||
|
||||
## Usage
|
||||
|
||||
After building locally, usage is similar to the non-MUSA examples, but you'll need to set `mthreads` as the default Docker runtime. This can be done by executing `(cd /usr/bin/musa && sudo ./docker setup $PWD)` and verifying the change with `docker info | grep mthreads` on the host machine. You will also want to use the `--n-gpu-layers` flag.
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
||||
docker run -v /path/to/models:/models local/jarvis.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run -v /path/to/models:/models local/jarvis.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run -v /path/to/models:/models local/jarvis.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
||||
```
|
||||
|
|
|
@ -1,39 +1,39 @@
|
|||
# Install pre-built version of llama.cpp
|
||||
# Install pre-built version of jarvis.cpp
|
||||
|
||||
## Homebrew
|
||||
|
||||
On Mac and Linux, the homebrew package manager can be used via
|
||||
|
||||
```sh
|
||||
brew install llama.cpp
|
||||
brew install jarvis.cpp
|
||||
```
|
||||
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
|
||||
The formula is automatically updated with new `jarvis.cpp` releases. More info: https://github.com/ggerganov/jarvis.cpp/discussions/7668
|
||||
|
||||
## Nix
|
||||
|
||||
On Mac and Linux, the Nix package manager can be used via
|
||||
|
||||
```sh
|
||||
nix profile install nixpkgs#llama-cpp
|
||||
nix profile install nixpkgs#jarvis-cpp
|
||||
```
|
||||
For flake enabled installs.
|
||||
|
||||
Or
|
||||
|
||||
```sh
|
||||
nix-env --file '<nixpkgs>' --install --attr llama-cpp
|
||||
nix-env --file '<nixpkgs>' --install --attr jarvis-cpp
|
||||
```
|
||||
|
||||
For non-flake enabled installs.
|
||||
|
||||
This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
|
||||
This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/jarvis-cpp/package.nix#L164).
|
||||
|
||||
## Flox
|
||||
|
||||
On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
|
||||
On Mac and Linux, Flox can be used to install jarvis.cpp within a Flox environment via
|
||||
|
||||
```sh
|
||||
flox install llama-cpp
|
||||
flox install jarvis-cpp
|
||||
```
|
||||
|
||||
Flox follows the nixpkgs build of llama.cpp.
|
||||
Flox follows the nixpkgs build of jarvis.cpp.
|
||||
|
|
|
@ -13,10 +13,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
|||
if (EMSCRIPTEN)
|
||||
else()
|
||||
add_subdirectory(cvector-generator)
|
||||
add_subdirectory(baby-llama)
|
||||
add_subdirectory(baby-jarvis)
|
||||
add_subdirectory(batched-bench)
|
||||
add_subdirectory(batched)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(convert-jarvis2c-to-ggml)
|
||||
add_subdirectory(embedding)
|
||||
add_subdirectory(eval-callback)
|
||||
add_subdirectory(export-lora)
|
||||
|
@ -27,7 +27,7 @@ else()
|
|||
add_subdirectory(gritlm)
|
||||
add_subdirectory(imatrix)
|
||||
add_subdirectory(infill)
|
||||
add_subdirectory(llama-bench)
|
||||
add_subdirectory(jarvis-bench)
|
||||
add_subdirectory(llava)
|
||||
add_subdirectory(lookahead)
|
||||
add_subdirectory(lookup)
|
||||
|
@ -41,7 +41,7 @@ else()
|
|||
if (GGML_RPC)
|
||||
add_subdirectory(rpc)
|
||||
endif()
|
||||
if (LLAMA_BUILD_SERVER)
|
||||
if (JARVIS_BUILD_SERVER)
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
if (GGML_SYCL)
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
set -e
|
||||
|
||||
AI_NAME="${AI_NAME:-Miku}"
|
||||
MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
|
||||
MODEL="${MODEL:-./models/jarvis-2-7b-chat.ggmlv3.q4_K_M.bin}"
|
||||
USER_NAME="${USER_NAME:-Anon}"
|
||||
|
||||
# Uncomment and adjust to the number of CPU cores you want to use.
|
||||
|
@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
|
|||
GEN_OPTIONS+=(--threads "$N_THREAD")
|
||||
fi
|
||||
|
||||
./llama-cli "${GEN_OPTIONS[@]}" \
|
||||
./jarvis-cli "${GEN_OPTIONS[@]}" \
|
||||
--model "$MODEL" \
|
||||
--in-prefix " " \
|
||||
--in-suffix "${AI_NAME}:" \
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
set(TARGET llama-baby-llama)
|
||||
add_executable(${TARGET} baby-llama.cpp)
|
||||
set(TARGET jarvis-baby-jarvis)
|
||||
add_executable(${TARGET} baby-jarvis.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
|
|
@ -11,8 +11,8 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#ifdef LLAMA_DEFAULT_RMS_EPS
constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
#ifdef JARVIS_DEFAULT_RMS_EPS
constexpr float rms_norm_eps = JARVIS_DEFAULT_RMS_EPS;
#else
constexpr float rms_norm_eps = 5e-6f;
#endif

@ -71,7 +71,7 @@ static struct ggml_tensor * randomize_tensor(
return tensor;
}

struct llama_hparams {
struct jarvis_hparams {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input?
uint32_t n_embd = 4096;

@ -80,17 +80,17 @@ struct llama_hparams {
uint32_t n_layer = 32;
uint32_t n_rot = 64;

bool operator!=(const llama_hparams & other) const {
return memcmp(this, &other, sizeof(llama_hparams));
bool operator!=(const jarvis_hparams & other) const {
return memcmp(this, &other, sizeof(jarvis_hparams));
}
};

static uint32_t get_n_ff(const struct llama_hparams* hparams) {
static uint32_t get_n_ff(const struct jarvis_hparams* hparams) {
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
return n_ff;
}

struct llama_hparams_lora {
struct jarvis_hparams_lora {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input?
uint32_t n_embd = 4096;

@ -100,12 +100,12 @@ struct llama_hparams_lora {
uint32_t n_rot = 64;
uint32_t n_lora = 64;

bool operator!=(const llama_hparams_lora & other) const {
return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0;
bool operator!=(const jarvis_hparams_lora & other) const {
return memcmp(this, &other, sizeof(jarvis_hparams_lora)) != 0;
}
};

struct llama_layer {
struct jarvis_layer {
// normalization
struct ggml_tensor * attention_norm;

@ -124,7 +124,7 @@ struct llama_layer {
struct ggml_tensor * w3;
};

struct llama_layer_lora {
struct jarvis_layer_lora {
// normalization
struct ggml_tensor * attention_norm;

@ -148,34 +148,34 @@ struct llama_layer_lora {
};


struct llama_kv_cache {
struct jarvis_kv_cache {
struct ggml_context * ctx = NULL;

struct ggml_tensor * k;
struct ggml_tensor * v;

// llama_ctx_buffer buf;
// jarvis_ctx_buffer buf;

int n; // number of tokens currently in the cache
};

struct llama_model {
struct jarvis_model {
struct ggml_context * ctx = NULL;

llama_hparams hparams;
jarvis_hparams hparams;

struct ggml_tensor * tok_embeddings;

struct ggml_tensor * norm;
struct ggml_tensor * output;

std::vector<llama_layer> layers;
std::vector<jarvis_layer> layers;
};

struct llama_model_lora {
struct jarvis_model_lora {
struct ggml_context * ctx = NULL;

llama_hparams_lora hparams;
jarvis_hparams_lora hparams;

struct ggml_tensor * tok_embeddings;

@ -183,10 +183,10 @@ struct llama_model_lora {
struct ggml_tensor * outputa;
struct ggml_tensor * outputb;

std::vector<llama_layer_lora> layers;
std::vector<jarvis_layer_lora> layers;
};

static void init_model(struct llama_model * model) {
static void init_model(struct jarvis_model * model) {
const auto & hparams = model->hparams;

const uint32_t n_embd = hparams.n_embd;

@ -223,7 +223,7 @@ static void init_model(struct llama_model * model) {
}


static void init_model_lora(struct llama_model_lora * model) {
static void init_model_lora(struct jarvis_model_lora * model) {
const auto & hparams = model->hparams;

const uint32_t n_embd = hparams.n_embd;

@ -266,7 +266,7 @@ static void init_model_lora(struct llama_model_lora * model) {
}
}

static void set_param_model(struct llama_model * model) {
static void set_param_model(struct jarvis_model * model) {
const auto& hparams = model->hparams;

const uint32_t n_layer = hparams.n_layer;

@ -292,7 +292,7 @@ static void set_param_model(struct llama_model * model) {
}
}

static void set_param_model_lora(struct llama_model_lora * model) {
static void set_param_model_lora(struct jarvis_model_lora * model) {
const auto& hparams = model->hparams;

const uint32_t n_layer = hparams.n_layer;

@ -323,7 +323,7 @@ static void set_param_model_lora(struct llama_model_lora * model) {
}
}

static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
static void randomize_model(struct jarvis_model * model, int seed, float mean, float std, float min, float max) {
const auto & hparams = model->hparams;

const uint32_t n_layer = hparams.n_layer;

@ -355,7 +355,7 @@ static void randomize_model(struct llama_model * model, int seed, float mean, fl


static void randomize_model_lora(
struct llama_model_lora * model, int seed, float mean, float std, float min, float max
struct jarvis_model_lora * model, int seed, float mean, float std, float min, float max
) {
const auto & hparams = model->hparams;

@ -391,7 +391,7 @@ static void randomize_model_lora(
free_random_normal_distribution(rnd);
}

static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
static void init_kv_cache(struct jarvis_kv_cache* cache, struct jarvis_model * model, int n_batch) {
const auto & hparams = model->hparams;

const uint32_t n_ctx = hparams.n_ctx;

@ -425,7 +425,7 @@ static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * mod
cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
}

static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
static bool init_kv_cache_lora(struct jarvis_kv_cache* cache, struct jarvis_model_lora * model, int n_batch) {
const auto & hparams = model->hparams;

const uint32_t n_ctx = hparams.n_ctx;

@ -462,8 +462,8 @@ static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_
}

static struct ggml_tensor * forward(
struct llama_model * model,
struct llama_kv_cache * cache,
struct jarvis_model * model,
struct jarvis_kv_cache * cache,
struct ggml_context * ctx0,
struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input,

@ -472,7 +472,7 @@ static struct ggml_tensor * forward(
) {
const int N = n_tokens;

struct llama_kv_cache& kv_self = *cache;
struct jarvis_kv_cache& kv_self = *cache;
const auto & hparams = model->hparams;
const int n_ctx = hparams.n_ctx;
const int n_embd = hparams.n_embd;

@ -692,8 +692,8 @@ static struct ggml_tensor * forward(
}

static struct ggml_tensor * forward_batch(
struct llama_model * model,
struct llama_kv_cache * cache,
struct jarvis_model * model,
struct jarvis_kv_cache * cache,
struct ggml_context * ctx0,
struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input,

@ -703,7 +703,7 @@ static struct ggml_tensor * forward_batch(
) {
const int N = n_tokens;

struct llama_kv_cache& kv_self = *cache;
struct jarvis_kv_cache& kv_self = *cache;
const auto & hparams = model->hparams;
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;

@ -989,8 +989,8 @@ static struct ggml_tensor * forward_batch(
}

static struct ggml_tensor * forward_lora(
struct llama_model_lora * model,
struct llama_kv_cache * cache,
struct jarvis_model_lora * model,
struct jarvis_kv_cache * cache,
struct ggml_context * ctx0,
struct ggml_cgraph * gf,
struct ggml_tensor * tokens_input,

@ -999,7 +999,7 @@ static struct ggml_tensor * forward_lora(
) {
const int N = n_tokens;

struct llama_kv_cache& kv_self = *cache;
struct jarvis_kv_cache& kv_self = *cache;
const auto & hparams = model->hparams;

const int n_ctx = hparams.n_ctx;

@ -1444,7 +1444,7 @@ int main(int argc, char ** argv) {
lcparams.mem_buffer = NULL;
lcparams.no_alloc = false;

struct llama_model model;
struct jarvis_model model;
model.hparams.n_vocab = 8;
model.hparams.n_ctx = 8;
model.hparams.n_embd = 32;

@ -1467,7 +1467,7 @@ int main(int argc, char ** argv) {
randomize_model(&model, 1337, 0.0f, 1.0f, -1.0f, +1.0f);

/*
struct llama_model_lora model_lora;
struct jarvis_model_lora model_lora;
// model.hparams.n_vocab = 6;
// model.hparams.n_ctx = 64;
// model.hparams.n_embd = 128;

@ -1501,7 +1501,7 @@ int main(int argc, char ** argv) {
*/
int n_batch = 8;
// key + value cache for the self attention
struct llama_kv_cache kv_self;
struct jarvis_kv_cache kv_self;
printf("init_kv_cache\n");
kv_self.ctx = model.ctx;
init_kv_cache(&kv_self, &model, n_batch);

@ -1533,7 +1533,7 @@ int main(int argc, char ** argv) {
int n_past = 0;

struct ggml_cgraph * gf = NULL;
gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
gf = ggml_new_graph_custom(ctx0, JARVIS_TRAIN_MAX_NODES, true);

get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);

@ -1601,7 +1601,7 @@ int main(int argc, char ** argv) {
struct ggml_context * ctx0 = ggml_init(params);

struct ggml_cgraph * gf = NULL;
gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
gf = ggml_new_graph_custom(ctx0, JARVIS_TRAIN_MAX_NODES, true);

int n_past = 0;
struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);

@ -5,7 +5,7 @@
#
# Usage:
#
# cd llama.cpp
# cd jarvis.cpp
# make -j
#
# ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]

@ -21,7 +21,7 @@ if [ $# -gt 2 ]; then
eargs="${@:3}"
fi

ftmp="__llama.cpp_example_tmp__.txt"
ftmp="__jarvis.cpp_example_tmp__.txt"
trap "rm -f $ftmp" EXIT

echo "Translate from English to French:

@ -58,4 +58,4 @@ echo "$2
model=$1

# generate the most likely continuation until the string "===" is found
./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
./jarvis-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs

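For clarity, a hypothetical invocation of the translation helper shown in the hunk above, following its own usage comment; the model path and prompt text are made-up values, not part of this diff.

```sh
cd jarvis.cpp
make -j
./examples/base-translate.sh ./models/model-base.gguf "How are you today?"   # hypothetical model path
```
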
@ -1,5 +1,5 @@
set(TARGET llama-batched-bench)
set(TARGET jarvis-batched-bench)
add_executable(${TARGET} batched-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common jarvis ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

@ -1,6 +1,6 @@
# llama.cpp/example/batched-bench
# jarvis.cpp/example/batched-bench

Benchmark the batched decoding performance of `llama.cpp`
Benchmark the batched decoding performance of `jarvis.cpp`

## Usage

@ -10,16 +10,16 @@ There are 2 modes of operation:
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)

```bash
./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
./jarvis-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]

# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99

# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps

# custom set of batches
./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
./jarvis-batched-bench -m ./models/jarvis-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
```

## Sample results

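As a quick check of the shared-prompt bound `N_KV = PP + B*TG` quoted in the README hunk above, a small worked example; the PP/B/TG values are illustrative and not taken from this diff.

```sh
PP=512; B=32; TG=128              # shared prompt of 512 tokens, 32 parallel sequences, 128 generated tokens each
echo "N_KV = $(( PP + B * TG ))"  # prints N_KV = 4608; this total must fit within the -c context size
```
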
@ -1,7 +1,7 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "jarvis.h"

#include <algorithm>
#include <cstdio>

@ -17,7 +17,7 @@ static void print_usage(int, char ** argv) {
int main(int argc, char ** argv) {
common_params params;

if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
if (!common_params_parse(argc, argv, params, JARVIS_EXAMPLE_BENCH, print_usage)) {
return 1;
}

@ -31,42 +31,42 @@ int main(int argc, char ** argv) {

// init LLM

llama_backend_init();
llama_numa_init(params.numa);
jarvis_backend_init();
jarvis_numa_init(params.numa);

// initialize the model

llama_model_params model_params = common_model_params_to_llama(params);
jarvis_model_params model_params = common_model_params_to_jarvis(params);

llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
jarvis_model * model = jarvis_load_model_from_file(params.model.c_str(), model_params);

if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1;
}

llama_context_params ctx_params = common_context_params_to_llama(params);
jarvis_context_params ctx_params = common_context_params_to_jarvis(params);

// ensure enough sequences are available
ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());

llama_context * ctx = llama_new_context_with_model(model, ctx_params);
jarvis_context * ctx = jarvis_new_context_with_model(model, ctx_params);

if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
fprintf(stderr , "%s: error: failed to create the jarvis_context\n" , __func__);
return 1;
}

const int32_t n_kv_max = llama_n_ctx(ctx);
const int32_t n_kv_max = jarvis_n_ctx(ctx);

llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
jarvis_batch batch = jarvis_batch_init(n_kv_max, 0, 1);

// decode in batches of ctx_params.n_batch tokens
auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
auto decode_helper = [](jarvis_context * ctx, jarvis_batch & batch, int32_t n_batch) {
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

llama_batch batch_view = {
jarvis_batch batch_view = {
n_tokens,
batch.token + i,
nullptr,

@ -76,13 +76,13 @@ int main(int argc, char ** argv) {
batch.logits + i,
};

const int ret = llama_decode(ctx, batch_view);
const int ret = jarvis_decode(ctx, batch_view);
if (ret != 0) {
LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false;
}

llama_synchronize(ctx);
jarvis_synchronize(ctx);
}

return true;

@ -95,7 +95,7 @@ int main(int argc, char ** argv) {
}

if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: jarvis_decode() failed\n", __func__);
return 1;
}
}

@ -132,16 +132,16 @@ int main(int argc, char ** argv) {

const auto t_pp_start = ggml_time_us();

llama_kv_cache_clear(ctx);
jarvis_kv_cache_clear(ctx);

if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: jarvis_decode() failed\n", __func__);
return 1;
}

if (is_pp_shared) {
for (int32_t i = 1; i < pl; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
jarvis_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
}

@ -157,7 +157,7 @@ int main(int argc, char ** argv) {
}

if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
LOG_ERR("%s: jarvis_decode() failed\n", __func__);
return 1;
}
}

@ -189,14 +189,14 @@ int main(int argc, char ** argv) {
}

LOG("\n");
llama_perf_context_print(ctx);
jarvis_perf_context_print(ctx);

llama_batch_free(batch);
jarvis_batch_free(batch);

llama_free(ctx);
llama_free_model(model);
jarvis_free(ctx);
jarvis_free_model(model);

llama_backend_free();
jarvis_backend_free();

LOG("\n\n");

@ -1,6 +1,6 @@
.PHONY: build

build:
xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
rm -f ./llama-batched-swift
ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift
xcodebuild -scheme jarvis-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
rm -f ./jarvis-batched-swift
ln -s ./build/Build/Products/Debug/jarvis-batched-swift ./jarvis-batched-swift

@ -4,17 +4,17 @@
import PackageDescription

let package = Package(
name: "llama-batched-swift",
name: "jarvis-batched-swift",
platforms: [.macOS(.v12)],
dependencies: [
.package(name: "llama", path: "../../"),
.package(name: "jarvis", path: "../../"),
],
targets: [
// Targets are the basic building blocks of a package, defining a module or a test suite.
// Targets can depend on other targets in this package and products from dependencies.
.executableTarget(
name: "llama-batched-swift",
dependencies: ["llama"],
name: "jarvis-batched-swift",
dependencies: ["jarvis"],
path: "Sources",
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
),

Some files were not shown because too many files have changed in this diff.