Merge branch 'ggerganov:master' into bitnet
commit abcdc5033a
259 changed files with 43256 additions and 36986 deletions
@@ -15,7 +15,7 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
 stage('Running llama.cpp'){
 sh'''#!/bin/bash
 module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
-qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
+qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
 cat llama_log.txt # Printing results
 '''
 }

@@ -23,13 +23,13 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
 ENV LLAMA_CUDA=1

-RUN make -j$(nproc) main
+RUN make -j$(nproc) llama-cli

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

 RUN apt-get update && \
     apt-get install -y libgomp1

-COPY --from=build /app/main /main
+COPY --from=build /app/llama-cli /llama-cli

-ENTRYPOINT [ "/main" ]
+ENTRYPOINT [ "/llama-cli" ]
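
Note: with this rename the light images start /llama-cli instead of /main, so existing docker run command lines only change in the entrypoint name. A minimal usage sketch, assuming the published light-cuda tag and a locally mounted model (both are placeholders, not taken from this commit):

# Hypothetical invocation of the rebuilt light CUDA image; tag, model path and GPU flags are placeholders
docker run --gpus all -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light-cuda -m /models/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9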

.devops/llama-cli-intel.Dockerfile (new file, 26 lines)
@@ -0,0 +1,26 @@
+ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+
+ARG LLAMA_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+    echo "LLAMA_SYCL_F16 is set" && \
+    export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
+    fi && \
+    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    cmake --build build --config Release --target llama-cli
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/llama-cli" ]
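
Note: the new SYCL image can be built straight from this Dockerfile; a sketch with the build argument defined above (the image name is a placeholder):

# Build the light Intel/SYCL image from the new Dockerfile; the -t name is a placeholder
docker build -f .devops/llama-cli-intel.Dockerfile --build-arg LLAMA_SYCL_F16=ON -t llama-cpp-sycl-light .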

@@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

-RUN make -j$(nproc) main
+RUN make -j$(nproc) llama-cli

-ENTRYPOINT [ "/app/main" ]
+ENTRYPOINT [ "/app/llama-cli" ]

@@ -15,13 +15,13 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 WORKDIR /app
 COPY . .
 RUN cmake -B build -DLLAMA_VULKAN=1 && \
-    cmake --build build --config Release --target main
+    cmake --build build --config Release --target llama-cli

 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/main /main && \
+RUN cp /app/build/bin/llama-cli /llama-cli && \
     rm -rf /app

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/main" ]
+ENTRYPOINT [ "/llama-cli" ]

@@ -9,15 +9,15 @@ WORKDIR /app

 COPY . .

-RUN make -j$(nproc) main
+RUN make -j$(nproc) llama-cli

 FROM ubuntu:$UBUNTU_VERSION as runtime

 RUN apt-get update && \
     apt-get install -y libgomp1

-COPY --from=build /app/main /main
+COPY --from=build /app/llama-cli /llama-cli

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/main" ]
+ENTRYPOINT [ "/llama-cli" ]

@@ -36,9 +36,9 @@ make -j LLAMA_CLBLAST=1

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamaclblast
-cp -p server %{buildroot}%{_bindir}/llamaclblastserver
-cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
+cp -p llama-cli %{buildroot}%{_bindir}/llama-clblast-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-clblast-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-clblast-simple

 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service

@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
+ExecStart=/usr/bin/llama-clblast-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -67,9 +67,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llamaclblast
-%{_bindir}/llamaclblastserver
-%{_bindir}/llamaclblastsimple
+%{_bindir}/llama-clblast-cli
+%{_bindir}/llama-clblast-server
+%{_bindir}/llama-clblast-simple
 /usr/lib/systemd/system/llamaclblast.service
 %config /etc/sysconfig/llama

@@ -36,9 +36,9 @@ make -j LLAMA_CUDA=1

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamacppcuda
-cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
-cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple
+cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple

 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service

@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
+ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -67,9 +67,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llamacppcuda
-%{_bindir}/llamacppcudaserver
-%{_bindir}/llamacppcudasimple
+%{_bindir}/llama-cuda-cli
+%{_bindir}/llama-cuda-server
+%{_bindir}/llama-cuda-simple
 /usr/lib/systemd/system/llamacuda.service
 %config /etc/sysconfig/llama

@@ -38,9 +38,9 @@ make -j

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llama
-cp -p server %{buildroot}%{_bindir}/llamaserver
-cp -p simple %{buildroot}%{_bindir}/llamasimple
+cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-simple

 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service

@@ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
+ExecStart=/usr/bin/llama-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -69,9 +69,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llama
-%{_bindir}/llamaserver
-%{_bindir}/llamasimple
+%{_bindir}/llama-cli
+%{_bindir}/llama-server
+%{_bindir}/llama-simple
 /usr/lib/systemd/system/llama.service
 %config /etc/sysconfig/llama
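
Note: the installed systemd unit reads /etc/sysconfig/llama and passes $LLAMA_ARGS to the renamed llama-server binary. A hedged sketch of that environment file (model path and port value are placeholders, not shipped by the spec):

# /etc/sysconfig/llama (sketch; values are placeholders)
LLAMA_ARGS="-m /opt/models/model.gguf --port 8080"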

@@ -25,13 +25,13 @@ ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1

-RUN make -j$(nproc) server
+RUN make -j$(nproc) llama-server

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1

-COPY --from=build /app/server /server
+COPY --from=build /app/llama-server /llama-server

-ENTRYPOINT [ "/server" ]
+ENTRYPOINT [ "/llama-server" ]

.devops/llama-server-intel.Dockerfile (new file, 29 lines)
@@ -0,0 +1,29 @@
+ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+
+ARG LLAMA_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+    echo "LLAMA_SYCL_F16 is set" && \
+    export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
+    fi && \
+    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake --build build --config Release --target llama-server
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/llama-server" ]

@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev

-RUN make -j$(nproc)
+RUN make -j$(nproc) llama-server

-ENTRYPOINT [ "/app/server" ]
+ENTRYPOINT [ "/app/llama-server" ]

@@ -19,13 +19,13 @@ RUN apt-get update && \
 WORKDIR /app
 COPY . .
 RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
-    cmake --build build --config Release --target server
+    cmake --build build --config Release --target llama-server

 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/server /server && \
+RUN cp /app/build/bin/llama-server /llama-server && \
     rm -rf /app

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/server" ]
+ENTRYPOINT [ "/llama-server" ]

@@ -11,15 +11,15 @@ COPY . .

 ENV LLAMA_CURL=1

-RUN make -j$(nproc) server
+RUN make -j$(nproc) llama-server

 FROM ubuntu:$UBUNTU_VERSION as runtime

 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1

-COPY --from=build /app/server /server
+COPY --from=build /app/llama-server /llama-server

 ENV LC_ALL=C.utf8

-ENTRYPOINT [ "/server" ]
+ENTRYPOINT [ "/llama-server" ]

@@ -1,34 +0,0 @@
-ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
-
-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
-    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    rm /etc/apt/sources.list.d/intel-graphics.list && \
-    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
-    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
-    chmod 644 /usr/share/keyrings/intel-graphics.gpg
-
-ARG LLAMA_SYCL_F16=OFF
-RUN apt-get update && \
-    apt-get install -y git
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
-    echo "LLAMA_SYCL_F16 is set" && \
-    export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
-    fi && \
-    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
-    cmake --build build --config Release --target main
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
-
-COPY --from=build /app/build/bin/main /main
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/main" ]

@@ -6,11 +6,11 @@
 let
 inherit (config.packages) default;
 binaries = [
-  "llama"
+  "llama-cli"
   "llama-embedding"
   "llama-server"
-  "quantize"
-  "train-text-from-scratch"
+  "llama-quantize"
+  "llama-train-text-from-scratch"
 ];
 mkApp = name: {
   type = "app";
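
Note: for Nix users the flake app names follow the binaries list above, so invocations change with the rename. A hedged usage sketch (the attribute names are assumed from that list, the model path is a placeholder):

# Hypothetical flake invocations after the rename
nix run .#llama-cli -- -m ./model.gguf -p "Anything" -n 9
nix run .#llama-server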

@@ -243,8 +243,6 @@ effectiveStdenv.mkDerivation (
 # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
 # if they haven't been added yet.
 postInstall = ''
-  mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
-  mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
   mkdir -p $out/include
   cp $src/llama.h $out/include/
 '';

@@ -294,7 +292,7 @@ effectiveStdenv.mkDerivation (
 license = lib.licenses.mit;

 # Accommodates `nix run` and `lib.getExe`
-mainProgram = "llama";
+mainProgram = "llama-cli";

 # These people might respond, on the best effort basis, if you ping them
 # in case of Nix-specific regressions or for reviewing Nix-specific PRs.

@@ -1,45 +0,0 @@
-ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
-
-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
-    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    rm /etc/apt/sources.list.d/intel-graphics.list && \
-    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
-    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
-    chmod 644 /usr/share/keyrings/intel-graphics.gpg
-
-ARG LLAMA_SYCL_F16=OFF
-RUN apt-get update && \
-    apt-get install -y git libcurl4-openssl-dev
-
-WORKDIR /app
-
-COPY . .
-
-RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
-    echo "LLAMA_SYCL_F16 is set" && \
-    export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
-    fi && \
-    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
-    cmake --build build --config Release --target server
-
-FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
-
-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
-    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    rm /etc/apt/sources.list.d/intel-graphics.list && \
-    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
-    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
-    chmod 644 /usr/share/keyrings/intel-graphics.gpg
-
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
-
-COPY --from=build /app/build/bin/server /server
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/server" ]

@@ -10,11 +10,11 @@ shift
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
     python3 ./convert-hf-to-gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    ./quantize "$@"
+    ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    ./main "$@"
+    ./llama-cli "$@"
 elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
-    ./finetune "$@"
+    ./llama-finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
     for i in `ls $1/$2/ggml-model-f16.bin*`; do

@@ -22,11 +22,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
         echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
     else
         echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-        ./quantize "$i" "${i/f16/q4_0}" q4_0
+        ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
     fi
     done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    ./server "$@"
+    ./llama-server "$@"
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "
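
Note: tools.sh keeps its flags and simply dispatches to the renamed binaries, so wrappers around the full image keep working. A usage sketch against the full image (image tag and mounted paths are placeholders):

# Hypothetical calls into the full image's tools.sh
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --quantize /models/model-f16.gguf /models/model-q4_0.gguf q4_0
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/model-q4_0.gguf -p "Anything" -n 9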

@@ -12,8 +12,8 @@ build*/

 models/*

-/main
-/quantize
+/llama-cli
+/llama-quantize

 arm_neon.h
 compile_commands.json

@@ -26,3 +26,6 @@ indent_size = 2

 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
+
+[examples/cvector-generator/*.txt]
+insert_final_newline = unset

.github/ISSUE_TEMPLATE/01-bug-low.yml (vendored, 2 lines changed)
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-  $./main --version
+  $./llama-cli --version
   version: 2999 (42b4109e)
   built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/ISSUE_TEMPLATE/02-bug-medium.yml (vendored, 2 lines changed)
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-  $./main --version
+  $./llama-cli --version
   version: 2999 (42b4109e)
   built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/ISSUE_TEMPLATE/03-bug-high.yml (vendored, 2 lines changed)
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-  $./main --version
+  $./llama-cli --version
   version: 2999 (42b4109e)
   built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/ISSUE_TEMPLATE/04-bug-critical.yml (vendored, 2 lines changed)
@@ -24,7 +24,7 @@ body:
 label: Name and Version
 description: Which executable and which version of our software are you running? (use `--version` to get a version string)
 placeholder: |
-  $./main --version
+  $./llama-cli --version
   version: 2999 (42b4109e)
   built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
 validations:

.github/labeler.yml (vendored, 1 line changed)
@@ -42,7 +42,6 @@ build:
   - cmake/**
   - CMakeLists.txt
   - CMakePresets.json
-  - codecov.yml
 examples:
   - changed-files:
     - any-glob-to-any-file: examples/**

.github/pull_request_template.md (vendored, 12 lines changed)
@@ -1,5 +1,7 @@
-- Self Reported Review Complexity:
-  - [ ] Review Complexity : Low
-  - [ ] Review Complexity : Medium
-  - [ ] Review Complexity : High
-- [ ] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
+- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
+- Self-reported review complexity:
+  - [ ] Low
+  - [ ] Medium
+  - [ ] High

.github/workflows/bench.yml (vendored, 2 lines changed)
@@ -119,7 +119,7 @@ jobs:
   -DLLAMA_FATAL_WARNINGS=OFF \
   -DLLAMA_ALL_WARNINGS=OFF \
   -DCMAKE_BUILD_TYPE=Release;
-  cmake --build build --config Release -j $(nproc) --target server
+  cmake --build build --config Release -j $(nproc) --target llama-server

 - name: Download the dataset
   id: download_dataset

.github/workflows/build.yml (vendored, 12 lines changed)
@@ -84,7 +84,7 @@ jobs:
 name: llama-bin-macos-arm64.zip

 macOS-latest-cmake-x64:
-  runs-on: macos-latest
+  runs-on: macos-12

 steps:
   - name: Clone

@@ -103,12 +103,10 @@ jobs:
 id: cmake_build
 run: |
   sysctl -a
-  mkdir build
-  cd build
   # Metal is disabled due to intermittent failures with Github runners not having a GPU:
   # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-  cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
-  cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+  cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON
+  cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

 - name: Test
   id: cmake_test

@@ -241,8 +239,8 @@ jobs:
 wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
 echo "Fetch llama2c model"
 wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
-./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

 - name: Determine tag name
   id: tag
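
Note: the macOS x64 job switches from mkdir build && cd build to an out-of-tree configure. The same two commands from the workflow can be run locally as-is:

# Local equivalent of the updated CI build step (flags copied from the workflow above)
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)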

.github/workflows/code-coverage.yml (vendored, 40 lines changed)
@@ -1,40 +0,0 @@
-name: Code Coverage
-on: [push, pull_request]
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  run:
-    runs-on: ubuntu-20.04
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential gcc-8 lcov
-
-      - name: Build
-        run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
-
-      - name: Run tests
-        run: CC=gcc-8 make test
-
-      - name: Generate coverage report
-        run: |
-          make coverage
-          make lcov-report
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v3
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-        with:
-          files: lcov-report/coverage.info

.github/workflows/docker.yml (vendored, 16 lines changed)
@@ -30,20 +30,20 @@ jobs:
 strategy:
   matrix:
     config:
-      - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+      - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+      - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
       - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-      - { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
       # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
       # have disabled them for now until the reason why
       # is understood.
-      - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
+      - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
+      - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
       - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-      - { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
-      - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+      - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+      - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
       - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-      - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-      - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
-      - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
+      - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
+      - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
 steps:
   - name: Check out the repo
     uses: actions/checkout@v4

.github/workflows/server.yml (vendored, 18 lines changed)
@@ -87,8 +87,22 @@ jobs:
   exit 1
 fi

+- name: Build (no OpenMP)
+  id: cmake_build_no_openmp
+  if: ${{ matrix.sanitizer == 'THREAD' }}
+  run: |
+    cmake -B build \
+      -DLLAMA_NATIVE=OFF \
+      -DLLAMA_BUILD_SERVER=ON \
+      -DLLAMA_CURL=ON \
+      -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+      -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+      -DLLAMA_OPENMP=OFF ;
+    cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
 - name: Build
   id: cmake_build
+  if: ${{ matrix.sanitizer != 'THREAD' }}
   run: |
     cmake -B build \
       -DLLAMA_NATIVE=OFF \

@@ -96,7 +110,7 @@ jobs:
       -DLLAMA_CURL=ON \
       -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
       -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-    cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server
+    cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

 - name: Tests
   id: server_integration_tests

@@ -136,7 +150,7 @@ jobs:
 id: cmake_build
 run: |
   cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
-  cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
+  cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server

 - name: Python setup
   id: setup_python

.gitignore (vendored, 152 lines changed)
@@ -1,129 +1,123 @@
-*.o
-
-*.a
-*.so
-*.gguf
-*.gguf.json
-*.bin
-*.exe
-*.dll
-*.log
-*.gcov
-*.gcno
-*.gcda
-*.dot
-*.bat
-*.tmp
-*.metallib
-*.etag
-*.lastModified
-.DS_Store
-.build/
-.cache/
-.ccls-cache/
-.direnv/
-.envrc
-.swiftpm
-.venv
-.clang-tidy
-.vs/
-.vscode/
-.idea/
-
-ggml-metal-embed.metal
-
-lcov-report/
-
-gcovr-report/
-tags
-build*
-!build.zig
-cmake-build-*
-android-ndk-*
-out/
-tmp/
-
-models/*
-models-mnt
-/Pipfile
-/baby-llama
-/beam-search
-/benchmark-matmult
-/convert-llama2c-to-ggml
-/embd-input-test
-/embedding
-/eval-callback
-/gguf
-/gguf-llama-simple
-/gguf-split
-/gritlm
-/imatrix
-/infill
-/libllama.so
-/llama-bench
-/llava-cli
-/lookahead
-/lookup
-/lookup-create
-/lookup-merge
-/lookup-stats
-/main
-/metal
-/passkey
-/perplexity
-/q8dot
-/quantize
-/quantize-stats
-/result
-/save-load-state
-/server
-/simple
-/batched
-/batched-bench
-/export-lora
-/finetune
-/retrieval
-/speculative
-/parallel
-/train-text-from-scratch
-/tokenize
-/vdot
-/common/build-info.cpp
-arm_neon.h
-compile_commands.json
-CMakeSettings.json
-
-__pycache__
-dist
-
-zig-out/
-zig-cache/
-
-ppl-*.txt
-qnt-*.txt
-perf-*.txt
-
-examples/jeopardy/results.txt
-examples/server/*.html.hpp
-examples/server/*.js.hpp
-examples/server/*.mjs.hpp
-examples/server/*.css.hpp
-poetry.lock
-poetry.toml
-nppBackup
-
-# Test binaries
-/tests/test-grammar-parser
-/tests/test-llama-grammar
-/tests/test-double-float
-/tests/test-grad0
-/tests/test-opt
-/tests/test-quantize-fns
-/tests/test-quantize-perf
-/tests/test-sampling
-/tests/test-tokenizer-0
-/tests/test-tokenizer-1-spm
-/tests/test-tokenizer-1-bpe
-/tests/test-rope
-/tests/test-backend-ops
+# Extensions
+
+*.a
+*.bat
+*.bin
+*.dll
+*.dot
+*.etag
+*.exe
+*.gcda
+*.gcno
+*.gcov
+*.gguf
+*.gguf.json
+*.lastModified
+*.log
+*.metallib
+*.o
+*.so
+*.tmp
+
+# IDE / OS
+
+.cache/
+.ccls-cache/
+.direnv/
+.DS_Store
+.envrc
+.idea/
+.swiftpm
+.vs/
+.vscode/
+nppBackup
+
+# Coverage
+
+gcovr-report/
+lcov-report/
+
+# Build Artifacts
+
+tags
+.build/
+build*
+!build-info.cmake
+!build-info.cpp.in
+!build-info.sh
+!build.zig
+/libllama.so
+/llama-*
+android-ndk-*
+arm_neon.h
+cmake-build-*
+CMakeSettings.json
+compile_commands.json
+ggml-metal-embed.metal
+llama-batched-swift
+out/
+tmp/
+
+# CI
+
+!.github/workflows/*.yml
+
+# Models
+
+models/*
+models-mnt
+!models/.editorconfig
+!models/ggml-vocab-*.gguf*
+
+# Zig
+
+zig-out/
+zig-cache/
+
+# Logs
+
+ppl-*.txt
+qnt-*.txt
+perf-*.txt
+
+# Examples
+
+examples/jeopardy/results.txt
+examples/server/*.css.hpp
+examples/server/*.html.hpp
+examples/server/*.js.hpp
+examples/server/*.mjs.hpp
+!build_64.sh
+!examples/*.bat
+!examples/*/*.kts
+!examples/*/*/*.kts
+!examples/sycl/*.bat
+!examples/sycl/*.sh
+
+# Python
+
+__pycache__
+.venv
+/Pipfile
+dist
+poetry.lock
+poetry.toml
+
+# Test binaries
+/tests/test-backend-ops
+/tests/test-double-float
+/tests/test-grad0
+/tests/test-grammar-parser
+/tests/test-llama-grammar
+/tests/test-opt
+/tests/test-quantize-fns
+/tests/test-quantize-perf
+/tests/test-rope
+/tests/test-sampling
+/tests/test-tokenizer-0
+/tests/test-tokenizer-1-bpe
+/tests/test-tokenizer-1-spm
+
+# Scripts
+!/scripts/install-oneapi.bat

@@ -39,8 +39,12 @@ endif()

 if (APPLE)
     set(LLAMA_METAL_DEFAULT ON)
+    set(LLAMA_BLAS_DEFAULT ON)
+    set(LLAMA_BLAS_VENDOR_DEFAULT "Apple")
 else()
     set(LLAMA_METAL_DEFAULT OFF)
+    set(LLAMA_BLAS_DEFAULT OFF)
+    set(LLAMA_BLAS_VENDOR_DEFAULT "Generic")
 endif()

 set(LLAMA_LLAMAFILE_DEFAULT ON)

@@ -91,9 +95,10 @@ endif()

 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
-option(LLAMA_BLAS "llama: use BLAS" OFF)
+option(LLAMA_BLAS "llama: use BLAS" ${LLAMA_BLAS_DEFAULT})
+set(LLAMA_BLAS_VENDOR ${LLAMA_BLAS_VENDOR_DEFAULT} CACHE STRING
+    "llama: BLAS library vendor")
 option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
-set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUDA "llama: use CUDA" OFF)
 option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)

@@ -114,6 +119,7 @@ option(LLAMA_HIP_UMA "llama: use HIP unified memory arch
 option(LLAMA_VULKAN "llama: use Vulkan" OFF)
 option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF)
 option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF)
+option(LLAMA_VULKAN_MEMORY_DEBUG "llama: enable Vulkan memory debug output" OFF)
 option(LLAMA_VULKAN_VALIDATE "llama: enable Vulkan validation" OFF)
 option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests" OFF)
 option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})

@@ -311,9 +317,9 @@ if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
     endif()
-    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
-        set(BLA_SIZEOF_INTEGER 8)
-    endif()
+    #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
+    #    set(BLA_SIZEOF_INTEGER 8)
+    #endif()

     set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
     find_package(BLAS)

@@ -321,7 +327,7 @@ if (LLAMA_BLAS)
     if (BLAS_FOUND)
         message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")

-        if ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
+        if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${LLAMA_BLAS_VENDOR} MATCHES "Apple"))
            # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
            # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
            find_package(PkgConfig REQUIRED)

@@ -374,12 +380,15 @@ if (LLAMA_BLAS)

         add_compile_options(${BLAS_LINKER_FLAGS})

-        add_compile_definitions(GGML_USE_OPENBLAS)
+        add_compile_definitions(GGML_USE_BLAS)

         if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
             add_compile_definitions(GGML_BLAS_USE_MKL)
         endif()

+        set(GGML_HEADERS_BLAS ggml-blas.h)
+        set(GGML_SOURCES_BLAS ggml-blas.cpp)
+
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
         set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
     else()

@@ -526,6 +535,10 @@ if (LLAMA_VULKAN)
         add_compile_definitions(GGML_VULKAN_DEBUG)
     endif()

+    if (LLAMA_VULKAN_MEMORY_DEBUG)
+        add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
+    endif()
+
     if (LLAMA_VULKAN_VALIDATE)
         add_compile_definitions(GGML_VULKAN_VALIDATE)
     endif()

@@ -652,6 +665,7 @@ if (LLAMA_SYCL)
     #todo: AOT

     find_package(IntelSYCL REQUIRED)
+    find_package(MKL REQUIRED)

     message(STATUS "SYCL found")

@@ -666,21 +680,22 @@ if (LLAMA_SYCL)
     endif()

     add_compile_options(-I./) #include DPCT
-    add_compile_options(-I/${SYCL_INCLUDE_DIR})

     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
     if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
     endif()

     set(GGML_HEADERS_SYCL ggml-sycl.h)
-    set(GGML_SOURCES_SYCL ggml-sycl.cpp)
+    file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
+    list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")

     if (WIN32)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
     else()
+        add_compile_options(-I/${SYCL_INCLUDE_DIR})
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
         if (LLAMA_SYCL_TARGET STREQUAL "INTEL")
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
         elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")

@@ -1258,6 +1273,7 @@ add_library(ggml OBJECT
     ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
     ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
     ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
+    ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
     ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
     )
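
Note: with these changes BLAS becomes its own ggml backend (ggml-blas.cpp/.h) and is enabled by default with the Apple vendor on macOS. A hedged configure sketch for turning the new option on elsewhere (the OpenBLAS vendor name follows CMake's FindBLAS conventions and is an assumption, not part of this diff):

# Sketch: enable the new BLAS backend explicitly on a non-Apple platform
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
cmake --build build --config Release -j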

@@ -11,9 +11,21 @@
         "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
       }
     },
+    {
+      "name": "sycl-base",
+      "hidden": true,
+      "generator": "Ninja",
+      "binaryDir": "${sourceDir}/build-${presetName}",
+      "cacheVariables": {
+        "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+        "CMAKE_CXX_COMPILER": "icx",
+        "LLAMA_SYCL": "ON",
+        "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+      }
+    },
     { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
-    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
+    { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
     { "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },

     {

@@ -35,15 +47,18 @@
     },

     { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
-    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
-    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
+    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
+    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },

     { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
-    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
-    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
+    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
+    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },

     { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
-    { "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
-    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
+    { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
+    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
+
+    { "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
+    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
   ]
 }
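
Note: the new sycl-base preset ties LLAMA_SYCL, the icx compiler, and a per-preset build directory together, so a SYCL build can be driven purely through presets. A usage sketch (preset and directory names follow the block above; running from an environment with oneAPI sourced is assumed):

# Sketch: configure and build via the new SYCL presets
cmake --preset x64-windows-sycl-release
cmake --build build-x64-windows-sycl-release --config Release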
176
Makefile
176
Makefile
|
@ -1,8 +1,45 @@
|
||||||
# Define the default target now so that it is always the first target
|
# Define the default target now so that it is always the first target
|
||||||
BUILD_TARGETS = \
|
BUILD_TARGETS = \
|
||||||
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
libllava.a \
|
||||||
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
|
llama-baby-llama \
|
||||||
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
|
llama-batched \
|
||||||
|
llama-batched-bench \
|
||||||
|
llama-bench \
|
||||||
|
llama-benchmark-matmult \
|
||||||
|
llama-cli \
|
||||||
|
llama-convert-llama2c-to-ggml \
|
||||||
|
llama-embedding \
|
||||||
|
llama-eval-callback \
|
||||||
|
llama-export-lora \
|
||||||
|
llama-finetune \
|
||||||
|
llama-gbnf-validator \
|
||||||
|
llama-gguf \
|
||||||
|
llama-gguf-split \
|
||||||
|
llama-gritlm \
|
||||||
|
llama-imatrix \
|
||||||
|
llama-infill \
|
||||||
|
llama-llava-cli \
|
||||||
|
llama-lookahead \
|
||||||
|
llama-lookup \
|
||||||
|
llama-lookup-create \
|
||||||
|
llama-lookup-merge \
|
||||||
|
llama-lookup-stats \
|
||||||
|
llama-parallel \
|
||||||
|
llama-passkey \
|
||||||
|
llama-perplexity \
|
||||||
|
llama-q8dot \
|
||||||
|
llama-quantize \
|
||||||
|
llama-quantize-stats \
|
||||||
|
llama-retrieval \
|
||||||
|
llama-save-load-state \
|
||||||
|
llama-server \
|
||||||
|
llama-simple \
|
||||||
|
llama-speculative \
|
||||||
|
llama-tokenize \
|
||||||
|
llama-train-text-from-scratch \
|
||||||
|
llama-vdot \
|
||||||
|
llama-cvector-generator \
|
||||||
|
tests/test-c.o
|
||||||
|
|
||||||
# Binaries only useful for tests
|
# Binaries only useful for tests
|
||||||
TEST_TARGETS = \
|
TEST_TARGETS = \
|
||||||
@@ -404,10 +441,11 @@ ifndef LLAMA_NO_ACCELERATE
# Mac OS - include Accelerate framework.
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
ifeq ($(UNAME_S),Darwin)
-MK_CPPFLAGS += -DGGML_USE_ACCELERATE
+MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
MK_LDFLAGS += -framework Accelerate
+OBJS += ggml-blas.o
endif
endif # LLAMA_NO_ACCELERATE

@@ -418,21 +456,30 @@ ifndef LLAMA_NO_OPENMP
endif # LLAMA_NO_OPENMP

ifdef LLAMA_OPENBLAS
-MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
+MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
MK_LDFLAGS += $(shell pkg-config --libs openblas)
+OBJS += ggml-blas.o
endif # LLAMA_OPENBLAS

+ifdef LLAMA_OPENBLAS64
+MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
+MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64)
+MK_LDFLAGS += $(shell pkg-config --libs openblas64)
+OBJS += ggml-blas.o
+endif # LLAMA_OPENBLAS64

+ifdef LLAMA_BLIS
+MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis
+MK_LDFLAGS += -lblis -L/usr/local/lib
+OBJS += ggml-blas.o
+endif # LLAMA_BLIS

ifndef LLAMA_NO_LLAMAFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
OBJS += sgemm.o
endif

-ifdef LLAMA_BLIS
-MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
-MK_LDFLAGS += -lblis -L/usr/local/lib
-endif # LLAMA_BLIS

ifdef LLAMA_RPC
MK_CPPFLAGS += -DGGML_USE_RPC
OBJS += ggml-rpc.o
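All of these BLAS variants now define `GGML_USE_BLAS` and compile `ggml-blas.o`; only the linked library differs. A hedged build sketch, assuming a system OpenBLAS with pkg-config metadata installed (swap in `LLAMA_OPENBLAS64=1` or `LLAMA_BLIS=1` for the other paths):

```sh
make LLAMA_OPENBLAS=1 -j llama-cli
```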
@@ -460,7 +507,7 @@ ifdef LLAMA_CUDA
CUDA_PATH ?= /usr/local/cuda
endif
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
-MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
+MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
OBJS += $(OBJS_CUDA_TEMP_INST)
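The added `-L$(CUDA_PATH)/lib64/stubs` points the linker at the toolkit's stub libraries, which typically lets `-lcuda` resolve on machines that have the CUDA toolkit but no driver installed (e.g. build containers). A hedged build sketch, assuming the toolkit sits at the default `/usr/local/cuda`:

```sh
make LLAMA_CUDA=1 -j llama-cli
```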
@@ -561,6 +608,10 @@ ifdef LLAMA_VULKAN_DEBUG
MK_CPPFLAGS += -DGGML_VULKAN_DEBUG
endif

+ifdef LLAMA_VULKAN_MEMORY_DEBUG
+MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
+endif

ifdef LLAMA_VULKAN_VALIDATE
MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
endif

@@ -740,6 +791,9 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@

+ggml-blas.o: ggml-blas.cpp ggml-blas.h
+$(CXX) $(CXXFLAGS) -c $< -o $@

unicode.o: unicode.cpp unicode.h
$(CXX) $(CXXFLAGS) -c $< -o $@

@@ -777,7 +831,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)

clean:
-rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+rm -vrf *.o tests/*.o *.so *.a *.dll common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
rm -vrf ggml-cuda/*.o
rm -vrf ggml-cuda/template-instances/*.o
find examples pocs -type f -name "*.o" -delete

@@ -793,62 +847,62 @@ clean:
# Helper function that replaces .c, .cpp, and .cu file endings with .o:
GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))

-main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+llama-cli: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo
-@echo '==== Run ./main -h for help. ===='
+@echo '==== Run ./llama-cli -h for help. ===='
@echo

-infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+llama-infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
+llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+llama-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

@@ -861,23 +915,27 @@ examples/server/%.hpp: examples/server/public/% Makefile
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
) > $@

-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
+llama-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
+llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -888,55 +946,61 @@ llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS)
libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual

-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)

-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+llama-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+llama-finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
+llama-export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+llama-speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
-$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
-$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
-$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
-
-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+llama-lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
+$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookup-create: examples/lookup/lookup-create.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
+$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookup-merge: examples/lookup/lookup-merge.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
+$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-lookup-stats: examples/lookup/lookup-stats.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
+$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
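The lookup helper binaries that were previously produced as side effects of a single rule are now ordinary targets; a hedged sketch of building them explicitly (target names taken from the rules above):

```sh
make -j llama-lookup llama-lookup-create llama-lookup-merge llama-lookup-stats
```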
@@ -962,20 +1026,20 @@ build-info.o: common/build-info.cpp
tests: $(TEST_TARGETS)

-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
+llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-run-benchmark-matmult: benchmark-matmult
+run-benchmark-matmult: llama-benchmark-matmult
./$@

.PHONY: run-benchmark-matmult swift

-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
+llama-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
+llama-q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -1,6 +1,7 @@
# llama.cpp for SYCL

- [Background](#background)
+- [Recommended Release](#recommended-release)
- [News](#news)
- [OS](#os)
- [Hardware](#hardware)

@@ -31,8 +32,23 @@ When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneM
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.

+## Recommended Release
+
+The SYCL backend would be broken by some PRs due to no online CI.
+
+The following release is verified with good quality:
+
+|Commit ID|Tag|Release|Verified Platform|
+|-|-|-|-|
+|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
+
## News

+- 2024.5
+  - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
+  - Arch Linux is verified successfully.
+
- 2024.4
  - Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M.

@@ -77,7 +93,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
*Notes:*

- **Memory**
-  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
+  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.

  - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
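A hedged way to read that figure out of the log (the model path is a placeholder and the exact log wording may vary between builds):

```sh
./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "hi" -n 8 2>&1 | grep "llm_load_tensors"
```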
@@ -99,14 +115,14 @@ The docker build option is currently limited to *intel GPU* targets.
### Build image
```sh
# Using FP16
-docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
+docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
```

*Notes*:

To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
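Concretely, the FP32 variant is the same command with the build argument dropped:

```sh
docker build -t llama-cpp-sycl -f .devops/llama-cli-intel.Dockerfile .
```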
-You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative.
+You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.

### Run container

@@ -275,7 +291,7 @@ source /opt/intel/oneapi/setvars.sh
Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:

```sh
-./build/bin/ls-sycl-device
+./build/bin/llama-ls-sycl-device
```
A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
```

@@ -313,7 +329,7 @@ Examples:
- Use device 0:

```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
```
or run by script:

@@ -324,7 +340,7 @@ or run by script:
- Use multiple devices:

```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```

Otherwise, you can run the script:

@@ -394,15 +410,9 @@ Output (example):
4. Install build tools

-a. Download & install cmake for Windows: https://cmake.org/download/
+a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
+b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)

-b. Download & install mingw-w64 make for Windows provided by w64devkit
-
-- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
-- Extract `w64devkit` on your pc.
-- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).

### II. Build llama.cpp

@@ -412,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
+cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release

# Option 2: Or FP16
-cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

cmake --build build --config Release -j
```

@@ -425,9 +435,23 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in
.\examples\sycl\win-build-sycl.bat
```

+Or, use CMake presets to build:
+```sh
+cmake --preset x64-windows-sycl-release
+cmake --build build-x64-windows-sycl-release -j --target llama-cli
+
+cmake -DLLAMA_SYCL_F16=ON --preset x64-windows-sycl-release
+cmake --build build-x64-windows-sycl-release -j --target llama-cli
+
+cmake --preset x64-windows-sycl-debug
+cmake --build build-x64-windows-sycl-debug -j --target llama-cli
+```
+
+Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.

*Notes:*

-- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`.
+- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.

### III. Run the inference

@@ -488,13 +512,13 @@ Examples:
- Use device 0:

```
-build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
+build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
```

- Use multiple devices:

```
-build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
+build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
```
Otherwise, run the following wrapper script:

README.md (62 changed lines)

@@ -10,6 +10,9 @@
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

+> [!IMPORTANT]
+> [2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)

### Recent API changes

- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
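In practice only the executable name changes, not its flags; a hedged before/after sketch (the model path is a placeholder):

```sh
./main -m ./models/model.gguf -p "Hello"        # before the rename
./llama-cli -m ./models/model.gguf -p "Hello"   # after the rename
```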
@@ -192,6 +195,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
+- [RAGNA Desktop](https://ragna.app/) (proprietary)
- [RecurseChat](https://recurse.chat/) (proprietary)
- [semperai/amica](https://github.com/semperai/amica)
- [withcatai/catai](https://github.com/withcatai/catai)

@@ -205,6 +209,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [AIKit](https://github.com/sozercan/aikit) (MIT)
+- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)

*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

@@ -217,7 +222,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
Here is a typical run using LLaMA v2 13B on M2 Ultra:

```
-$ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
+$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
I llama.cpp build info:
I UNAME_S: Darwin
I UNAME_P: arm

@@ -383,6 +388,30 @@ brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668

+### Nix
+
+On Mac and Linux, the Nix package manager can be used via
+```
+nix profile install nixpkgs#llama-cpp
+```
+For flake enabled installs.
+
+Or
+```
+nix-env --file '<nixpkgs>' --install --attr llama-cpp
+```
+For non-flake enabled installs.
+
+This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
+
+#### Flox
+
+On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
+```
+flox install llama-cpp
+```
+Flox follows the nixpkgs build of llama.cpp.

### Metal Build

On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.

@@ -555,7 +584,7 @@ Building the program with BLAS support may lead to some performance improvements
```sh
# Build the image
-docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .
+docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .

# Then, use it:
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33

@@ -586,7 +615,7 @@ Building the program with BLAS support may lead to some performance improvements
cmake -B build -DLLAMA_VULKAN=1
cmake --build build --config Release
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
-./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
+./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4

# You should see in the output, ggml_vulkan detected your GPU. For example:
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32

@@ -619,21 +648,18 @@ python3 -m pip install -r requirements.txt
# convert the model to ggml FP16 format
python3 convert-hf-to-gguf.py models/mymodel/

-# [Optional] for models using BPE tokenizers
-python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe
-
# quantize the model to 4-bits (using Q4_K_M method)
-./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
+./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M

# update the gguf filetype to current version if older version is now unsupported
-./quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
+./llama-quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
```

### Run the quantized model

```bash
# start inference on a gguf model
-./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
+./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
```

When running the larger models, make sure you have enough disk space to store all the intermediate files.

@@ -708,7 +734,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 thread
#### How to run

1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
+2. Run `./llama-perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
3. Output:
```
perplexity : calculating perplexity over 655 chunks

@@ -732,16 +758,16 @@ Here is an example of a few-shot interaction, invoked with the command
./examples/chat-13B.sh

# custom arguments using a 13B model
-./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
+./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
```

-Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
+Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.



### Persistent Interaction

-The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
+The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.

```bash
# Start a new chat

@@ -763,7 +789,7 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:

```bash
-./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
+./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
```

The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).

@@ -842,7 +868,7 @@ $mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/ho
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
-$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
+$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
```

Here's a demo of an interactive session running on Pixel 5 phone:

@@ -909,8 +935,8 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia

```bash
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
-docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
-docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .
+docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
+docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
```

You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.

@@ -960,7 +986,7 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m

### Docs

-- [main](./examples/main/README.md)
+- [main (cli)](./examples/main/README.md)
- [server](./examples/server/README.md)
- [jeopardy](./examples/jeopardy/README.md)
- [BLIS](./docs/BLIS.md)

ci/run.sh (224 changed lines)

@@ -303,47 +303,47 @@ function gg_run_open_llama_7b_v2 {

wiki_test="${path_wiki}/wiki.test.raw"

-./bin/quantize ${model_f16} ${model_q8_0} q8_0
-./bin/quantize ${model_f16} ${model_q4_0} q4_0
-./bin/quantize ${model_f16} ${model_q4_1} q4_1
-./bin/quantize ${model_f16} ${model_q5_0} q5_0
-./bin/quantize ${model_f16} ${model_q5_1} q5_1
-./bin/quantize ${model_f16} ${model_q2_k} q2_k
-./bin/quantize ${model_f16} ${model_q3_k} q3_k
-./bin/quantize ${model_f16} ${model_q4_k} q4_k
-./bin/quantize ${model_f16} ${model_q5_k} q5_k
-./bin/quantize ${model_f16} ${model_q6_k} q6_k
+./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
+./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
+./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
+./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
+./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
+./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
+./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
+./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
+./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
+./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

-(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
@@ -437,45 +437,45 @@ function gg_run_pythia_1_4b {

 wiki_test_60="${path_wiki}/wiki.test-60.raw"

-./bin/quantize ${model_f16} ${model_q8_0} q8_0
+./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
-./bin/quantize ${model_f16} ${model_q4_0} q4_0
+./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
-./bin/quantize ${model_f16} ${model_q4_1} q4_1
+./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
-./bin/quantize ${model_f16} ${model_q5_0} q5_0
+./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
-./bin/quantize ${model_f16} ${model_q5_1} q5_1
+./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
-./bin/quantize ${model_f16} ${model_q2_k} q2_k
+./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
-./bin/quantize ${model_f16} ${model_q3_k} q3_k
+./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
-./bin/quantize ${model_f16} ${model_q4_k} q4_k
+./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
-./bin/quantize ${model_f16} ${model_q5_k} q5_k
+./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
-./bin/quantize ${model_f16} ${model_q6_k} q6_k
+./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

-(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+(time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+(time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

 function check_ppl {
 qnt="$1"
@@ -569,47 +569,47 @@ function gg_run_pythia_2_8b {

 wiki_test="${path_wiki}/wiki.test.raw"

-./bin/quantize ${model_f16} ${model_q8_0} q8_0
+./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
-./bin/quantize ${model_f16} ${model_q4_0} q4_0
+./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
-./bin/quantize ${model_f16} ${model_q4_1} q4_1
+./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
-./bin/quantize ${model_f16} ${model_q5_0} q5_0
+./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
-./bin/quantize ${model_f16} ${model_q5_1} q5_1
+./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
-./bin/quantize ${model_f16} ${model_q2_k} q2_k
+./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
-./bin/quantize ${model_f16} ${model_q3_k} q3_k
+./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
-./bin/quantize ${model_f16} ${model_q4_k} q4_k
+./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
-./bin/quantize ${model_f16} ${model_q5_k} q5_k
+./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
-./bin/quantize ${model_f16} ${model_q6_k} q6_k
+./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

-(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+(time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+(time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+(time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+(time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

 function check_ppl {
 qnt="$1"
@@ -693,10 +693,10 @@ function gg_run_embd_bge_small {

 model_f16="${path_models}/ggml-model-f16.gguf"
 model_q8_0="${path_models}/ggml-model-q8_0.gguf"

-./bin/quantize ${model_f16} ${model_q8_0} q8_0
+./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

-(time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

 set +e
 }
codecov.yml (14 changes, file deleted)
@@ -1,14 +0,0 @@
-comment: off
-
-coverage:
-status:
-project:
-default:
-target: auto
-threshold: 0
-base: auto
-patch:
-default:
-target: auto
-threshold: 0
-base: auto
@@ -1576,6 +1576,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 return true;
 }
 params.out_file = argv[i];
+params.cvector_outfile = argv[i];
 return true;
 }
 if (arg == "-ofreq" || arg == "--output-frequency") {
@@ -1610,6 +1611,55 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 params.i_chunk = std::stoi(argv[i]);
 return true;
 }
+// cvector params
+if (arg == "--completions-file") {
+if (++i >= argc) {
+invalid_param = true;
+return true;
+}
+params.cvector_completions_file = argv[i];
+return true;
+}
+if (arg == "--positive-file") {
+if (++i >= argc) {
+invalid_param = true;
+return true;
+}
+params.cvector_positive_file = argv[i];
+return true;
+}
+if (arg == "--negative-file") {
+if (++i >= argc) {
+invalid_param = true;
+return true;
+}
+params.cvector_negative_file = argv[i];
+return true;
+}
+if (arg == "--completions") {
+if (++i >= argc) {
+invalid_param = true;
+return true;
+}
+params.n_completions = std::stoi(argv[i]);
+return true;
+}
+if (arg == "--pca-batch") {
+if (++i >= argc) {
+invalid_param = true;
+return true;
+}
+params.n_pca_batch = std::stoi(argv[i]);
+return true;
+}
+if (arg == "--pca-iter") {
+if (++i >= argc) {
+invalid_param = true;
+return true;
+}
+params.n_pca_iterations = std::stoi(argv[i]);
+return true;
+}
 #ifndef LOG_DISABLE_LOGS
 // Parse args for logging parameters
 if (log_param_single_parse(argv[i])) {
@@ -1931,6 +1981,16 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 options.push_back({ "logging", " --log-append", "Don't truncate the old log file." });
 #endif // LOG_DISABLE_LOGS

+options.push_back({ "cvector" });
+options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
+options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
+options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
+options.push_back({ "cvector", " --completions-file FNAME",
+"completions file (default: '%s')", params.cvector_completions_file.c_str() });
+options.push_back({ "cvector", " --completions N", "number of lines of completions file to use (default: %d)", params.n_completions });
+options.push_back({ "cvector", " --batch-pca N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
+options.push_back({ "cvector", " --iter-pca N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });

 printf("usage: %s [options]\n", argv[0]);

 for (const auto & o : options) {
@@ -73,7 +73,6 @@ struct gpt_params {
 int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
 int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
 float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
-int32_t n_beams = 0; // if non-zero then use beam search of given width.
 int32_t grp_attn_n = 1; // group-attention factor
 int32_t grp_attn_w = 512; // group-attention width
 int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
@@ -232,6 +231,15 @@ struct gpt_params {

 bool process_output = false; // collect data for the output tensor
 bool compute_ppl = true; // whether to compute perplexity

+// cvector-generator params
+int n_completions = 64;
+int n_pca_batch = 20;
+int n_pca_iterations = 1000;
+std::string cvector_outfile = "control_vector.gguf";
+std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
+std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
 };

 void gpt_params_handle_model_default(gpt_params & params);
@@ -83,6 +83,7 @@ models = [
 {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
 {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
 {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
+{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
 {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
 ]

@@ -477,6 +477,9 @@ class Model:
 if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
 # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
 res = "smaug-bpe"
+if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360":
+# ref: https://huggingface.co/LumiOpen/Poro-34B-chat
+res = "poro-chat"
 if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
 # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
 res = "jina-v2-code"
@@ -1671,6 +1674,12 @@ class Qwen2MoeModel(Model):
 super().set_gguf_parameters()
 if (n_experts := self.hparams.get("num_experts")) is not None:
 self.gguf_writer.add_expert_count(n_experts)
+if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
+if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
+self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
+logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")

 _experts: list[dict[str, Tensor]] | None = None

@@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil

 When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.

-Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
+Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback).

 ## GGUF specification

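For context on that renamed debugging tool, a minimal sketch of such a run is shown below. This is an assumed example, not an excerpt from the diff: the model path is a placeholder, and the flags shown are just the standard common options (`-m`, `-p`, `-ngl`) that the example is expected to accept.

```sh
# build the renamed example target, then dump per-operation tensor values while evaluating a short prompt
cmake --build build --config Release --target llama-eval-callback
./build/bin/llama-eval-callback -m ./models/model.gguf -p "hello" -ngl 99
```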
@@ -3,7 +3,7 @@
 ## Verifying that the model is running on the GPU with CUDA
 Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
-./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
+./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```

 When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@@ -27,7 +27,7 @@ RAM: 32GB

 Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)

-Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
+Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`

 Result:

@@ -12,43 +12,45 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})

 if (EMSCRIPTEN)
 else()
+add_subdirectory(cvector-generator)
 add_subdirectory(baby-llama)
-add_subdirectory(batched)
 add_subdirectory(batched-bench)
+add_subdirectory(batched)
 add_subdirectory(benchmark)
 add_subdirectory(convert-llama2c-to-ggml)
 add_subdirectory(embedding)
 add_subdirectory(eval-callback)
+add_subdirectory(export-lora)
 add_subdirectory(finetune)
-add_subdirectory(gritlm)
+add_subdirectory(gbnf-validator)
 add_subdirectory(gguf-split)
+add_subdirectory(gguf)
+add_subdirectory(gritlm)
+add_subdirectory(imatrix)
 add_subdirectory(infill)
 add_subdirectory(llama-bench)
 add_subdirectory(llava)
-if (LLAMA_SYCL)
-add_subdirectory(sycl)
-endif()
-add_subdirectory(main)
-add_subdirectory(tokenize)
-add_subdirectory(parallel)
-add_subdirectory(perplexity)
-add_subdirectory(quantize)
-add_subdirectory(quantize-stats)
-add_subdirectory(retrieval)
-add_subdirectory(save-load-state)
-add_subdirectory(simple)
-add_subdirectory(passkey)
-add_subdirectory(speculative)
 add_subdirectory(lookahead)
 add_subdirectory(lookup)
-add_subdirectory(gguf)
+add_subdirectory(main)
-add_subdirectory(train-text-from-scratch)
+add_subdirectory(parallel)
-add_subdirectory(imatrix)
+add_subdirectory(passkey)
-if (LLAMA_BUILD_SERVER)
+add_subdirectory(perplexity)
-add_subdirectory(server)
+add_subdirectory(quantize-stats)
-endif()
+add_subdirectory(quantize)
-add_subdirectory(export-lora)
+add_subdirectory(retrieval)
 if (LLAMA_RPC)
 add_subdirectory(rpc)
 endif()
+if (LLAMA_BUILD_SERVER)
+add_subdirectory(server)
+endif()
+if (LLAMA_SYCL)
+add_subdirectory(sycl)
+endif()
+add_subdirectory(save-load-state)
+add_subdirectory(simple)
+add_subdirectory(speculative)
+add_subdirectory(tokenize)
+add_subdirectory(train-text-from-scratch)
 endif()
@@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
 GEN_OPTIONS+=(--threads "$N_THREAD")
 fi

-./main "${GEN_OPTIONS[@]}" \
+./llama-cli "${GEN_OPTIONS[@]}" \
 --model "$MODEL" \
 --in-prefix " " \
 --in-suffix "${AI_NAME}:" \
@@ -1,4 +1,4 @@
-set(TARGET baby-llama)
+set(TARGET llama-baby-llama)
 add_executable(${TARGET} baby-llama.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -58,4 +58,4 @@ echo "$2
 model=$1

 # generate the most likely continuation until the string "===" is found
-./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
+./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
@@ -1,4 +1,4 @@
-set(TARGET batched-bench)
+set(TARGET llama-batched-bench)
 add_executable(${TARGET} batched-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -10,16 +10,16 @@ There are 2 modes of operation:
 - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)

 ```bash
-./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
+./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]

 # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
+./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99

 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
+./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps

 # custom set of batches
-./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
+./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
 ```

 ## Sample results

@@ -1,6 +1,6 @@
 .PHONY: build

 build:
-xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
+xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
-rm -f ./batched_swift
+rm -f ./llama-batched-swift
-ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
+ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift
@@ -4,7 +4,7 @@
 import PackageDescription

 let package = Package(
-name: "batched_swift",
+name: "llama-batched-swift",
 platforms: [.macOS(.v12)],
 dependencies: [
 .package(name: "llama", path: "../../"),
@@ -13,7 +13,7 @@ let package = Package(
 // Targets are the basic building blocks of a package, defining a module or a test suite.
 // Targets can depend on other targets in this package and products from dependencies.
 .executableTarget(
-name: "batched_swift",
+name: "llama-batched-swift",
 dependencies: ["llama"],
 path: "Sources",
 linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
@@ -1,4 +1,4 @@
 This is a swift clone of `examples/batched`.

 $ `make`
-$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`
+$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]`
@@ -1,4 +1,4 @@
-set(TARGET batched)
+set(TARGET llama-batched)
 add_executable(${TARGET} batched.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -3,7 +3,7 @@
 The example demonstrates batched generation from a given prompt

 ```bash
-./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
+./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4

 ...

@@ -1,4 +1,4 @@
-set(TARGET benchmark)
+set(TARGET llama-bench-matmult)
 add_executable(${TARGET} benchmark-matmult.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
 $PROMPT_TEMPLATE > $PROMPT_FILE

 # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./main $GEN_OPTIONS \
+./llama-cli $GEN_OPTIONS \
 --model "$MODEL" \
 --threads "$N_THREAD" \
 --n_predict "$N_PREDICTS" \
@@ -62,7 +62,7 @@ fi
 if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
 echo 'Prompt cache does not exist, building...'
 # Default batch_size to 64 here for better user feedback during initial prompt processing
-./main 2>>"$LOG" \
+./llama-cli 2>>"$LOG" \
 --batch_size 64 \
 "${OPTS[@]}" \
 --prompt-cache "$PROMPT_CACHE_FILE" \
@@ -109,13 +109,13 @@ while read -e line; do

 printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"

-./main 2>>"$LOG" "${OPTS[@]}" \
+./llama-cli 2>>"$LOG" "${OPTS[@]}" \
 --prompt-cache "$CUR_PROMPT_CACHE" \
 --prompt-cache-all \
 --file "$CUR_PROMPT_FILE" \
 --reverse-prompt "${USER_NAME}:" \
 --n_predict "$n_predict" |
-skip_bytes 1 | # skip BOS token added by ./main
+skip_bytes 1 | # skip BOS token added by ./llama-cli
 tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
 skip_bytes "$n_prompt_len_pre" # print generation

@@ -133,7 +133,7 @@ while read -e line; do
 # TODO get both messages in one go
 if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
 ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
-echo >&2 "Couldn't get number of tokens from ./main output!"
+echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
 exit 1
 fi

@@ -144,7 +144,7 @@ while read -e line; do
 fi

 # Update cache for next prompt in background, ideally during user input
-./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
+./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
 --prompt-cache "$NEXT_PROMPT_CACHE" \
 --file "$NEXT_PROMPT_FILE" \
 --n_predict 1 &
@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
 $PROMPT_TEMPLATE > $PROMPT_FILE

 # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./bin/main $GEN_OPTIONS \
+./bin/llama-cli $GEN_OPTIONS \
 --model "$MODEL" \
 --threads "$N_THREAD" \
 --n_predict "$N_PREDICTS" \
@@ -11,6 +11,6 @@ cd ..
 #
 # "--keep 48" is based on the contents of prompts/chat-with-bob.txt
 #
-./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
+./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
 --repeat_penalty 1.0 --color -i \
 -r "User:" -f prompts/chat-with-bob.txt
@@ -1,4 +1,4 @@
-set(TARGET convert-llama2c-to-ggml)
+set(TARGET llama-convert-llama2c-to-ggml)
 add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -8,7 +8,7 @@ To convert the model first download the models from the [llama2.c](https://githu

 After successful compilation, following usage options are available:
 ```
-usage: ./convert-llama2c-to-ggml [options]
+usage: ./llama-convert-llama2c-to-ggml [options]

 options:
 -h, --help show this help message and exit
@@ -19,10 +19,10 @@ options:

 An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:

-`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
+`$ ./llama-convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`

 Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).

 Now you can use the model with a command like:

-`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
+`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
examples/cvector-generator/CMakeLists.txt (new file, 5 additions)
@@ -0,0 +1,5 @@
+set(TARGET llama-cvector-generator)
+add_executable(${TARGET} cvector-generator.cpp pca.hpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/cvector-generator/README.md (new file, 34 additions)
@@ -0,0 +1,34 @@
+# cvector-generator
+
+This example demonstrates how to generate a control vector using gguf models.
+
+Related PRs:
+- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970)
+- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880)
+- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514)
+
+## Examples
+
+```sh
+# CPU only
+./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf
+
+# With GPU
+./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99
+
+# With advanced options
+./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100
+
+# To see help message
+./cvector-generator -h
+# Then, have a look at "cvector" section
+```
+
+## Tips and tricks
+
+If you have multiple lines per prompt, you can escape the newline character (change it to `\n`). For example:
+
+```
+<|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
+<|im_start|>system\nYou are in a very good mood today<|im_end|>
+```
examples/cvector-generator/completions.txt (new file, 582 additions; truncated here)
@@ -0,0 +1,582 @@
+
+That game
+I can see
+Hmm, this
+I can relate to
+Who is
+I understand the
+Ugh,
+What the hell was
+Hey, did anyone
+Although
+Thank you for choosing
+What are you
+Oh w
+How dare you open
+It was my pleasure
+I'm hon
+I appreciate that you
+Are you k
+Whoever left this
+It's always
+Ew,
+Hey, I l
+Hello? Is someone
+I understand that
+That poem
+Aww, poor
+Hey, it
+Alright, who
+I didn't
+Well, life
+The document
+Oh no, this
+I'm concerned
+Hello, this is
+This art
+Hmm, this drink
+Hi there!
+It seems
+Is
+Good
+I can't
+Ex
+Who are
+I can see that
+Wow,
+Today is a
+Hey friend
+Sometimes friends
+Oh, this old
+The weather outside
+This place is sur
+I appreciate your input
+Thank you for the
+Look at
+I'm disappoint
+To my
+How dare you
+That's an
+This piece of art
+Eww
+This park is
+This is incredible
+Oh no, someone
+Exc
+Well, it'
+I warned
+Hey, I understand
+Hey, I saw
+How dare you go
+What the he
+Hey
+It's
+Hello? Hello?
+It
+Oh no!
+This is the perfect
+Good morning,
+Oh no, there
+It's so
+Yeah
+Uh,
+Hello everyone
+Who turned off
+The weather
+Who'
+Hey, this
+Wait,
+Eww, gross
+Excuse
+It seems like you
+Thank you so
+What happened?
+Oh my g
+I am deeply sad
+I war
+Okay, let'
+Hey, that
+That was a beautiful
+Oh no! That
+What happened
+Hey there
+The artist'
+What?!
+Hey, it'
+I am disappoint
+It seems like
+Oh no! The
+This park is a
+If you
+Yes! I did
+It sounds
+What
+Who is it
+Hmm, that
+That's strange
+Yeah, that was
+That's interesting
+This park
+What the hell
+Who is that
+I feel like my
+Oh well
+What the hell is
+Hello? Hello
+To my dearest
+Bless you!\"
+Thank you for
+Oh, looks like
+Can you please
+This place is
+Eww, what
+Bless you
+Is everything
+Hey, I just
+Whoever left these
+Well, that'
+I feel
+Hey, do you
+It's sad
+Oh no, it
+Hey, that'
+Oh my god,
+Thank you,
+Hello little one,
+I apolog
+Hey team, I
+How dare you read
+Who is this and
+Whoever left
+Hi there! W
+A
+If you have
+I was
+U
+Bless
+Well, this
+Oh, I'
+It's a
+Eww,
+Is everything okay?
+Oh, I
+Hello, can you
+Al
+That was a great
+What are
+I understand that not
+Oh no, not
+Who is it?\"
+Hey, can we
+Whoever is taking
+I would love to
+Hey, I noticed
+Hey, could
+I understand that there
+Hello?
+D
+Oh man, I
+Thank you so much
+Oh no, my
+Dear [Name
+Uh
+I remember
+Hey, who
+Well, it
+Are you
+I understand that it
+Hey, is
+I would
+Who is this
+Excuse me
+Alright
+I am thrilled
+Sometimes friends have
+Who the
+It's interesting
+I would love
+E
+Hello? Is anyone
+Well, this is
+This place
+Well,
+I warned you
+Hey, watch where
+Oh my
+That'
+Sometimes friends have different
+I understand that everyone
+What?
+What do these notes
+I can relate
+I'm not
+I understand
+To my dear
+Guys
+Well
+Hey, I appreciate
+Wow, what
+Dear
+That melody
+Who the hell
+Today is
+Hello little
+Wow, look
+That's great
+Love is never wrong
+I'm having
+Whoa, did
+Ugh
+Can you please provide
+I miss you,
+I feel uncom
+I know
+Ugh, this
+Hey, watch
+Oh great, a
+I didn
+Okay
+That game of char
+Oh
+I appreciate
+Who's there
+I am so
+Oh great, someone
+Hey, could you
+I remember wondering
+Wait, what?
+What do
+Hello? Can
+Hey there,
+That game of
+This is incred
+Oh my gosh
+Oh great, f
+I appreciate your
It sounds like
|
||||||
|
What the heck
|
||||||
|
Okay, I understand
|
||||||
|
Ew
|
||||||
|
I understand that this
|
||||||
|
Uh, hi
|
||||||
|
Hi everyone!
|
||||||
|
What the hell?
|
||||||
|
Thank you for your
|
||||||
|
Oh no, the
|
||||||
|
Wow, I
|
||||||
|
Who turned
|
||||||
|
Dear [
|
||||||
|
Whoever
|
||||||
|
This is a
|
||||||
|
Whoa, he
|
||||||
|
What in the world
|
||||||
|
Although the physical
|
||||||
|
Hello, who is
|
||||||
|
That's amaz
|
||||||
|
Hey, I know
|
||||||
|
Okay, that
|
||||||
|
Hi everyone
|
||||||
|
Hey, is everything
|
||||||
|
I understand your fr
|
||||||
|
Oh no, poor
|
||||||
|
Oh, look
|
||||||
|
Good morning
|
||||||
|
Ew, gross
|
||||||
|
Oh no, did
|
||||||
|
Look at the family
|
||||||
|
Hey team
|
||||||
|
Yes!
|
||||||
|
Hey, can I
|
||||||
|
Okay, that'
|
||||||
|
It's great
|
||||||
|
Love is
|
||||||
|
Hey, what
|
||||||
|
Good morning, world
|
||||||
|
Who is it?
|
||||||
|
That poem really reson
|
||||||
|
I
|
||||||
|
That's
|
||||||
|
I understand the task
|
||||||
|
Gu
|
||||||
|
Hello? Who'
|
||||||
|
This postcard is
|
||||||
|
Whoa,
|
||||||
|
Oh, that
|
||||||
|
I understand that I
|
||||||
|
Whoever is
|
||||||
|
Hello? Who is
|
||||||
|
I'm really
|
||||||
|
Wow, this
|
||||||
|
Can
|
||||||
|
This artwork really
|
||||||
|
This is a shame
|
||||||
|
I miss you too
|
||||||
|
Who are you?
|
||||||
|
Today is a difficult
|
||||||
|
Hey, just
|
||||||
|
Are you okay
|
||||||
|
I am
|
||||||
|
Hi,
|
||||||
|
Wow, that
|
||||||
|
Hey there! Can
|
||||||
|
Okay, stay
|
||||||
|
Oh great, just
|
||||||
|
Yeah,
|
||||||
|
Hello? Can you
|
||||||
|
Oh, looks
|
||||||
|
Thank you for sharing
|
||||||
|
I'm glad
|
||||||
|
Hey, is that
|
||||||
|
Hmm
|
||||||
|
It was my
|
||||||
|
It sounds like you
|
||||||
|
Wow, your
|
||||||
|
I was promised certain
|
||||||
|
That was such a
|
||||||
|
Thank
|
||||||
|
Excuse you
|
||||||
|
That was
|
||||||
|
Hey team,
|
||||||
|
I feel un
|
||||||
|
It was
|
||||||
|
What'
|
||||||
|
Hey friend, I
|
||||||
|
How
|
||||||
|
Saying goodbye
|
||||||
|
That
|
||||||
|
It's heart
|
||||||
|
How dare
|
||||||
|
Oh,
|
||||||
|
Hello, may
|
||||||
|
What's this
|
||||||
|
Thank you for recogn
|
||||||
|
Aww, that
|
||||||
|
Oh, I remember
|
||||||
|
Hmm, that'
|
||||||
|
I miss
|
||||||
|
I know this
|
||||||
|
Wait
|
||||||
|
Is everything okay
|
||||||
|
Who is that person
|
||||||
|
Wow, you
|
||||||
|
Oh great
|
||||||
|
I'm sad
|
||||||
|
Wow, the
|
||||||
|
I am very disappoint
|
||||||
|
Who turned off the
|
||||||
|
I understand that things
|
||||||
|
I'm very
|
||||||
|
Hi
|
||||||
|
That's very
|
||||||
|
Okay, I
|
||||||
|
Oh no,
|
||||||
|
Wow, there
|
||||||
|
What's wrong
|
||||||
|
I apologize for
|
||||||
|
Hey, I
|
||||||
|
Can I help you
|
||||||
|
Oh, I didn
|
||||||
|
Alright,
|
||||||
|
Oh wow,
|
||||||
|
Oh my goodness
|
||||||
|
I know this event
|
||||||
|
What in the
|
||||||
|
Saying
|
||||||
|
Yeah, that
|
||||||
|
Guys, I
|
||||||
|
Hey, this v
|
||||||
|
This post
|
||||||
|
Are
|
||||||
|
Hey, can
|
||||||
|
Hello? Is
|
||||||
|
I can only imagine
|
||||||
|
Oh, that sounds
|
||||||
|
Hey, is anyone
|
||||||
|
I am disappointed
|
||||||
|
Hello,
|
||||||
|
Hey everyone, I
|
||||||
|
That was such
|
||||||
|
It's okay
|
||||||
|
The artist
|
||||||
|
Whoa
|
||||||
|
I understand that mistakes
|
||||||
|
Can I help
|
||||||
|
Who
|
||||||
|
Hi everyone! I
|
||||||
|
Hey, can you
|
||||||
|
Wow, how
|
||||||
|
Today
|
||||||
|
Oh no, I
|
||||||
|
Oh well, I
|
||||||
|
Well, that
|
||||||
|
This is the
|
||||||
|
Yes! I finally
|
||||||
|
Hey there little
|
||||||
|
Hello everyone!
|
||||||
|
Love is never
|
||||||
|
Look at the
|
||||||
|
This postcard
|
||||||
|
Oh great,
|
||||||
|
Can I
|
||||||
|
Hmm, this is
|
||||||
|
I understand your
|
||||||
|
Oh, look at
|
||||||
|
B
|
||||||
|
I'm so
|
||||||
|
Whoa, this
|
||||||
|
W
|
||||||
|
Oh, this
|
||||||
|
Sometimes
|
||||||
|
This piece of
|
||||||
|
What the
|
||||||
|
That was a
|
||||||
|
Hey, do
|
||||||
|
Oh no
|
||||||
|
Whoa, what
|
||||||
|
I feel like I
|
||||||
|
The documentary
|
||||||
|
Hello
|
||||||
|
Hello little one
|
||||||
|
I understand that my
|
||||||
|
Eww, that
|
||||||
|
Wow, an
|
||||||
|
Yes! Finally,
|
||||||
|
Although the physical location
|
||||||
|
Whoever is watching
|
||||||
|
That movie
|
||||||
|
I remember wondering about
|
||||||
|
Hey there, little
|
||||||
|
Who's
|
||||||
|
Hello, who
|
||||||
|
Hello everyone! Thank
|
||||||
|
Hello, can
|
||||||
|
That's too
|
||||||
|
Hey, just wanted
|
||||||
|
Hey there, I
|
||||||
|
Saying good
|
||||||
|
Hey there!
|
||||||
|
Who is there?
|
||||||
|
Oh my good
|
||||||
|
I am very
|
||||||
|
Oh no, what
|
||||||
|
Wow, thank
|
||||||
|
I was promised
|
||||||
|
Hi, is
|
||||||
|
Hey, I'
|
||||||
|
Guys, the
|
||||||
|
Oh no, that
|
||||||
|
Who is there
|
||||||
|
Hello, this
|
||||||
|
That movie really touched
|
||||||
|
If you have something
|
||||||
|
The documentary was
|
||||||
|
I'm starting
|
||||||
|
Are you kidd
|
||||||
|
That movie really
|
||||||
|
Hey everyone,
|
||||||
|
Thank you for considering
|
||||||
|
I didn'
|
||||||
|
Yes! I
|
||||||
|
Can you
|
||||||
|
Oh my god
|
||||||
|
Hey, whoever
|
||||||
|
That melody really
|
||||||
|
Thank you, little
|
||||||
|
Hello, may I
|
||||||
|
Look
|
||||||
|
Wow, we
|
||||||
|
It looks
|
||||||
|
What do these
|
||||||
|
Oh wow
|
||||||
|
I apologize
|
||||||
|
What are you all
|
||||||
|
It's such
|
||||||
|
It's clear
|
||||||
|
Hey, I was
|
||||||
|
Hey friend,
|
||||||
|
I can only
|
||||||
|
The weather outside is
|
||||||
|
Eww, this
|
||||||
|
I miss you
|
||||||
|
Wow
|
||||||
|
Aww,
|
||||||
|
Hi, is there
|
||||||
|
This artwork
|
||||||
|
Okay,
|
||||||
|
Oh well,
|
||||||
|
This
|
||||||
|
I'
|
||||||
|
Say
|
||||||
|
Hey there little gu
|
||||||
|
Hmm,
|
||||||
|
Whoa, who
|
||||||
|
I am thr
|
||||||
|
Oh man
|
||||||
|
Okay, stay calm
|
||||||
|
I'm happy
|
||||||
|
Oh, this cur
|
||||||
|
Oh man,
|
||||||
|
I'm sorry
|
||||||
|
Hello? Who
|
||||||
|
What?! That
|
||||||
|
This piece
|
||||||
|
Hey everyone
|
||||||
|
That's so
|
||||||
|
Are you okay?
|
||||||
|
What happened? Where
|
||||||
|
Hi there
|
||||||
|
The
|
||||||
|
Who the hell entered
|
||||||
|
I can
|
||||||
|
Guys,
|
||||||
|
What's
|
||||||
|
What in
|
||||||
|
It's important
|
||||||
|
I'm
|
||||||
|
I'm coming
|
||||||
|
It'
|
||||||
|
Yes! Finally
|
||||||
|
Wait, what
|
||||||
|
Wow, reading
|
||||||
|
I'm surprised
|
||||||
|
Hey, did
|
||||||
|
Hey,
|
||||||
|
Okay, let
|
||||||
|
I understand that you
|
||||||
|
Who the hell threw
|
||||||
|
Eww, who
|
||||||
|
Thank you for thinking
|
||||||
|
Who is this?\"
|
||||||
|
I am deeply
|
||||||
|
Thank you for including
|
||||||
|
Oh no, an
|
||||||
|
It looks like you
|
||||||
|
Aww
|
||||||
|
I'm confused
|
||||||
|
Wow, it
|
||||||
|
That poem really
|
||||||
|
Yes
|
||||||
|
Hey there, is
|
||||||
|
Hey, what'
|
||||||
|
Thank you for remember
|
||||||
|
To
|
||||||
|
This is
|
||||||
|
Thank you for making
|
||||||
|
I can'
|
||||||
|
That mel
|
||||||
|
Wow, they
|
||||||
|
I feel like
|
||||||
|
Although the
|
||||||
|
Who are you
|
||||||
|
Love
|
||||||
|
If
|
||||||
|
What the hell are
|
||||||
|
I am so sad
|
||||||
|
Oh, I found
|
||||||
|
Thank you
|
||||||
|
It looks like
|
||||||
|
Well, life is
|
||||||
|
I appreciate that
|
||||||
|
The artist's
|
||||||
|
Whoa, that
|
||||||
|
It's never
|
499
examples/cvector-generator/cvector-generator.cpp
Normal file
|
@ -0,0 +1,499 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "pca.hpp"
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
#include "ggml-cuda.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
#include "ggml-metal.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <string>
|
||||||
|
#include <tuple>
|
||||||
|
#include <vector>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
|
||||||
|
#include <climits>
#include <cstdlib>
#include <cstring>
#include <sstream>
|
||||||
|
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
// utils
|
||||||
|
|
||||||
|
template <class Iter>
|
||||||
|
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
|
||||||
|
std::string ret;
|
||||||
|
for (; begin != end; ++begin) {
|
||||||
|
ret += llama_token_to_piece(ctx, *begin);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
||||||
|
gpt_params_print_usage(argc, argv, params);
|
||||||
|
|
||||||
|
printf("\nexample usage:\n");
|
||||||
|
printf("\n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
|
||||||
|
printf("\n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
|
||||||
|
printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", argv[0]);
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
|
||||||
|
|
||||||
|
// cb_eval is reused for each positive/negative prompt pair
|
||||||
|
struct callback_data {
|
||||||
|
ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered
|
||||||
|
|
||||||
|
int n_layers = 0;
|
||||||
|
int n_tokens = 0;
|
||||||
|
bool is_eval_pos = true;
|
||||||
|
|
||||||
|
// each element of the vector corresponds to one layer
|
||||||
|
std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
|
||||||
|
std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]
|
||||||
|
std::vector<struct ggml_tensor *> v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows may be different for each layer
|
||||||
|
|
||||||
|
// save a tensor into either v_pos or v_neg (decided by is_eval_pos)
|
||||||
|
void save_tensor_for_layer(struct ggml_tensor * t) {
|
||||||
|
GGML_ASSERT(t->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
|
if (ctx_ggml == nullptr) {
|
||||||
|
// alloc a new ctx_ggml if needed
|
||||||
|
struct ggml_init_params params_ggml = {
|
||||||
|
/*.mem_size =*/ ggml_tensor_overhead() * n_layers * 3u,
|
||||||
|
/*.mem_buffer =*/ NULL,
|
||||||
|
/*.no_alloc =*/ true,
|
||||||
|
};
|
||||||
|
ctx_ggml = ggml_init(params_ggml);
|
||||||
|
}
|
||||||
|
|
||||||
|
// copy tensor data
|
||||||
|
auto n_bytes = ggml_nbytes(t);
|
||||||
|
struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
|
||||||
|
t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
|
||||||
|
ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
|
||||||
|
ggml_set_name(t_layer, ggml_get_name(t));
|
||||||
|
//print_debug_tensor(t_layer);
|
||||||
|
|
||||||
|
if (is_eval_pos) {
|
||||||
|
v_pos.push_back(t_layer);
|
||||||
|
} else {
|
||||||
|
v_neg.push_back(t_layer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// calculate diff (v_pos - v_neg) and place the result back to v_pos
|
||||||
|
// all zero rows in the diff tensor will also be removed
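// (a row of the diff is all-zero exactly when the positive and negative activations
//  for that token position coincide, so it carries no directional information)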
|
||||||
|
// NOTE: the final layer is ignored. we only have (n_layers - 1) layers to process
|
||||||
|
std::vector<struct ggml_tensor *> calc_diff() {
|
||||||
|
for (size_t il = 0; il < v_pos.size(); il++) {
|
||||||
|
float * a = (float *) v_pos[il]->data;
|
||||||
|
float * b = (float *) v_neg[il]->data;
|
||||||
|
size_t n_elem = ggml_nelements(v_pos[il]);
|
||||||
|
for (size_t j = 0; j < n_elem; j++) {
|
||||||
|
a[j] -= b[j];
|
||||||
|
}
|
||||||
|
//print_debug_tensor(v_pos[i]);
|
||||||
|
auto diff_filtered = filter_nonzero_rows(v_pos[il]);
|
||||||
|
v_diff_filtered.push_back(diff_filtered);
|
||||||
|
}
|
||||||
|
return v_diff_filtered; // for convenience, return the result as a std::vector
|
||||||
|
}
|
||||||
|
|
||||||
|
// delete zero rows from a given 2D tensor
|
||||||
|
struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
|
||||||
|
//printf("filter_nonzero_rows\n");
|
||||||
|
auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
|
||||||
|
// check if the given row contains only zero elements
|
||||||
|
int n_cols = t->ne[0]; // hint: should be equal to n_embd
|
||||||
|
for (int col = 0; col < n_cols; ++col) {
|
||||||
|
if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
std::vector<int> rows_to_copy; // indices of the non-zero rows (to be copied into rows of diff_filtered)
|
||||||
|
for (int i_row = 0; i_row < a->ne[1]; i_row++) {
|
||||||
|
if (!is_row_all_zeros(a, i_row, 1e-6)) {
|
||||||
|
rows_to_copy.push_back(i_row);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// get "n_nonzero_rows" for the output "diff_filtered"
|
||||||
|
int n_nonzero_rows = rows_to_copy.size();
|
||||||
|
//printf("n_nonzero_rows: %d\n", n_nonzero_rows);
|
||||||
|
int n_embd = a->ne[0];
|
||||||
|
GGML_ASSERT(n_nonzero_rows > 0);
|
||||||
|
|
||||||
|
// diff_filtered: [n_embd, n_nonzero_rows]
|
||||||
|
struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
|
||||||
|
ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
|
||||||
|
ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
|
||||||
|
diff_filtered->data = malloc(ggml_nbytes(diff_filtered));
|
||||||
|
|
||||||
|
// copy non-zero rows
|
||||||
|
for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
|
||||||
|
int src_row = rows_to_copy[dest_row];
|
||||||
|
for (int i = 0; i < n_embd; i++) {
|
||||||
|
float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
|
||||||
|
ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//print_debug_tensor(diff_filtered);
|
||||||
|
|
||||||
|
return diff_filtered;
|
||||||
|
}
|
||||||
|
|
||||||
|
// we don't implement a destructor because we want to reuse callback_data; we just free the tensors
|
||||||
|
void reset() {
|
||||||
|
for (auto ptr : v_pos) free(ptr->data);
|
||||||
|
for (auto ptr : v_neg) free(ptr->data);
|
||||||
|
for (auto ptr : v_diff_filtered) free(ptr->data);
|
||||||
|
v_pos.clear();
|
||||||
|
v_neg.clear();
|
||||||
|
v_diff_filtered.clear();
|
||||||
|
if (ctx_ggml) {
|
||||||
|
ggml_free(ctx_ggml);
|
||||||
|
}
|
||||||
|
ctx_ggml = nullptr;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* process_ctx is used to store the ggml context for pre-post processing the diff vectors
|
||||||
|
* in short, input => v_diff and output => v_final
|
||||||
|
*/
|
||||||
|
struct train_context {
|
||||||
|
ggml_context * ctx_ggml;
|
||||||
|
int n_embd;
|
||||||
|
int n_layers;
|
||||||
|
|
||||||
|
/* pair of prompts to be used for generating final vector */
|
||||||
|
std::vector<std::string> positive_entries;
|
||||||
|
std::vector<std::string> negative_entries;
|
||||||
|
|
||||||
|
// each element of the vector corresponds to one layer
|
||||||
|
// NOTE: the last layer is discarded. therefore, we will have (n_layers - 1) elements here
|
||||||
|
// NOTE (2): v_diff is transposed from v_diff_tmp
|
||||||
|
std::vector<struct ggml_tensor *> v_diff; // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
|
||||||
|
std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file
|
||||||
|
|
||||||
|
// to easily re-alloc when concatenating v_diff, we temporarily store v_diff in a vector instead of a tensor
|
||||||
|
// v_diff_tmp will get converted into v_diff later on
|
||||||
|
std::vector<std::vector<uint8_t>> v_diff_tmp;
|
||||||
|
|
||||||
|
train_context(int n_embd_, int n_layers_) {
|
||||||
|
n_embd = n_embd_;
|
||||||
|
n_layers = n_layers_;
|
||||||
|
struct ggml_init_params params_ggml = {
|
||||||
|
/*.mem_size =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
|
||||||
|
/*.mem_buffer =*/ NULL,
|
||||||
|
/*.no_alloc =*/ true,
|
||||||
|
};
|
||||||
|
ctx_ggml = ggml_init(params_ggml);
|
||||||
|
for (int il = 0; il < n_layers - 1; il++) {
|
||||||
|
std::vector<uint8_t> empty;
|
||||||
|
v_diff_tmp.push_back(empty);
|
||||||
|
auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
|
||||||
|
t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
|
||||||
|
v_final.push_back(t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// add new rows into existing tensor in v_diff_tmp
|
||||||
|
void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
|
||||||
|
GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
|
||||||
|
for (int il = 0; il < n_layers - 1; il++) {
|
||||||
|
auto t = diff_filtered[il];
|
||||||
|
auto & diff_tmp = v_diff_tmp[il];
|
||||||
|
size_t curr_size = diff_tmp.size();
|
||||||
|
diff_tmp.resize(curr_size + ggml_nbytes(t));
|
||||||
|
memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// build the v_diff tensors from v_diff_tmp (v_diff needs to be transposed)
|
||||||
|
// TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
|
||||||
|
void build_v_diff() {
|
||||||
|
printf("build_v_diff\n");
|
||||||
|
for (int il = 0; il < n_layers - 1; il++) {
|
||||||
|
auto & diff_tmp = v_diff_tmp[il];
|
||||||
|
int n_elem = diff_tmp.size() / sizeof(float);
|
||||||
|
GGML_ASSERT(n_elem % n_embd == 0);
|
||||||
|
int n_rows = n_elem / n_embd;
|
||||||
|
struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
|
||||||
|
ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
|
||||||
|
// copy data & transpose
|
||||||
|
diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
|
||||||
|
float * arr = (float *) diff_tmp.data();
|
||||||
|
for (int ir = 0; ir < n_rows; ++ir) {
|
||||||
|
for (int ic = 0; ic < n_embd; ++ic) {
|
||||||
|
float f = arr[ir*n_embd + ic];
|
||||||
|
ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
v_diff.push_back(diff);
|
||||||
|
print_debug_tensor(diff);
|
||||||
|
// free memory of diff_tmp
|
||||||
|
diff_tmp.resize(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
~train_context() {
|
||||||
|
for (auto ptr : v_final) free(ptr->data);
|
||||||
|
for (auto ptr : v_diff) free(ptr->data);
|
||||||
|
// no need to free v_diff_tmp, since we didn't use malloc
|
||||||
|
ggml_free(ctx_ggml);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
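// NOTE: the positive and negative prompts are padded to the same length so that their
// hidden states can be compared position-by-position when computing the diff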
struct tokenized_prompt {
|
||||||
|
std::vector<llama_token> tokens_pos;
|
||||||
|
std::vector<llama_token> tokens_neg;
|
||||||
|
size_t max_seq_len;
|
||||||
|
|
||||||
|
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
|
||||||
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
|
tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
|
||||||
|
tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
|
||||||
|
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
|
||||||
|
padding_seq(ctx, tokens_pos, max_seq_len);
|
||||||
|
padding_seq(ctx, tokens_neg, max_seq_len);
|
||||||
|
}
|
||||||
|
|
||||||
|
void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
|
||||||
|
// TODO: customize padding token
|
||||||
|
std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
|
||||||
|
llama_token pad_tok = pad_tokens.back();
|
||||||
|
while (tokens.size() < len) {
|
||||||
|
tokens.push_back(pad_tok);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
static std::string to_string(const T & val) {
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << val;
|
||||||
|
return ss.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
|
||||||
|
std::vector<std::string> output;
|
||||||
|
std::ifstream file(path);
|
||||||
|
if (!file.is_open()) {
|
||||||
|
fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
std::string line;
|
||||||
|
while (std::getline(file, line)) {
|
||||||
|
bool is_skip = skip_empty_lines && line.empty();
|
||||||
|
if (!is_skip) {
|
||||||
|
string_process_escapes(line);
|
||||||
|
output.push_back(line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
file.close();
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////
|
||||||
|
|
||||||
|
static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||||
|
auto * cb_data = (callback_data *) user_data;
|
||||||
|
static const char * l_out_name = "l_out";
|
||||||
|
const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
|
||||||
|
|
||||||
|
if (ask) {
|
||||||
|
return is_l_out;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// save the tensor to current context
|
||||||
|
cb_data->save_tensor_for_layer(t);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
|
||||||
|
llama_kv_cache_clear(ctx);
|
||||||
|
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
|
||||||
|
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
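// the exported gguf stores: general.architecture = "controlvector",
// controlvector.model_hint = <base model arch>, controlvector.layer_count,
// and one f32 tensor "direction.<layer>" of size [n_embd] per layer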
static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
|
||||||
|
struct gguf_context * ctx = gguf_init_empty();
|
||||||
|
|
||||||
|
const std::string arch = "controlvector";
|
||||||
|
gguf_set_val_str(ctx, "general.architecture", arch.c_str());
|
||||||
|
gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
|
||||||
|
gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());
|
||||||
|
|
||||||
|
for (size_t i = 0; i < v_ctrl.size(); ++i) {
|
||||||
|
gguf_add_tensor(ctx, v_ctrl[i]);
|
||||||
|
print_debug_tensor(v_ctrl[i]);
|
||||||
|
printf("Added tensor: %s\n", v_ctrl[i]->name);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s: writing file...\n", __func__);
|
||||||
|
gguf_write_to_file(ctx, fname.c_str(), false);
|
||||||
|
printf("%s: wrote file '%s'\n", __func__, fname.c_str());
|
||||||
|
gguf_free(ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Load prompt files and completion file.
|
||||||
|
* Then format each pair of prompt + completion to make an entry.
|
||||||
|
*/
|
||||||
|
static int prepare_entries(gpt_params & params, train_context & ctx_train) {
|
||||||
|
// load prompts
|
||||||
|
std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
|
||||||
|
std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
|
||||||
|
if (positive_prompts.size() != negative_prompts.size()) {
|
||||||
|
fprintf(stderr, "number of positive and negative prompts must be equal\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (positive_prompts.empty()) {
|
||||||
|
fprintf(stderr, "must provide at least one prompt pair\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// create templated prompts
|
||||||
|
std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
|
||||||
|
auto format_template = [](std::string persona, std::string suffix) {
|
||||||
|
// entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]"
|
||||||
|
return persona + " " + suffix;
|
||||||
|
};
|
||||||
|
for (size_t i = 0; i < positive_prompts.size(); ++i) {
|
||||||
|
for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
|
||||||
|
// TODO replicate the truncations done by the python implementation
|
||||||
|
ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
|
||||||
|
ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
gpt_params params;
|
||||||
|
|
||||||
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
|
print_usage(argc, argv, params);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.n_pca_iterations % params.n_pca_batch != 0) {
|
||||||
|
fprintf(stderr, "PCA iterations must be a multiple of PCA batch size\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
callback_data cb_data;
|
||||||
|
|
||||||
|
// pass the callback to the backend scheduler
|
||||||
|
// it will be executed for each node during the graph computation
|
||||||
|
params.cb_eval = cb_eval;
|
||||||
|
params.cb_eval_user_data = &cb_data;
|
||||||
|
params.warmup = false;
|
||||||
|
|
||||||
|
print_build_info();
|
||||||
|
llama_backend_init();
|
||||||
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
|
// load the model to get hparams
|
||||||
|
llama_model * model;
|
||||||
|
llama_context * ctx;
|
||||||
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
|
|
||||||
|
// int n_ctx = llama_n_ctx(ctx);
|
||||||
|
int n_layers = llama_n_layer(model);
|
||||||
|
int n_embd = llama_n_embd(model);
|
||||||
|
// get model hint param (a.k.a model arch name)
|
||||||
|
char model_hint[128];
|
||||||
|
llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
|
||||||
|
|
||||||
|
// init train_context
|
||||||
|
train_context ctx_train(n_embd, n_layers);
|
||||||
|
|
||||||
|
// load and prepare entries for training
|
||||||
|
prepare_entries(params, ctx_train);
|
||||||
|
|
||||||
|
// we have to pretokenize everything because otherwise we don't know how much overhead to allocate for ctx_diffs_wrapped
|
||||||
|
std::vector<tokenized_prompt> tokenized_prompts;
|
||||||
|
size_t n_total_tokens = 0;
|
||||||
|
for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
|
||||||
|
tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
|
||||||
|
n_total_tokens += 2 * t.max_seq_len;
|
||||||
|
tokenized_prompts.push_back(std::move(t));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << "n_total_tokens: " << n_total_tokens << std::endl;
|
||||||
|
|
||||||
|
for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
|
||||||
|
bool success = false;
|
||||||
|
tokenized_prompt t = tokenized_prompts[i];
|
||||||
|
cb_data.n_layers = n_layers;
|
||||||
|
cb_data.n_tokens = t.max_seq_len;
|
||||||
|
|
||||||
|
printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
|
||||||
|
(int) i+1, (int) ctx_train.positive_entries.size(),
|
||||||
|
tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
|
||||||
|
tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
|
||||||
|
(int) t.max_seq_len);
|
||||||
|
|
||||||
|
cb_data.is_eval_pos = true;
|
||||||
|
success = get_hidden_layers(ctx, t.tokens_pos);
|
||||||
|
if (!success) break;
|
||||||
|
|
||||||
|
cb_data.is_eval_pos = false;
|
||||||
|
success = get_hidden_layers(ctx, t.tokens_neg);
|
||||||
|
if (!success) break;
|
||||||
|
|
||||||
|
// calculate diff and remove all zero rows
|
||||||
|
auto v_diff_filtered = cb_data.calc_diff();
|
||||||
|
|
||||||
|
// save & concat the filtered v_diff to ctx_train
|
||||||
|
ctx_train.concat_diff_tmp(v_diff_filtered);
|
||||||
|
|
||||||
|
// reset for next iteration
|
||||||
|
cb_data.reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
// done with the model, we can now free it to reclaim some memory
|
||||||
|
printf("Done evaluating prompts, unloading model...\n");
|
||||||
|
llama_free(ctx);
|
||||||
|
llama_free_model(model);
|
||||||
|
|
||||||
|
// prepare ctx_train for PCA
|
||||||
|
ctx_train.build_v_diff();
|
||||||
|
|
||||||
|
// run PCA
|
||||||
|
PCA::pca_params pca_params;
|
||||||
|
pca_params.n_threads = params.n_threads;
|
||||||
|
pca_params.n_batch = params.n_pca_batch;
|
||||||
|
pca_params.n_iterations = params.n_pca_iterations;
|
||||||
|
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
|
||||||
|
|
||||||
|
// write output vectors to gguf
|
||||||
|
export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
|
||||||
|
|
||||||
|
llama_backend_free();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
1
examples/cvector-generator/negative.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
[INST] Act like a person who is extremely sad. [/INST]
|
322
examples/cvector-generator/pca.hpp
Normal file
|
@ -0,0 +1,322 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
#include "ggml-cuda.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
#include "ggml-metal.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <ctime>
|
||||||
|
#include <string>
|
||||||
|
#include <tuple>
|
||||||
|
#include <vector>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
#include <cmath>
#include <random>
|
||||||
|
|
||||||
|
#define DEBUG_POS 5
|
||||||
|
|
||||||
|
static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
|
||||||
|
printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]);
|
||||||
|
if (!with_data) return;
|
||||||
|
printf("%s: %s[0] = [", __func__, t->name);
|
||||||
|
for (size_t i = 0; i <= DEBUG_POS; i++) {
|
||||||
|
printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
|
||||||
|
}
|
||||||
|
printf(" ... ]\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace PCA {
|
||||||
|
|
||||||
|
// input params for PCA computations
|
||||||
|
struct pca_params {
|
||||||
|
int n_threads = 1;
|
||||||
|
int n_batch = 20; // number of iterations to do in one batch. the larger the batch, the more memory is used
|
||||||
|
int n_iterations = 1000;
|
||||||
|
float tolerance = 1e-7;
|
||||||
|
|
||||||
|
// for debugging
|
||||||
|
int i_layer = 0;
|
||||||
|
int n_layers = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
// result from each iteration
|
||||||
|
struct pca_result {
|
||||||
|
struct ggml_tensor * calculated_square = NULL;
|
||||||
|
std::vector<struct ggml_tensor *> eigenvectors;
|
||||||
|
std::vector<float> distances;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct pca_model {
|
||||||
|
ggml_backend_t backend = NULL;
|
||||||
|
ggml_backend_buffer_t buffer;
|
||||||
|
struct ggml_context * ctx; // context to compute graph on target device
|
||||||
|
struct ggml_context * ctx_host; // host context to store results
|
||||||
|
|
||||||
|
// tensors on target device
|
||||||
|
struct ggml_tensor * dev_input;
|
||||||
|
struct ggml_tensor * dev_square;
|
||||||
|
struct ggml_tensor * dev_eigenvector;
|
||||||
|
|
||||||
|
pca_model(struct ggml_tensor * t_input) {
|
||||||
|
#ifdef GGML_USE_CUDA
|
||||||
|
fprintf(stderr, "%s: using CUDA backend\n", __func__);
|
||||||
|
backend = ggml_backend_cuda_init(0); // init device 0
|
||||||
|
if (!backend) {
|
||||||
|
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// TODO: enable Metal support when support for GGML_OP_SQRT is added
|
||||||
|
// #ifdef GGML_USE_METAL
|
||||||
|
// fprintf(stderr, "%s: using Metal backend\n", __func__);
|
||||||
|
// backend = ggml_backend_metal_init();
|
||||||
|
// if (!backend) {
|
||||||
|
// fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
|
||||||
|
// }
|
||||||
|
// #endif
|
||||||
|
|
||||||
|
// if there is no GPU backend, fall back to the CPU backend
|
||||||
|
if (!backend) {
|
||||||
|
backend = ggml_backend_cpu_init();
|
||||||
|
}
|
||||||
|
|
||||||
|
const int num_tensors = 4;
|
||||||
|
struct ggml_init_params params {
|
||||||
|
/*.mem_size =*/ ggml_tensor_overhead() * num_tensors,
|
||||||
|
/*.mem_buffer =*/ NULL,
|
||||||
|
/*.no_alloc =*/ true,
|
||||||
|
};
|
||||||
|
ctx = ggml_init(params);
|
||||||
|
|
||||||
|
auto n_samples = t_input->ne[0];
|
||||||
|
auto n_embd = t_input->ne[1];
|
||||||
|
|
||||||
|
dev_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
|
||||||
|
dev_square = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
|
||||||
|
dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
|
||||||
|
ggml_set_name(dev_input, "dev_input");
|
||||||
|
ggml_set_name(dev_square, "dev_square");
|
||||||
|
ggml_set_name(dev_eigenvector, "dev_eigenvector");
|
||||||
|
buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
|
||||||
|
ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));
|
||||||
|
|
||||||
|
// initialize eigenvector to random normalized vector
|
||||||
|
{
|
||||||
|
std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
|
||||||
|
std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
|
||||||
|
std::uniform_real_distribution<float> distribution(0.0, 1.0);
|
||||||
|
float sum_sqr = 0.0; // for normalizing random_vec
|
||||||
|
for (size_t i = 0; i < random_vec.size(); ++i) {
|
||||||
|
float f = distribution(generator);
|
||||||
|
sum_sqr += f * f;
|
||||||
|
random_vec[i] = f;
|
||||||
|
}
|
||||||
|
// normalize it
|
||||||
|
float random_vec_norm = std::sqrt(sum_sqr);
|
||||||
|
for (size_t i = 0; i < random_vec.size(); ++i) {
|
||||||
|
random_vec[i] /= random_vec_norm;
|
||||||
|
}
|
||||||
|
ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
~pca_model() {
|
||||||
|
ggml_free(ctx);
|
||||||
|
ggml_backend_buffer_free(buffer);
|
||||||
|
ggml_backend_free(backend);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct ggml_cgraph * build_graph_piter(
|
||||||
|
const struct pca_params & params,
|
||||||
|
const pca_model & model,
|
||||||
|
bool calc_square = false) {
|
||||||
|
GGML_ASSERT(params.n_batch > 0);
|
||||||
|
// TODO: buf_size must be able to scale with params.n_batch
|
||||||
|
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
|
||||||
|
static std::vector<uint8_t> buf(buf_size);
|
||||||
|
|
||||||
|
struct ggml_init_params params0 = {
|
||||||
|
/*.mem_size =*/ buf_size,
|
||||||
|
/*.mem_buffer =*/ buf.data(),
|
||||||
|
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
|
||||||
|
};
|
||||||
|
// create a temporary context to build the graph
|
||||||
|
struct ggml_context * ctx0 = ggml_init(params0);
|
||||||
|
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||||
|
|
||||||
|
// turn v_diff_original into square matrix if needed
|
||||||
|
struct ggml_tensor * tmp_square;
|
||||||
|
if (calc_square) {
|
||||||
|
tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
|
||||||
|
ggml_set_name(tmp_square, "tmp_square");
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * b_tensor;
|
||||||
|
struct ggml_tensor * distance;
|
||||||
|
struct ggml_tensor * old_eigen = model.dev_eigenvector;
|
||||||
|
struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;
|
||||||
|
|
||||||
|
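// power iteration: each batch step below computes b = square * v and renormalizes it;
// v converges to the dominant eigenvector of `square` (the pairwise dot products of
// embedding dimensions across all samples), i.e. the leading PCA direction of the diffs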
for (int i = 0; i < params.n_batch; ++i) {
|
||||||
|
// b_tensor = square * eigenvector^T
|
||||||
|
b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
|
||||||
|
ggml_set_name(b_tensor, "b_tensor");
|
||||||
|
|
||||||
|
// normalize
|
||||||
|
b_tensor = ggml_div_inplace(ctx0,
|
||||||
|
b_tensor,
|
||||||
|
ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
|
||||||
|
);
|
||||||
|
ggml_format_name(b_tensor, "b_tensor_norm_%d", i);
|
||||||
|
|
||||||
|
// calculate distance(new eigenvector - old eigenvector)
|
||||||
|
// we don't use ggml_sub because it may not be implemented on GPU backend
|
||||||
|
struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1));
|
||||||
|
distance = ggml_sqrt_inplace(ctx0,
|
||||||
|
ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
|
||||||
|
ggml_format_name(distance, "distance_%d", i);
|
||||||
|
|
||||||
|
old_eigen = b_tensor;
|
||||||
|
|
||||||
|
// build operations nodes
|
||||||
|
ggml_build_forward_expand(gf, distance);
|
||||||
|
}
|
||||||
|
|
||||||
|
// delete the temporary context used to build the graph
|
||||||
|
ggml_free(ctx0);
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
static ggml_status compute_piter(
|
||||||
|
const struct pca_params & params,
|
||||||
|
const pca_model & model,
|
||||||
|
struct ggml_cgraph * gf,
|
||||||
|
ggml_gallocr_t allocr,
|
||||||
|
struct pca_result & result) {
|
||||||
|
// allocate tensors
|
||||||
|
ggml_gallocr_alloc_graph(allocr, gf);
|
||||||
|
|
||||||
|
if (ggml_backend_is_cpu(model.backend)) {
|
||||||
|
ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: enable GPU support when support for GGML_OP_SQRT is added
|
||||||
|
//#ifdef GGML_USE_METAL
|
||||||
|
// if (ggml_backend_is_metal(model.backend)) {
|
||||||
|
// ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
|
||||||
|
// }
|
||||||
|
//#endif
|
||||||
|
|
||||||
|
ggml_status res = ggml_backend_graph_compute(model.backend, gf);
|
||||||
|
if (res == GGML_STATUS_SUCCESS) {
|
||||||
|
auto extract_i = [](std::string prefix, std::string str) -> int {
|
||||||
|
int i = -1;
|
||||||
|
if (str.rfind(prefix, 0) == 0) {
|
||||||
|
sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
|
||||||
|
}
|
||||||
|
return i;
|
||||||
|
};
|
||||||
|
result.calculated_square = NULL;
|
||||||
|
result.eigenvectors.clear();
|
||||||
|
result.distances.clear();
|
||||||
|
result.eigenvectors.resize(params.n_batch);
|
||||||
|
result.distances.resize(params.n_batch);
|
||||||
|
// get output nodes
|
||||||
|
for (int i = 0; i < gf->n_nodes; ++i) {
|
||||||
|
auto node = gf->nodes[i];
|
||||||
|
int iter = -1;
|
||||||
|
// find b_tensor (without copying data from device)
|
||||||
|
if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
|
||||||
|
result.eigenvectors[iter] = node;
|
||||||
|
}
|
||||||
|
// find distances, then copy data from device
|
||||||
|
if ((iter = extract_i("distance_", node->name)) > -1) {
|
||||||
|
float d;
|
||||||
|
ggml_backend_tensor_get(node, &d, 0, sizeof(float));
|
||||||
|
result.distances[iter] = d;
|
||||||
|
// std::cout << node->name << " = " << d << "\n";
|
||||||
|
}
|
||||||
|
// find tmp_square if it exists (without copying data from device)
|
||||||
|
if (std::string(node->name) == "tmp_square") {
|
||||||
|
result.calculated_square = node;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void power_iteration(
|
||||||
|
const struct pca_params & params,
|
||||||
|
struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
|
||||||
|
struct ggml_tensor * output) {
|
||||||
|
//printf("in power iteration\n");
|
||||||
|
struct pca_model model(input);
|
||||||
|
|
||||||
|
ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
|
||||||
|
struct pca_result result;
|
||||||
|
struct ggml_tensor * last_eigenvector = NULL;
|
||||||
|
|
||||||
|
int n_iters = params.n_iterations / params.n_batch; // larger batch size, fewer outer iterations
|
||||||
|
for (int iter = 0; iter < n_iters; ++iter) {
|
||||||
|
bool calc_square = (iter == 0); // only need to calculate square for first iteration
|
||||||
|
struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
|
||||||
|
// ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
|
||||||
|
compute_piter(params, model, gf, allocr, result);
|
||||||
|
|
||||||
|
for (size_t k = 0; k < result.distances.size(); ++k) {
|
||||||
|
last_eigenvector = result.eigenvectors[k];
|
||||||
|
if (result.distances[k] < params.tolerance) {
|
||||||
|
break; // done
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (calc_square) {
|
||||||
|
// copy and store the square matrix if needed
|
||||||
|
GGML_ASSERT(result.calculated_square != NULL);
|
||||||
|
ggml_backend_tensor_copy(result.calculated_square, model.dev_square);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// copy the last eigenvector and store it as input for the next iteration
|
||||||
|
GGML_ASSERT(last_eigenvector != NULL);
|
||||||
|
ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
|
||||||
|
__func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
|
||||||
|
}
|
||||||
|
|
||||||
|
// get output tensor
|
||||||
|
GGML_ASSERT(last_eigenvector);
|
||||||
|
ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
|
||||||
|
//print_debug_tensor(output);
|
||||||
|
ggml_gallocr_free(allocr);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void run_pca(
|
||||||
|
struct pca_params & params,
|
||||||
|
const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_samples, n_embd]
|
||||||
|
const std::vector<struct ggml_tensor *> & v_output) {
|
||||||
|
printf("%s: Running PCA...\n", __func__);
|
||||||
|
for (size_t il = 0; il < v_input.size(); ++il) {
|
||||||
|
|
||||||
|
// prepare output vector
|
||||||
|
struct ggml_tensor * ctrl_out = v_output[il];
|
||||||
|
ggml_format_name(ctrl_out, "direction.%ld", il+1);
|
||||||
|
|
||||||
|
// run power_iteration
|
||||||
|
params.i_layer = il;
|
||||||
|
params.n_layers = v_input.size();
|
||||||
|
power_iteration(params, v_input[il], ctrl_out);
|
||||||
|
printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
1
examples/cvector-generator/positive.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
[INST] Act like a person who is extremely happy. [/INST]
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET embedding)
|
set(TARGET llama-embedding)
|
||||||
add_executable(${TARGET} embedding.cpp)
|
add_executable(${TARGET} embedding.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor
|
||||||
### Unix-based systems (Linux, macOS, etc.):
|
### Unix-based systems (Linux, macOS, etc.):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
|
./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
|
||||||
```
|
```
|
||||||
|
|
||||||
### Windows:
|
### Windows:
|
||||||
|
|
||||||
```powershell
|
```powershell
|
||||||
embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
|
llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
|
||||||
```
|
```
|
||||||
|
|
||||||
The above command will output space-separated float values.
|
The above command will output space-separated float values.
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
set(TARGET eval-callback)
|
set(TARGET llama-eval-callback)
|
||||||
add_executable(${TARGET} eval-callback.cpp)
|
add_executable(${TARGET} eval-callback.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
|
||||||
set(TEST_TARGET test-eval-callback)
|
set(TEST_TARGET test-eval-callback)
|
||||||
add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
||||||
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
|
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
|
||||||
|
|
|
@ -6,7 +6,7 @@ It simply prints to the console all operations and tensor data.
|
||||||
Usage:
|
Usage:
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
eval-callback \
|
llama-eval-callback \
|
||||||
--hf-repo ggml-org/models \
|
--hf-repo ggml-org/models \
|
||||||
--hf-file phi-2/ggml-model-q4_0.gguf \
|
--hf-file phi-2/ggml-model-q4_0.gguf \
|
||||||
--model phi-2-q4_0.gguf \
|
--model phi-2-q4_0.gguf \
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET export-lora)
|
set(TARGET llama-export-lora)
|
||||||
add_executable(${TARGET} export-lora.cpp)
|
add_executable(${TARGET} export-lora.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
Apply LORA adapters to base model and export the resulting model.
|
Apply LORA adapters to base model and export the resulting model.
|
||||||
|
|
||||||
```
|
```
|
||||||
usage: export-lora [options]
|
usage: llama-export-lora [options]
|
||||||
|
|
||||||
options:
|
options:
|
||||||
-h, --help show this help message and exit
|
-h, --help show this help message and exit
|
||||||
|
@ -17,7 +17,7 @@ options:
|
||||||
For example:
|
For example:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./bin/export-lora \
|
./bin/llama-export-lora \
|
||||||
-m open-llama-3b-v2-q8_0.gguf \
|
-m open-llama-3b-v2-q8_0.gguf \
|
||||||
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
|
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
|
||||||
-l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
|
-l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
set(TARGET finetune)
|
set(TARGET llama-finetune)
|
||||||
add_executable(${TARGET} finetune.cpp)
|
add_executable(${TARGET} finetune.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
|
@ -7,7 +7,7 @@ Basic usage instructions:
|
||||||
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
|
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
|
||||||
|
|
||||||
# finetune LORA adapter
|
# finetune LORA adapter
|
||||||
./bin/finetune \
|
./bin/llama-finetune \
|
||||||
--model-base open-llama-3b-v2-q8_0.gguf \
|
--model-base open-llama-3b-v2-q8_0.gguf \
|
||||||
--checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
|
--checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
|
||||||
--checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
|
--checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
|
||||||
|
@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
|
||||||
--use-checkpointing
|
--use-checkpointing
|
||||||
|
|
||||||
# predict
|
# predict
|
||||||
./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
||||||
```
|
```
|
||||||
|
|
||||||
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
|
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
|
||||||
|
@ -38,14 +38,14 @@ After 10 more iterations:
|
||||||
Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
|
Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
|
||||||
|
|
||||||
llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
|
llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
|
||||||
These LORA adapters can then be used by `main` together with the base model, like in the 'predict' example command above.
|
These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
|
||||||
|
|
||||||
In `main` you can also load multiple LORA adapters, which will then be mixed together.
|
In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
|
||||||
|
|
||||||
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
|
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./bin/main -m open-llama-3b-v2-q8_0.gguf \
|
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
|
||||||
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
|
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
|
||||||
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
|
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
|
||||||
```
|
```
|
||||||
|
@ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin
|
||||||
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
|
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./bin/main -m open-llama-3b-v2-q8_0.gguf \
|
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
|
||||||
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
|
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
|
||||||
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
|
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
|
||||||
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
|
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
cd `dirname $0`
|
cd `dirname $0`
|
||||||
cd ../..
|
cd ../..
|
||||||
|
|
||||||
EXE="./finetune"
|
EXE="./llama-finetune"
|
||||||
|
|
||||||
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
|
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
|
||||||
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
|
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
set(TARGET gbnf-validator)
|
set(TARGET llama-gbnf-validator)
|
||||||
add_executable(${TARGET} gbnf-validator.cpp)
|
add_executable(${TARGET} gbnf-validator.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common grammar-parser llama ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
|
|
@ -7,6 +7,8 @@
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
#include <sstream>
|
||||||
|
#include <fstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
@@ -69,13 +71,14 @@ int main(int argc, char** argv) {
         return 1;
     }
 
-    fseek(grammar_file, 0, SEEK_END);
-    size_t grammar_size = ftell(grammar_file);
-    fseek(grammar_file, 0, SEEK_SET);
-
-    std::string grammar_str(grammar_size, ' ');
-    fread(&grammar_str[0], 1, grammar_size, grammar_file);
-    fclose(grammar_file);
+    std::string grammar_str;
+    {
+        std::ifstream grammar_file(grammar_filename);
+        GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file");
+        std::stringstream buffer;
+        buffer << grammar_file.rdbuf();
+        grammar_str = buffer.str();
+    }
 
     // Parse the GBNF grammar
     auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());

@@ -100,20 +103,15 @@ int main(int argc, char** argv) {
         grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
 
     // Read the input file
-    FILE* input_file = fopen(input_filename.c_str(), "r");
-    if (!input_file) {
-        fprintf(stdout, "Failed to open input file: %s\n", input_filename.c_str());
-        return 1;
-    }
-
-    fseek(input_file, 0, SEEK_END);
-    size_t input_size = ftell(input_file);
-    fseek(input_file, 0, SEEK_SET);
-
-    std::string input_str(input_size, ' ');
-    fread(&input_str[0], 1, input_size, input_file);
-    fclose(input_file);
+    std::string input_str;
+    {
+        std::ifstream input_file(input_filename);
+        GGML_ASSERT(input_file.is_open() && "Failed to open input file");
+        std::stringstream buffer;
+        buffer << input_file.rdbuf();
+        input_str = buffer.str();
+    }
 
     // Validate the input string against the grammar
     size_t error_pos;
     std::string error_msg;
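Both hunks above switch from the C `fopen`/`fseek`/`fread` idiom to slurping the whole file through a `std::ifstream`. For reference, that pattern can be wrapped in a small helper; the sketch below is ours, not part of the patch (the `read_file_to_string` name and the exception-based error handling are assumptions, the patch itself uses `GGML_ASSERT`):

```cpp
#include <cstdio>
#include <fstream>
#include <sstream>
#include <stdexcept>
#include <string>

// Read an entire file into a std::string using the same
// std::ifstream + std::stringstream pattern as the hunks above.
static std::string read_file_to_string(const std::string & path) {
    std::ifstream file(path);
    if (!file.is_open()) {
        throw std::runtime_error("Failed to open file: " + path);
    }
    std::stringstream buffer;
    buffer << file.rdbuf(); // streams the whole file into the buffer
    return buffer.str();
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        std::fprintf(stderr, "usage: %s <file>\n", argv[0]);
        return 1;
    }
    const std::string text = read_file_to_string(argv[1]);
    std::printf("read %zu bytes\n", text.size());
    return 0;
}
```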
@@ -1,4 +1,4 @@
-set(TARGET gguf-split)
+set(TARGET llama-gguf-split)
 add_executable(${TARGET} gguf-split.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -18,8 +18,8 @@ fi
 
 set -x
 
-SPLIT=$1/gguf-split
-MAIN=$1/main
+SPLIT=$1/llama-gguf-split
+MAIN=$1/llama-cli
 WORK_PATH=$TMP_DIR/gguf-split
 ROOT_DIR=$(realpath $(dirname $0)/../../)
@@ -1,4 +1,4 @@
-set(TARGET gguf)
+set(TARGET llama-gguf)
 add_executable(${TARGET} gguf.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
@@ -1,4 +1,4 @@
-set(TARGET gritlm)
+set(TARGET llama-gritlm)
 add_executable(${TARGET} gritlm.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -26,7 +26,7 @@ $ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --ou
 
 Run the example using the downloaded model:
 ```console
-$ ./gritlm -m models/gritlm-7b_q4_1.gguf
+$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf
 
 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
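The two numbers in this sample output are plain cosine similarities between the document and query embeddings. As a reminder of what that metric computes, here is a minimal standalone sketch (not code from gritlm.cpp; the vectors are made-up toy values):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Cosine similarity of two equally sized embedding vectors:
// dot(a, b) / (|a| * |b|), which lies in [-1, 1] for non-zero inputs.
static float cosine_similarity(const std::vector<float> & a, const std::vector<float> & b) {
    float dot = 0.0f, na = 0.0f, nb = 0.0f;
    for (size_t i = 0; i < a.size() && i < b.size(); ++i) {
        dot += a[i] * b[i];
        na  += a[i] * a[i];
        nb  += b[i] * b[i];
    }
    if (na == 0.0f || nb == 0.0f) {
        return 0.0f; // degenerate (all-zero) embedding
    }
    return dot / (std::sqrt(na) * std::sqrt(nb));
}

int main() {
    const std::vector<float> doc   = {0.12f, -0.40f, 0.33f};
    const std::vector<float> query = {0.10f, -0.38f, 0.35f};
    std::printf("cosine similarity: %.3f\n", cosine_similarity(doc, query));
    return 0;
}
```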
@@ -1,4 +1,4 @@
-set(TARGET imatrix)
+set(TARGET llama-imatrix)
 add_executable(${TARGET} imatrix.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -6,7 +6,7 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
 ## Usage
 
 ```
-./imatrix \
+./llama-imatrix \
 -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
 [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
 [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]

@@ -28,8 +28,8 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
 LLAMA_CUDA=1 make -j
 
 # generate importance matrix (imatrix.dat)
-./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
+./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
 
 # use the imatrix to perform a Q4_K_M quantization
-./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
+./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
 ```
@@ -1,4 +1,4 @@
-set(TARGET infill)
+set(TARGET llama-infill)
 add_executable(${TARGET} infill.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -42,5 +42,5 @@ scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.ggu
 ```
 
 ```bash
-./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
+./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
 ```
@@ -223,7 +223,11 @@ int main(int argc, char ** argv) {
         inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
         embd_inp = inp_pfx;
         embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-        embd_inp.push_back(llama_token_middle(model));
+
+        const llama_token middle_token = llama_token_middle(model);
+        if (middle_token >= 0) {
+            embd_inp.push_back(middle_token);
+        }
 
         LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
         LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));

@@ -528,7 +532,12 @@ int main(int argc, char ** argv) {
             inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
             embd_inp = inp_pfx;
             embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-            embd_inp.push_back(llama_token_middle(model));
+
+            const llama_token middle_token = llama_token_middle(model);
+            if (middle_token >= 0) {
+                embd_inp.push_back(middle_token);
+            }
+
             embd.clear();
             n_remain = params.n_predict;
             n_past = 0;
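Both hunks replace an unconditional `push_back` with a guard: when the vocabulary defines no fill-in-the-middle (FIM) middle token, `llama_token_middle` yields a negative id and nothing should be appended to the prompt. A minimal standalone illustration of that guard, using a hypothetical `get_middle_token` stand-in rather than the real llama.cpp API:

```cpp
#include <cstdio>
#include <vector>

using token_id = int;

// Stand-in for llama_token_middle(): returns -1 when the vocabulary
// has no FIM middle token. This helper is an assumption for the sketch,
// not part of llama.cpp.
static token_id get_middle_token(bool has_fim_middle) {
    return has_fim_middle ? 32009 : -1;
}

int main() {
    std::vector<token_id> embd_inp = {1, 2, 3}; // dummy prefix + suffix tokens

    // Same guard as the patch: only append the middle token if it exists.
    const token_id middle_token = get_middle_token(/*has_fim_middle=*/false);
    if (middle_token >= 0) {
        embd_inp.push_back(middle_token);
    }

    std::printf("prompt length: %zu tokens\n", embd_inp.size());
    return 0;
}
```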
@@ -21,7 +21,7 @@ counter=1
 echo 'Running'
 while IFS= read -r question
 do
-exe_cmd="./main -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
+exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
 echo $counter
 echo "Current Question: $question"
 eval "$exe_cmd"
@@ -1,5 +1,5 @@
 # Usage:
-#! ./server -m some-model.gguf &
+#! ./llama-server -m some-model.gguf &
 #! pip install pydantic
 #! python json-schema-pydantic-example.py
@@ -523,7 +523,7 @@ class SchemaConverter:
 def main(args_in = None):
     parser = argparse.ArgumentParser(
         description='''
-            Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
+            Generates a grammar (suitable for use in ./llama-cli) that produces JSON conforming to a
             given JSON schema. Only a subset of JSON schema features are supported; more may be
             added in the future.
         ''',
@@ -1,4 +1,4 @@
-# llama.cpp/example/llama-bench
+# llama.cpp/examples/llama-bench
 
 Performance testing tool for llama.cpp.
@@ -293,6 +293,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     params.output_format = cmd_params_defaults.output_format;
     params.output_format_stderr = cmd_params_defaults.output_format_stderr;
     params.reps = cmd_params_defaults.reps;
+    params.numa = cmd_params_defaults.numa;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];

@@ -713,7 +714,6 @@ struct test {
     static const bool kompute;
     static const bool metal;
     static const bool sycl;
-    static const bool rpc;
     static const bool gpu_blas;
     static const bool blas;
     static const std::string cpu_info;

@@ -725,6 +725,7 @@ struct test {
     int n_batch;
     int n_ubatch;
     int n_threads;
+    bool has_rpc;
     ggml_type type_k;
     ggml_type type_v;
     int n_gpu_layers;

@@ -750,6 +751,7 @@ struct test {
         n_batch = inst.n_batch;
         n_ubatch = inst.n_ubatch;
         n_threads = inst.n_threads;
+        has_rpc = !inst.rpc_servers.empty();
         type_k = inst.type_k;
         type_v = inst.type_v;
         n_gpu_layers = inst.n_gpu_layers;

@@ -809,9 +811,6 @@ struct test {
         if (sycl) {
             return GGML_SYCL_NAME;
         }
-        if (rpc) {
-            return "RPC";
-        }
         if (gpu_blas) {
             return "GPU BLAS";
         }

@@ -881,7 +880,7 @@ struct test {
         std::vector<std::string> values = {
             build_commit, std::to_string(build_number),
             std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_ubatch),

@@ -915,7 +914,6 @@ const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas();
const bool test::sycl = !!ggml_cpu_has_sycl();
-const bool test::rpc = !!ggml_cpu_has_rpc();
const std::string test::cpu_info = get_cpu_info();
const std::string test::gpu_info = get_gpu_info();

@@ -1181,6 +1179,9 @@ struct markdown_printer : public printer {
             value = buf;
         } else if (field == "backend") {
             value = test::get_backend();
+            if (t.has_rpc) {
+                value += "+RPC";
+            }
         } else if (field == "test") {
             if (t.n_prompt > 0 && t.n_gen == 0) {
                 snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
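Taken together, these hunks stop treating RPC as a compile-time capability column and instead derive a per-test flag from whether any RPC servers were requested, surfacing it as a `+RPC` suffix on the backend name. A tiny standalone sketch of that labelling logic (the `backend_label` helper is ours, not the llama-bench internals):

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Build the backend label the way the patched markdown printer does:
// start from the backend name and append "+RPC" when RPC servers are in use.
static std::string backend_label(const std::string & backend,
                                 const std::vector<std::string> & rpc_servers) {
    std::string value = backend;
    if (!rpc_servers.empty()) { // mirrors has_rpc = !inst.rpc_servers.empty()
        value += "+RPC";
    }
    return value;
}

int main() {
    std::printf("%s\n", backend_label("CUDA", {}).c_str());                    // prints "CUDA"
    std::printf("%s\n", backend_label("CUDA", {"192.168.1.2:50052"}).c_str()); // prints "CUDA+RPC"
    return 0;
}
```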
@@ -30,8 +30,9 @@ if(TARGET BUILD_INFO)
     add_dependencies(llava BUILD_INFO)
 endif()
 
-set(TARGET llava-cli)
-add_executable(llava-cli llava-cli.cpp)
-install(TARGETS llava-cli RUNTIME)
-target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(llava PRIVATE cxx_std_11)
+set(TARGET llama-llava-cli)
+add_executable(${TARGET} llava-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -9,12 +9,12 @@ The implementation is based on llava, and is compatible with llava and mobileVLM
 Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.
 
 ## Usage
-Build with cmake or run `make llava-cli` to build it.
+Build with cmake or run `make llama-llava-cli` to build it.
 
-After building, run: `./llava-cli` to see the usage. For example:
+After building, run: `./llama-llava-cli` to see the usage. For example:
 
 ```sh
-./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
+./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
 --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
 --image path/to/an/image.jpg \
 -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"

@@ -62,7 +62,7 @@ python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
 
 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
 ```sh
-./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
+./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
 ```
 
 Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.

@@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path`
 ### case 1
 **input**
 ```sh
-/data/local/tmp/llava-cli \
+/data/local/tmp/llama-llava-cli \
 -m /data/local/tmp/ggml-model-q4_k.gguf \
 --mmproj /data/local/tmp/mmproj-model-f16.gguf \
 -t 4 \

@@ -102,7 +102,7 @@ llama_print_timings: total time = 34731.93 ms
 ### case 2
 **input**
 ```sh
-/data/local/tmp/llava-cli \
+/data/local/tmp/llama-llava-cli \
 -m /data/local/tmp/ggml-model-q4_k.gguf \
 --mmproj /data/local/tmp/mmproj-model-f16.gguf \
 -t 4 \

@@ -126,7 +126,7 @@ llama_print_timings: total time = 34570.79 ms
 #### llava-cli release-b2005
 **input**
 ```sh
-/data/local/tmp/llava-cli \
+/data/local/tmp/llama-llava-cli \
 -m /data/local/tmp/ggml-model-q4_k.gguf \
 --mmproj /data/local/tmp/mmproj-model-f16.gguf \
 -t 4 \

@@ -200,7 +200,7 @@ make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
 ### case 1
 **input**
 ```sh
-./llava-cli \
+./llama-llava-cli \
 -m /data/local/tmp/ggml-model-q4_k.gguf \
 --mmproj /data/local/tmp/mmproj-model-f16.gguf \
 --image /data/local/tmp/demo.jpeg \

@@ -224,7 +224,7 @@ llama_print_timings: total time = 1352.63 ms / 252 tokens
 ### case 2
 **input**
 ```sh
-./llava-cli \
+./llama-llava-cli \
 -m /data/local/tmp/ggml-model-q4_k.gguf \
 --mmproj /data/local/tmp/mmproj-model-f16.gguf \
 -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \
@@ -11,12 +11,12 @@ For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](h
 After API is confirmed, more models will be supported / uploaded.
 
 ## Usage
-Build with cmake or run `make llava-cli` to build it.
+Build with cmake or run `make llama-llava-cli` to build it.
 
-After building, run: `./llava-cli` to see the usage. For example:
+After building, run: `./llama-llava-cli` to see the usage. For example:
 
 ```sh
-./llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
 ```
 
 **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.

@@ -95,9 +95,9 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto
 python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```
 
-7) And finally we can run the llava-cli using the 1.6 model version:
+7) And finally we can run the llava cli using the 1.6 model version:
 ```console
-./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
+./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
 ```
 
 **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
Some files were not shown because too many files have changed in this diff.