Merge branch 'master' into pr-train-mem-usage-improvements

2023-08-27 23:11:47 +02:00 · 2023-08-27 23:11:47 +02:00 · a6f3a47c39
commit a6f3a47c39
parent 3a91c975a6 103cfafc77
60 changed files with 5049 additions and 2118 deletions
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@ -0,0 +1,44 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG ROCM_VERSION=5.6
 # Target the ROCm build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
 FROM ${BASE_ROCM_DEV_CONTAINER} as build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102
 COPY requirements.txt requirements.txt
 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
 WORKDIR /app
 COPY . .
 # Set the AMD GPU target architectures
 ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 RUN make
 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/main-rocm.Dockerfile
+++ b/.devops/main-rocm.Dockerfile
@ -0,0 +1,44 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG ROCM_VERSION=5.6
 # Target the ROCm build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
 FROM ${BASE_ROCM_DEV_CONTAINER} as build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102
 COPY requirements.txt requirements.txt
 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
 WORKDIR /app
 COPY . .
 # Set the AMD GPU target architectures
 ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 RUN make
 ENTRYPOINT [ "/app/main" ]
--- a/.dockerignore
+++ b/.dockerignore
@ -5,14 +5,7 @@
 .vscode/
 .DS_Store
-build/
+build*/
 build-em/
 build-debug/
 build-release/
 build-static/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
 models/*
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -291,24 +291,32 @@ jobs:
          cd build
          ctest -C Release --verbose --timeout 900
-      - name: Get commit hash
+      - name: Determine tag name
-        id: commit
+        id: tag
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        shell: bash
-        uses: pr-mpt/actions-commit-hash@v2
+        run: |
          BUILD_NUMBER="$(git rev-list --count HEAD)"
          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
          else
            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
          fi
      - name: Pack artifacts
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
-          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v3
        with:
          path: |
-            llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
+            llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
  windows-latest-cmake-cublas:
    runs-on: windows-latest
@ -338,23 +346,31 @@ jobs:
          cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
          cmake --build . --config Release
-      - name: Get commit hash
+      - name: Determine tag name
-        id: commit
+        id: tag
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        shell: bash
-        uses: pr-mpt/actions-commit-hash@v2
+        run: |
          BUILD_NUMBER="$(git rev-list --count HEAD)"
          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
          else
            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
          fi
      - name: Pack artifacts
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
-          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v3
        with:
          path: |
-            llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
+            llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
      - name: Copy and pack Cuda runtime
        if: ${{ matrix.cuda == '12.1.0' }}
@ -400,21 +416,34 @@ jobs:
      - windows-latest-cmake-cublas
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v1
      - name: Determine tag name
        id: tag
        shell: bash
        run: |
          BUILD_NUMBER="$(git rev-list --count HEAD)"
          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
          else
            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
          fi
      - name: Download artifacts
        id: download-artifact
        uses: actions/download-artifact@v3
      - name: Get commit hash
        id: commit
        uses: pr-mpt/actions-commit-hash@v2
      - name: Create release
        id: create_release
        uses: anzz1/action-create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
-          tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
+          tag_name: ${{ steps.tag.outputs.name }}
      - name: Upload release
        id: upload_release
--- a/.gitignore
+++ b/.gitignore
@ -16,20 +16,7 @@
 .vs/
 .vscode/
-build/
+build*/
 build-em/
 build-debug/
 build-release/
 build-ci-debug/
 build-ci-release/
 build-static/
 build-cublas/
 build-opencl/
 build-metal/
 build-mpi/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
 out/
 tmp/
@ -60,6 +47,7 @@ compile_commands.json
 CMakeSettings.json
 __pycache__
 dist
 zig-out/
 zig-cache/
@ -70,7 +58,6 @@ perf-*.txt
 examples/jeopardy/results.txt
 pyproject.toml
 poetry.lock
 poetry.toml
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -74,6 +74,7 @@ set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kern
 set(LLAMA_CUDA_MMV_Y        "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16                        "llama: use 16 bit floats for some calculations"   OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
 option(LLAMA_METAL                           "llama: use Metal"                                 OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
@ -352,6 +353,43 @@ if (LLAMA_CLBLAST)
    endif()
 endif()
 # hipBLAS / ROCm backend.
 # When LLAMA_HIPBLAS is ON and both the hip and hipblas packages are found,
 # ggml-cuda.cu is built as the object library ggml-rocm and appended to
 # LLAMA_EXTRA_LIBS; otherwise only a warning is emitted.
 if (LLAMA_HIPBLAS)
    # Add the /opt/rocm prefix to the paths searched by the find_package() calls below.
    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
    # Warn (without aborting) when the C or C++ compiler is not Clang.
    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
        message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
    endif()
    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
        message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
    endif()
    find_package(hip)
    find_package(hipblas)
    # NOTE(review): rocblas is looked up and linked below, but its *_FOUND result
    # is not part of the condition that enables the backend - confirm this is intended.
    find_package(rocblas)
    if (${hipblas_FOUND} AND ${hip_FOUND})
        message(STATUS "HIP and hipBLAS found")
        # Defined for every target: GGML_USE_CUBLAS is set together with GGML_USE_HIPBLAS.
        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
        add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
        # Forward the LLAMA_CUDA_* tuning options to the ggml-rocm target only.
        if (LLAMA_CUDA_FORCE_DMMV)
            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
        endif()
        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
        target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
        # NOTE(review): CC_TURING is overridden with a very large value, presumably so
        # that no device compares as >= Turing - confirm against ggml-cuda.cu.
        target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)
        # Treat the .cu source as C++ so it is compiled by the configured CXX compiler.
        set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
        target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
        # Static builds are rejected outright for this backend.
        if (LLAMA_STATIC)
            message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
        endif()
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm)
    else()
        message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
    endif()
 endif()
 if (LLAMA_ALL_WARNINGS)
    if (NOT MSVC)
        set(c_flags
--- a/Makefile
+++ b/Makefile
@ -280,6 +280,30 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # LLAMA_CLBLAST
 ifdef LLAMA_HIPBLAS
 	ROCM_PATH	?= /opt/rocm
 	HIPCC	    ?= $(ROCM_PATH)/bin/hipcc
 	GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
 	LLAMA_CUDA_DMMV_X       ?= 32
 	LLAMA_CUDA_MMV_Y        ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
 	CFLAGS      += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
 	CXXFLAGS    += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
 	LDFLAGS     += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
 	LDFLAGS		+= -lhipblas -lamdhip64 -lrocblas
 	HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
 	HIPFLAGS    += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 	HIPFLAGS    += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
 	HIPFLAGS    += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 	HIPFLAGS    += -DCC_TURING=1000000000
 ifdef LLAMA_CUDA_FORCE_DMMV
 	HIPFLAGS 	+= -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
 	OBJS        += ggml-cuda.o
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 endif # LLAMA_HIPBLAS
 ifdef LLAMA_METAL
 	CFLAGS   += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
 	CXXFLAGS += -DGGML_USE_METAL
--- a/README.md
+++ b/README.md
@ -11,6 +11,10 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 ### Hot topics
 - #### IMPORTANT: Tokenizer fixes and API change (developers and projects using `llama.cpp` built-in tokenization must read): https://github.com/ggerganov/llama.cpp/pull/2810
 - GGUFv2 adds support for 64-bit sizes + backwards compatible: https://github.com/ggerganov/llama.cpp/pull/2821
 - Added support for Falcon models: https://github.com/ggerganov/llama.cpp/pull/2717
 - A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
@ -422,6 +426,35 @@ Building the program with BLAS support may lead to some performance improvements
  | LLAMA_CUDA_F16          | Boolean                |   false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 |       2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
 - #### hipBLAS
  This provides BLAS acceleration on HIP-supported GPUs, such as AMD GPUs.
  Make sure to have ROCm installed.
  You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
  Windows support is coming soon...
  - Using `make`:
    ```bash
    make LLAMA_HIPBLAS=1
    ```
  - Using `CMake`:
    ```bash
    mkdir build
    cd build
    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
    cmake --build .
    ```
  The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
  If your GPU is not officially supported you can use the environment variable `HSA_OVERRIDE_GFX_VERSION` set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
  The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
  | Option                  | Legal values           | Default | Description |
  |-------------------------|------------------------|---------|-------------|
  | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 |      32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
  | LLAMA_CUDA_MMV_Y        | Positive integer       |       1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 |       2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
 - #### CLBlast
  OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
--- a/ci/run.sh
+++ b/ci/run.sh
@ -196,17 +196,17 @@ function gg_run_open_llama_3b_v2 {
    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
    function check_ppl {
        qnt="$1"
@ -233,6 +233,48 @@ function gg_run_open_llama_3b_v2 {
    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    # lora
    function compare_ppl {
        qnt="$1"
        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
        if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
            printf '  - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
            return 20
        fi
        printf '  - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
        return 0
    }
    path_lora="../models-mnt/open-llama/3B-v2/lora"
    path_shakespeare="../models-mnt/shakespeare"
    shakespeare="${path_shakespeare}/shakespeare.txt"
    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
    python3 ../convert-lora-to-ggml.py ${path_lora}
    # f16
    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare}                            -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
    # q8_0
    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare}                            -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
    compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
    # q8_0 + f16 lora-base
    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
    compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
    set +e
 }
@ -242,6 +284,7 @@ function gg_sum_open_llama_3b_v2 {
    gg_printf 'OpenLLaMA 3B-v2:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@ -253,6 +296,11 @@ function gg_sum_open_llama_3b_v2 {
    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
    gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
    gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
    gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }
 # open_llama_7b_v2
@ -310,17 +358,17 @@ function gg_run_open_llama_7b_v2 {
    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
-    (time ./bin/main --model ${model_f16}  -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/main --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/main --model ${model_q8_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/main --model ${model_q4_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/main --model ${model_q4_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/main --model ${model_q5_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/main --model ${model_q5_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/main --model ${model_q2_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/main --model ${model_q3_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/main --model ${model_q4_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/main --model ${model_q5_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/main --model ${model_q6_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@ -359,6 +407,48 @@ function gg_run_open_llama_7b_v2 {
    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    # lora
    function compare_ppl {
        qnt="$1"
        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
        if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
            printf '  - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
            return 20
        fi
        printf '  - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
        return 0
    }
    path_lora="../models-mnt/open-llama/7B-v2/lora"
    path_shakespeare="../models-mnt/shakespeare"
    shakespeare="${path_shakespeare}/shakespeare.txt"
    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
    python3 ../convert-lora-to-ggml.py ${path_lora}
    # f16
    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare}                            -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
    # currently not supported by the CUDA backend
    # q8_0
    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare}                            -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
    #compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
    # q8_0 + f16 lora-base
    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
    #compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
    set +e
 }
@ -368,6 +458,7 @@ function gg_sum_open_llama_7b_v2 {
    gg_printf 'OpenLLaMA 7B-v2:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@ -379,6 +470,11 @@ function gg_sum_open_llama_7b_v2 {
    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
    #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
    #gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
    #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }
 ## main
@ -391,6 +487,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
    ln -sfn ${mnt_models} ${SRC}/models-mnt
    python3 -m pip install -r ${SRC}/requirements.txt
    python3 -m pip install --editable gguf-py
 fi
 ret=0
--- a/common/common.cpp
+++ b/common/common.cpp
@ -613,9 +613,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
 #ifdef GGML_USE_CUBLAS
    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
-    fprintf(stdout, "                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    fprintf(stdout, "                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
 #endif // GGML_USE_CUBLAS
 #endif
    fprintf(stdout, "  --mtest               compute maximum memory usage\n");
    fprintf(stdout, "  --export              export the computation graph to 'llama.ggml'\n");
@ -731,12 +733,12 @@ std::vector<llama_token> llama_tokenize(
    return result;
 }
-std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
    if (n_tokens < 0) {
        result.resize(-n_tokens);
-        int check = llama_token_to_str(ctx, token, result.data(), result.size());
+        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
@ -744,3 +746,36 @@ std::string llama_token_to_str(const struct llama_context * ctx, llama_token tok
    return std::string(result.data(), result.size());
 }
 // Concatenates the pieces of all tokens into one string.
 // A single leading space is dropped from the piece at index 1 when the sequence
 // starts with BOS, or from the piece at index 0 when it does not.
 std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
    const llama_token bos_id = llama_token_bos(ctx);
    std::string text;
    if (tokens.empty()) {
        return text;
    }
    // index of the piece whose leading space is removed
    const size_t strip_idx = tokens.front() == bos_id ? 1 : 0;
    for (size_t idx = 0; idx < tokens.size(); ++idx) {
        std::string piece = llama_token_to_piece(ctx, tokens[idx]);
        if (idx == strip_idx && piece[0] == ' ') {
            piece.erase(0, 1);
        }
        text += piece;
    }
    return text;
 }
 // Concatenates the pieces of all tokens, in order, without any further processing.
 std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
    std::string text;
    for (const llama_token token : tokens) {
        text += llama_token_to_piece(ctx, token);
    }
    return text;
 }
--- a/common/common.h
+++ b/common/common.h
@ -28,6 +28,7 @@ struct gpt_params {
    int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
    int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
    int32_t n_beams                         = 0;    // if non-zero then use beam search of given width.
    float   rope_freq_base                  = 10000.0f; // RoPE base frequency
    float   rope_freq_scale                 = 1.0f;     // RoPE frequency scaling factor
@ -115,11 +116,31 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 // Vocab utils
 //
 // tokenizes a string into a vector of tokens
 // should work similar to Python's `tokenizer.encode`
 std::vector<llama_token> llama_tokenize(
        struct llama_context * ctx,
           const std::string & text,
                        bool   add_bos);
-std::string llama_token_to_str(
+// converts a token into its piece (the text fragment it stands for)
 // should work similar to Python's `tokenizer.id_to_piece`
 std::string llama_token_to_piece(
        const struct llama_context * ctx,
                       llama_token   token);
 // TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
 //       that takes into account the tokenizer type and decides how to handle the leading space
 //
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // removes the leading space from the first non-BOS token
 std::string llama_detokenize_spm(
                         llama_context * ctx,
        const std::vector<llama_token> & tokens);
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 std::string llama_detokenize_bpe(
                         llama_context * ctx,
        const std::vector<llama_token> & tokens);
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@ -107,6 +107,7 @@ if "n_head_kv" in hparams:
 else:
    gguf_writer.add_head_count_kv(1)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
 gguf_writer.add_file_type(ftype)
 # TOKENIZATION
--- a/convert.py
+++ b/convert.py
@ -3,6 +3,7 @@
 import gguf
 import argparse
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
 import copy
 import enum
 import faulthandler
@ -17,13 +18,14 @@ import re
 import signal
 import struct
 import sys
 import time
 import zipfile
 import numpy as np
 from abc import ABCMeta, abstractmethod
 from dataclasses import dataclass
 from pathlib import Path
-from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union)
+from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable, List, Literal, Optional, Sequence, Set, Tuple, TypeVar, Union)
 from sentencepiece import SentencePieceProcessor  # type: ignore
 if TYPE_CHECKING:
@ -37,30 +39,70 @@ NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
 ARCH=gguf.MODEL_ARCH.LLAMA
 NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]
 DEFAULT_CONCURRENCY = 8
 #
 # data types
 #
@dataclass(frozen=True)
-class UnquantizedDataType:
+class DataType:
    name: str
    dtype: 'np.dtype[Any]'
    valid_conversions: List[str]
-DT_F16  = UnquantizedDataType('F16')
+    def elements_to_bytes(self, n_elements: int) -> int:
-DT_F32  = UnquantizedDataType('F32')
+        return n_elements * self.dtype.itemsize
 DT_I32  = UnquantizedDataType('I32')
 DT_BF16 = UnquantizedDataType('BF16')
-DataType = Union[UnquantizedDataType]
+@dataclass(frozen=True)
 class UnquantizedDataType(DataType):
    pass
-DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = {
+DT_F16  = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
-    DT_BF16: np.dtype(np.uint16),
+DT_F32  = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
-    DT_F16:  np.dtype(np.float16),
+DT_I32  = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
-    DT_F32:  np.dtype(np.float32),
+DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
    DT_I32:  np.dtype(np.int32),
 }
-NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = \
+@dataclass(frozen=True)
-    {dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()}
+class QuantizedDataType(DataType):
    block_size: int
    quantized_dtype: 'np.dtype[Any]'
    ggml_type: gguf.GGMLQuantizationType
    def quantize(self, arr: NDArray) -> NDArray:
        raise NotImplementedError(f'Quantization for {self.name} not implemented')
    def elements_to_bytes(self, n_elements: int) -> int:
        assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
        return self.quantized_dtype.itemsize * (n_elements // self.block_size)
@dataclass(frozen=True)
 class Q8_0QuantizedDataType(QuantizedDataType):
    # Mini Q8_0 quantization in Python!
    def quantize(self, arr: NDArray) -> NDArray:
        assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}'
        assert arr.dtype == np.float32, f'Bad array type {arr.dtype}'
        n_blocks = arr.size // self.block_size
        blocks = arr.reshape((n_blocks, self.block_size))
        # Much faster implementation of block quantization contributed by @Cebtenzzre
        def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[Tuple[Any, Any]]:
            d = abs(blocks).max(axis = 1) / np.float32(127)
            with np.errstate(divide = 'ignore'):
                qs = (blocks / d[:, None]).round()
            qs[d == 0] = 0
            yield from zip(d, qs)
        return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
 DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
    dtype = np.dtype(np.float32), valid_conversions = [],
    ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
    quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
 # Quantized types skipped here because they may also map to np.float32
 NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = {}
 for dt in (DT_BF16, DT_F16, DT_F32, DT_I32):
    if dt.dtype in NUMPY_TYPE_TO_DATA_TYPE:
        raise ValueError(f'Invalid duplicate data type {dt}')
    NUMPY_TYPE_TO_DATA_TYPE[dt.dtype] = dt
 SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
    'BF16': DT_BF16,
@ -73,20 +115,22 @@ SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
 # TODO: rename to LLAMAFileType
 # TODO: move to `gguf.py`
 class GGMLFileType(enum.IntEnum):
-    AllF32    = 0
+    AllF32     = 0
-    MostlyF16 = 1  # except 1d tensors
+    MostlyF16  = 1  # except 1d tensors
    MostlyQ8_0 = 7  # except 1d tensors
    def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType:
-        if len(tensor.shape) == 1:
+        dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
-            # 1D tensors are always F32.
+        if dt is None:
            return DT_F32
        elif self == GGMLFileType.AllF32:
            return DT_F32
        elif self == GGMLFileType.MostlyF16:
            return DT_F16
        else:
            raise ValueError(self)
        # 1D tensors are always F32.
        return dt if len(tensor.shape) > 1 else DT_F32
 GGML_FILE_TYPE_TO_DATA_TYPE: Dict[GGMLFileType, DataType] = {
    GGMLFileType.AllF32    : DT_F32,
    GGMLFileType.MostlyF16 : DT_F16,
    GGMLFileType.MostlyQ8_0: DT_Q8_0,
 }
 #
 # hparams loading
@ -104,8 +148,14 @@ class Params:
    n_head_kv:  int
    f_norm_eps: float
    f_rope_freq_base: Optional[float] = None
    f_rope_scale: Optional[float] = None
    ftype: Optional[GGMLFileType] = None
    # path to the directory containing the model files
    path_model: Optional['Path'] = None
    @staticmethod
    def find_n_mult(n_ff: int, n_embd: int) -> int:
        # hardcoded magic range
@ -155,13 +205,20 @@ class Params:
    def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
        config = json.load(open(config_path))
-        n_vocab    = config["vocab_size"]
+        n_vocab          = config["vocab_size"]
-        n_embd     = config["hidden_size"]
+        n_embd           = config["hidden_size"]
-        n_layer    = config["num_hidden_layers"]
+        n_layer          = config["num_hidden_layers"]
-        n_ff       = config["intermediate_size"]
+        n_ff             = config["intermediate_size"]
-        n_head     = config["num_attention_heads"]
+        n_head           = config["num_attention_heads"]
-        n_head_kv  = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
+        n_head_kv        = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
-        f_norm_eps = config["rms_norm_eps"]
+        f_norm_eps       = config["rms_norm_eps"]
        f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
        rope_scaling = config.get("rope_scaling")
        if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear":
            f_rope_scale = config["rope_scaling"].get("factor")
        else:
            f_rope_scale = None
        n_mult = Params.find_n_mult(n_ff, n_embd)
@ -174,15 +231,17 @@ class Params:
                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
        return Params(
-            n_vocab    = n_vocab,
+            n_vocab          = n_vocab,
-            n_embd     = n_embd,
+            n_embd           = n_embd,
-            n_mult     = n_mult,
+            n_mult           = n_mult,
-            n_layer    = n_layer,
+            n_layer          = n_layer,
-            n_ctx      = n_ctx,
+            n_ctx            = n_ctx,
-            n_ff       = n_ff,
+            n_ff             = n_ff,
-            n_head     = n_head,
+            n_head           = n_head,
-            n_head_kv  = n_head_kv,
+            n_head_kv        = n_head_kv,
-            f_norm_eps = f_norm_eps,
+            f_norm_eps       = f_norm_eps,
            f_rope_freq_base = f_rope_freq_base,
            f_rope_scale     = f_rope_scale,
        )
    # LLaMA v2 70B params.json
@ -191,15 +250,26 @@ class Params:
    def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
        config = json.load(open(config_path))
-        n_vocab    = config["vocab_size"]
+        n_vocab          = config["vocab_size"] if "vocab_size" in config else -1
-        n_embd     = config["dim"]
+        n_embd           = config["dim"]
-        n_layer    = config["n_layers"]
+        n_layer          = config["n_layers"]
-        n_mult     = config["multiple_of"]
+        n_mult           = config["multiple_of"]
-        n_ctx      = 2048 if config["norm_eps"] == 1e-06 else 4096 # hack to determine LLaMA v1 vs v2
+        n_ff             = -1
-        n_ff       = -1
+        n_head           = config["n_heads"]
-        n_head     = config["n_heads"]
+        n_head_kv        = config["n_kv_heads"] if "n_kv_heads" in config else n_head
-        n_head_kv  = config["n_kv_heads"] if "n_kv_heads" in config else n_head
+        f_norm_eps       = config["norm_eps"]
-        f_norm_eps = config["norm_eps"]
+        f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
        # hack to determine LLaMA v1 vs v2 vs CodeLlama
        if f_rope_freq_base and f_rope_freq_base == 1000000:
            # CodeLlama
            n_ctx = 16384
        elif config["norm_eps"] == 1e-05:
            # LLaMA v2
            n_ctx = 4096
        else:
            # LLaMA v1
            n_ctx = 2048
        if n_vocab == -1:
            n_vocab = model["tok_embeddings.weight"].shape[0]
@ -208,15 +278,16 @@ class Params:
            n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
        return Params(
-            n_vocab    = n_vocab,
+            n_vocab          = n_vocab,
-            n_embd     = n_embd,
+            n_embd           = n_embd,
-            n_mult     = n_mult,
+            n_mult           = n_mult,
-            n_layer    = n_layer,
+            n_layer          = n_layer,
-            n_ctx      = n_ctx,
+            n_ctx            = n_ctx,
-            n_ff       = n_ff,
+            n_ff             = n_ff,
-            n_head     = n_head,
+            n_head           = n_head,
-            n_head_kv  = n_head_kv,
+            n_head_kv        = n_head_kv,
-            f_norm_eps = f_norm_eps,
+            f_norm_eps       = f_norm_eps,
            f_rope_freq_base = f_rope_freq_base,
        )
    @staticmethod
@ -231,6 +302,8 @@ class Params:
        else:
            params = Params.guessed(model_plus.model)
        params.path_model = model_plus.paths[0].parent
        return params
@ -386,7 +459,7 @@ class UnquantizedTensor(Tensor):
        self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
    def astype(self, data_type: DataType) -> Tensor:
-        dtype = DATA_TYPE_TO_NUMPY[data_type]
+        dtype = data_type.dtype
        if self.data_type == DT_BF16:
            self.ndarray = bf16_to_fp32(self.ndarray)
        return UnquantizedTensor(self.ndarray.astype(dtype))
@ -425,22 +498,6 @@ def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, conv
 GGMLCompatibleTensor = Union[UnquantizedTensor]
 class DeferredPermutedTensor(Tensor):
    '''Tensor wrapper that postpones a head permutation until the data is converted.

    `astype` and `to_ggml` first convert the wrapped tensor and then call
    `permute(n_head, n_head_kv)` on the result. Permuting the wrapper itself
    is rejected.
    '''
    def __init__(self, base: Tensor, n_head: int, n_head_kv: int) -> None:
        self.base = base
        self.n_head = n_head
        # astype() and to_ggml() read this attribute; leaving it unset made them
        # fail with AttributeError.
        self.n_head_kv = n_head_kv
        self.data_type = self.base.data_type
    def astype(self, data_type: DataType) -> Tensor:
        # convert the wrapped tensor first, then apply the deferred permutation
        return self.base.astype(data_type).permute(self.n_head, self.n_head_kv)
    def to_ggml(self) -> GGMLCompatibleTensor:
        return self.base.to_ggml().permute(self.n_head, self.n_head_kv)
    def permute(self, n_head: int, n_head_kv: int) -> Tensor:
        raise Exception("shouldn't permute twice")
@dataclass
 class LazyTensor:
    _load: Callable[[], Tensor]
@ -450,7 +507,9 @@ class LazyTensor:
    def load(self) -> Tensor:
        ret = self._load()
-        assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description)
+        # Should be okay if it maps to the same numpy type?
        assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
                (self.data_type, ret.data_type, self.description)
        return ret
    def astype(self, data_type: DataType) -> 'LazyTensor':
@ -461,8 +520,8 @@ class LazyTensor:
        return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
    def validate_conversion_to(self, data_type: DataType) -> None:
-        if data_type == self.data_type:
+        if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions:
-            return
+            raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.')
 LazyModel = Dict[str, LazyTensor]
@ -588,9 +647,7 @@ class LazyUnpickler(pickle.Unpickler):
        info = self.zip_file.getinfo(filename)
        def load(offset: int, elm_count: int) -> NDArray:
-            dtype = DATA_TYPE_TO_NUMPY.get(data_type)
+            dtype = data_type.dtype
            if dtype is None:
                raise Exception("tensor stored in unsupported format")
            fp = self.zip_file.open(info)
            fp.seek(offset * dtype.itemsize)
            size = elm_count * dtype.itemsize
@ -654,7 +711,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
    def convert(info: Dict[str, Any]) -> LazyTensor:
        data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
-        numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
+        numpy_dtype = data_type.dtype
        shape: List[int] = info['shape']
        begin, end = info['data_offsets']
        assert 0 <= begin <= end <= len(byte_buf)
@ -694,23 +751,35 @@ def lazy_load_file(path: Path) -> ModelPlus:
 In = TypeVar('In')
 Out = TypeVar('Out')
def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: Optional[int] = None, factory: Callable = ThreadPoolExecutor) -> Iterable[Out]:
    '''Parallel map, but with backpressure.  If the caller doesn't call `next`
    fast enough, this will stop calling `func` at some point rather than
    letting results pile up in memory.  Specifically, there is a max of one
    output value buffered per thread.

    func:        callable applied to every item.
    iterable:    items to process; consumed lazily, results are yielded in input order.
    concurrency: maximum number of submitted-but-unconsumed calls; below 2 the
                 items are mapped serially and no executor is created.
    max_workers: forwarded to `factory` as its `max_workers` argument.
    factory:     callable returning the executor, which is used as a context manager.
    '''
    if concurrency < 2:
        yield from map(func, iterable)
        # A generator keeps executing after `yield from` completes, so stop here
        # explicitly; otherwise a re-iterable input (list, dict view, ...) would be
        # processed and yielded a second time by the executor path below.
        return
    iterable = iter(iterable)
    with factory(max_workers = max_workers) as executor:
        futures: List[concurrent.futures.Future[Out]] = []
        done = False
        # prime the pipeline with up to `concurrency` submissions
        for _ in range(concurrency):
            try:
                futures.append(executor.submit(func, next(iterable)))
            except StopIteration:
                done = True
                break
        while futures:
            # wait for the oldest submission so results keep the input order
            result = futures.pop(0).result()
            # refill before yielding so workers stay busy while the caller consumes
            while not done and len(futures) < concurrency:
                try:
                    futures.append(executor.submit(func, next(iterable)))
                except StopIteration:
                    done = True
                    break
            yield result
 def check_vocab_size(params: Params, vocab: Vocab) -> None:
    if params.n_vocab != vocab.vocab_size:
        assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
@ -733,11 +802,13 @@ class OutputFile:
        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
    def add_meta_arch(self, params: Params) -> None:
-        ver = None
+        name = "LLaMA"
        if (params.n_ctx == 4096):
-            ver = "v2"
+            name = "LLaMA v2"
            if params.path_model:
                name = str(params.path_model.parent).split('/')[-1]
-        self.gguf.add_name                ("LLaMA" if ver == None else "LLaMA " + ver)
+        self.gguf.add_name                (name)
        self.gguf.add_context_length      (params.n_ctx)
        self.gguf.add_embedding_length    (params.n_embd)
        self.gguf.add_block_count         (params.n_layer)
@ -747,6 +818,12 @@ class OutputFile:
        self.gguf.add_head_count_kv       (params.n_head_kv)
        self.gguf.add_layer_norm_rms_eps  (params.f_norm_eps)
        if params.f_rope_freq_base:
            self.gguf.add_rope_freq_base(params.f_rope_freq_base)
        if params.f_rope_scale:
            self.gguf.add_rope_scale_linear(params.f_rope_scale)
        if params.ftype:
            self.gguf.add_file_type(params.ftype)
@ -767,12 +844,11 @@ class OutputFile:
        self.gguf.add_token_types(toktypes)
    def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
-        n_elements = 1
+        n_elements = int(np.prod(tensor.shape))
-        for dim in tensor.shape:
+        raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
-            n_elements *= dim
+        data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
-        data_type = DATA_TYPE_TO_NUMPY[tensor.data_type]
+        data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
-        data_nbytes = n_elements * data_type.itemsize
+        self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype = raw_dtype)
        self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes)
    def write_meta(self) -> None:
        self.gguf.write_header_to_file()
@ -798,7 +874,20 @@ class OutputFile:
        of.close()
    @staticmethod
-    def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
+    def do_item(item: Tuple[str, LazyTensor]) -> Tuple[DataType, NDArray]:
        name, lazy_tensor = item
        tensor = lazy_tensor.load().to_ggml()
        return (lazy_tensor.data_type, tensor.ndarray)
    @staticmethod
    def maybe_do_quantize(item: Tuple[DataType, NDArray]) -> NDArray:
        '''Return the array of a (data type, array) pair, quantized when the data type is a QuantizedDataType.'''
        data_type, ndarray = item
        if isinstance(data_type, QuantizedDataType):
            return data_type.quantize(ndarray)
        # every other data type is written as-is
        return ndarray
    @staticmethod
    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
        check_vocab_size(params, vocab)
        of = OutputFile(fname_out)
@ -814,16 +903,19 @@ class OutputFile:
        of.write_meta()
        of.write_tensor_info()
        def do_item(item: Tuple[str, LazyTensor]) -> NDArray:
            name, lazy_tensor = item
            return lazy_tensor.load().to_ggml().ndarray
        # tensor data
-        ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8)
+        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
        if ftype == GGMLFileType.MostlyQ8_0:
            ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, factory = ProcessPoolExecutor)
        else:
            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
        start = time.time()
        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
            elapsed = time.time() - start
            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
            padi = len(str(len(model)))
-            print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
+            print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}")
            of.gguf.write_tensor_data(ndarray)
        of.close()
@ -835,6 +927,8 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi
        return GGMLFileType.AllF32
    if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
        return GGMLFileType.MostlyF16
    if output_type_str == "q8_0":
        return GGMLFileType.MostlyQ8_0
    name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
@ -881,7 +975,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
            print(f"skipping tensor {name_new}")
            continue
        else:
-            print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type} | {lazy_tensor.shape}")
+            print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
            out[name_new] = lazy_tensor
    return out
@ -986,6 +1080,7 @@ def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
    namestr = {
        GGMLFileType.AllF32:    "f32",
        GGMLFileType.MostlyF16: "f16",
        GGMLFileType.MostlyQ8_0:"q8_0",
    }[file_type]
    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
    if ret in model_paths:
@ -1009,12 +1104,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
    parser.add_argument("--dump",        action="store_true",    help="don't convert, just show what's in the model")
    parser.add_argument("--dump-single", action="store_true",    help="don't convert, just show what's in a single model file")
    parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab")
-    parser.add_argument("--outtype",     choices=["f32", "f16"], help="output format (default: based on input)")
+    parser.add_argument("--outtype",     choices=["f32", "f16", "q8_0"], help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
    parser.add_argument("--vocab-dir",   type=Path,              help="directory containing tokenizer.model, if separate from model file")
    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
    parser.add_argument("--vocabtype",   choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
    parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)")
    parser.add_argument("--concurrency", type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
    args = parser.parse_args(args_in)
    if args.dump_single:
@ -1036,6 +1132,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
        params.ftype = {
            "f32": GGMLFileType.AllF32,
            "f16": GGMLFileType.MostlyF16,
            "q8_0": GGMLFileType.MostlyQ8_0,
        }[args.outtype]
    print(f"params = {params}")
@ -1067,7 +1164,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
        params.ftype = ftype
        print(f"Writing {outfile}, format {ftype}")
-        OutputFile.write_all(outfile, params, model, vocab)
+        OutputFile.write_all(outfile, ftype, params, model, vocab, concurrency = args.concurrency)
        print(f"Wrote {outfile}")
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -25,6 +25,7 @@ else()
    add_subdirectory(simple)
    add_subdirectory(embd-input)
    add_subdirectory(llama-bench)
    add_subdirectory(beam_search)
    if (LLAMA_METAL)
        add_subdirectory(metal)
    endif()
--- a/examples/beam_search/CMakeLists.txt
+++ b/examples/beam_search/CMakeLists.txt
@ -0,0 +1,8 @@
 # Build rules for the beam_search example executable.
 set(TARGET beam_search)
 add_executable(${TARGET} beam_search.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 # Link against common, llama and whatever ${CMAKE_THREAD_LIBS_INIT} expands to.
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 # The example needs at least C++11.
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 # When a BUILD_INFO target exists, make this executable depend on it.
 if(TARGET BUILD_INFO)
  add_dependencies(${TARGET} BUILD_INFO)
 endif()
--- a/examples/beam_search/beam_search.cpp
+++ b/examples/beam_search/beam_search.cpp
@ -0,0 +1,188 @@
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
 #endif
 #include "common.h"
 #include "llama.h"
 #include "build-info.h"
 #include <cassert>
 #include <cinttypes>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
 #include <exception>
 #include <fstream>
 #include <iostream>
 #include <string>
 #include <tuple>
 #include <vector>
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #include <signal.h>
 #include <unistd.h>
 #elif defined (_WIN32)
 #define WIN32_LEAN_AND_MEAN
 #define NOMINMAX
 #include <windows.h>
 #include <signal.h>
 #endif
 // Used for debugging to print out beam tokens.
 // Bundles a beam view with the context needed to convert its tokens to text,
 // so the pair can be streamed with operator<<.
 struct ostream_beam_view {
    // Context handed to llama_token_to_piece() when the tokens are printed.
    llama_context * ctx;
    // Beam whose p, eob and tokens fields are printed.
    llama_beam_view beam_view;
 };
 std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
    os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
    for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
        os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
    }
    return os << ')';
 }
 // Put here anything you want back in beam_search_callback().
 // A pointer to this struct is passed to llama_beam_search() as the opaque callback data.
 struct beam_search_callback_data {
    // Context used to look up the end-of-sentence token and to print tokens.
    llama_context * ctx;
    // Tokens accumulated by the callback each time the beams share a common prefix.
    std::vector<llama_token> response;
 };
 // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
 // For example, eob can be flagged due to maximum token length, stop words, etc.
 bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) {
    return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
 }
 // Function matching type llama_beam_search_callback_fn_t.
 // Custom callback example is called each time the beams lengths increase:
 //  * Show progress by printing ',' followed by the number of convergent beam tokens if any.
 //  * When all beams converge to a common prefix, it is made available in beams_state.beam_views[0].
 //    This is also called when the stop condition is met.
 //    Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
 //
 // callback_data_ptr must point to a beam_search_callback_data instance.
 void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
    auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
    // Mark beams as EOS as needed.
    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
        llama_beam_view& beam_view = beams_state.beam_views[i];
        if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
            beam_view.eob = true;
        }
    }
    printf(",");  // Show progress
    if (const size_t n = beams_state.common_prefix_length) {
        // Check the precondition before touching the response: beam_views[0] is read below.
        assert(0u < beams_state.n_beams);
        const llama_token * tokens = beams_state.beam_views[0].tokens;
        callback_data.response.insert(callback_data.response.end(), tokens, tokens + n);
        printf("%zu", n);  // %zu is the portable conversion for size_t
    }
    fflush(stdout);
 #if 1 // DEBUG: print current beams for this iteration
    // Print last_call as text explicitly so the output does not depend on whether
    // std::boolalpha happens to be set on std::cout.
    std::cout << "\n\nCurrent beams (last_call=" << (beams_state.last_call ? "true" : "false") << "):\n";
    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
        std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
    }
 #endif
 }
 // Beam search example driver.
 // Usage: MODEL_PATH [BEAM_WIDTH=2] [PROMPT]
 // Loads the model, evaluates the prompt, runs llama_beam_search() with the requested
 // number of beams and prints the tokens collected by beam_search_callback().
 // Returns 0 on success and 1 on a usage error or any failure.
 int main(int argc, char ** argv)
 {
    gpt_params params;
    //params.n_gpu_layers = 200;
    //---------------------------------
    // Print help :
    //---------------------------------
    if ( argc < 2 || argv[1][0] == '-' )
    {
        printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
        return 1 ;
    }
    //---------------------------------
    // Load parameters :
    //---------------------------------
    params.model = argv[1];
    params.n_beams = 2;
    if ( argc > 2 )
    {
        // std::stoi throws on non-numeric or out-of-range text; treat that as a usage
        // error instead of terminating with an uncaught exception.
        try
        {
            params.n_beams = std::stoi(argv[2]);
        }
        catch ( const std::exception & )
        {
            params.n_beams = 0;
        }
        // The width is later converted to size_t, so zero or negative values are rejected here.
        if ( params.n_beams < 1 )
        {
            fprintf( stderr , "%s: error: BEAM_WIDTH must be a positive integer, got '%s'\n" , __func__ , argv[2] );
            return 1;
        }
    }
    if ( argc > 3 )
    {
        params.prompt = argv[3];
    }
    if ( params.prompt.empty() )
    {
        params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
    }
    //---------------------------------
    // Init LLM :
    //---------------------------------
    llama_backend_init(params.numa);
    llama_model * model;
    llama_context * ctx;
    std::tie(model, ctx) = llama_init_from_gpt_params( params );
    if ( model == NULL )
    {
        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
        llama_backend_free();
        return 1;
    }
    // Every exit path after a successful load releases the context, the model and the backend.
    const auto release_all = [&]()
    {
        llama_free( ctx );
        llama_free_model( model );
        llama_backend_free();
    };
    //---------------------------------
    // Tokenize the prompt :
    //---------------------------------
    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
    const size_t max_context_size     = llama_n_ctx( ctx );
    // Guard the subtraction: size_t arithmetic would wrap around for contexts smaller than 4.
    const size_t max_tokens_list_size = max_context_size > 4 ? max_context_size - 4 : 0;
    if (tokens_list.size() > max_tokens_list_size)
    {
        fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
             __func__ , tokens_list.size() , max_tokens_list_size );
        release_all();
        return 1;
    }
    fprintf( stderr, "\n\n" );
    // Print the tokens from the prompt :
    for( auto id : tokens_list )
    {
        std::cout << llama_token_to_piece(ctx, id);
    }
    std::cout << std::flush;
    int n_past = llama_get_kv_cache_token_count(ctx);
    const int n_prompt_tokens = static_cast<int>(tokens_list.size());
    if (llama_eval(ctx, tokens_list.data(), n_prompt_tokens, n_past, params.n_threads))
    {
        fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
        release_all();
        return 1;
    }
    n_past += n_prompt_tokens;
    beam_search_callback_data callback_data{ctx, {}};
    size_t const beam_width = static_cast<size_t>(params.n_beams);
    int const n_predict = 256;
    llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads);
    std::cout << "\n\n";
    for (llama_token const token_id : callback_data.response) {
        std::cout << llama_token_to_piece(ctx,token_id);
    }
    std::cout << std::endl;
    release_all();
    return 0;
 }
--- a/examples/convert-llama2c-to-ggml/README.md
+++ b/examples/convert-llama2c-to-ggml/README.md
@ -12,18 +12,14 @@ usage: ./convert-llama2c-to-ggml [options]
 options:
  -h, --help                       show this help message and exit
-  --copy-vocab-from-model FNAME    model path from which to copy vocab (default 'tokenizer.bin')
+  --copy-vocab-from-model FNAME    path of gguf llama model or llama2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf')
  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model
  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default ak_llama_model.bin')
 ```
 An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
-`$ ./convert-llama2c-to-ggml --copy-vocab-from-model ../llama2.c/tokenizer.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.ggmlv3.bin`
+`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
 For now the generated model is in the legacy GGJTv3 format, so you need to convert it to gguf manually:
 `$ python ./convert-llama-ggmlv3-to-gguf.py --eps 1e-5 --input stories42M.ggmlv3.bin --output stories42M.gguf.bin`
 Now you can use the model with a command like:
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@ -10,9 +10,48 @@
 #include <ctime>
 #include <random>
 #include <stdexcept>
 #include <sstream>
 #include <algorithm>
 #include <string>
 // GGUF keys & tensor names.
 #define KV_GENERAL_ARCHITECTURE          "general.architecture"
 #define KV_GENERAL_NAME                  "general.name"
 #define KV_TOKENIZER_MODEL               "tokenizer.ggml.model"
 #define KV_TOKENIZER_LIST                "tokenizer.ggml.tokens"
 #define KV_TOKENIZER_TOKEN_TYPE          "tokenizer.ggml.token_type"
 #define KV_TOKENIZER_SCORES              "tokenizer.ggml.scores"
 #define KV_TOKENIZER_BOS_ID              "tokenizer.ggml.bos_token_id"
 #define KV_TOKENIZER_EOS_ID              "tokenizer.ggml.eos_token_id"
 #define KV_TOKENIZER_UNK_ID              "tokenizer.ggml.unknown_token_id"
 #define KV_TOKENIZER_SEP_ID              "tokenizer.ggml.seperator_token_id"
 #define KV_TOKENIZER_PAD_ID              "tokenizer.ggml.padding_token_id"
 #define KV_TOKENIZER_HF_JSON             "tokenizer.huggingface.json"
 #define KV_CONTEXT_LENGTH                "llama.context_length"
 #define KV_EMBEDDING_LENGTH              "llama.embedding_length"
 #define KV_BLOCK_COUNT                   "llama.block_count"
 #define KV_FEED_FORWARD_LENGTH           "llama.feed_forward_length"
 #define KV_ATTENTION_HEAD_COUNT          "llama.attention.head_count"
 #define KV_ATTENTION_HEAD_COUNT_KV       "llama.attention.head_count_kv"
 #define KV_ATTENTION_LAYERNORM_RMS_EPS   "llama.attention.layer_norm_rms_epsilon"
 #define KV_ROPE_DIMENSION_COUNT          "llama.rope.dimension_count"
 #define TN_TOKEN_EMBD  "token_embd.weight"
 #define TN_OUTPUT_NORM "output_norm.weight"
 #define TN_OUTPUT      "output.weight"
 #define TN_ATTN_NORM   "blk.%d.attn_norm.weight"
 #define TN_ATTN_Q      "blk.%d.attn_q.weight"
 #define TN_ATTN_K      "blk.%d.attn_k.weight"
 #define TN_ATTN_V      "blk.%d.attn_v.weight"
 #define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
 #define TN_FFN_NORM    "blk.%d.ffn_norm.weight"
 #define TN_FFN_GATE    "blk.%d.ffn_gate.weight"
 #define TN_FFN_DOWN    "blk.%d.ffn_down.weight"
 #define TN_FFN_UP      "blk.%d.ffn_up.weight"
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@ -20,6 +59,11 @@
 #define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
 #define LLAMA_FILE_VERSION_GGJT_V3   3
 #define TOKENIZER_NAME "llama"
 #define UNKNOWN_TOKEN_ID 0
 #define BOS_TOKEN_ID 1
 #define EOS_TOKEN_ID 2
 //////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
 typedef struct {
    int dim; // transformer dimension
@ -183,6 +227,7 @@ struct my_llama_hparams {
    uint32_t n_vocab = 32000;
    uint32_t n_ctx   = 512;   // this is provided as user input?
    uint32_t n_embd  = 4096;
    uint32_t n_ff    = 11008;
    uint32_t n_mult  = 4;
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
@ -214,6 +259,8 @@ struct my_llama_layer {
 struct my_llama_model {
    struct ggml_context * ctx = NULL;
    std::string name;
    my_llama_hparams hparams;
    struct ggml_tensor * tok_embeddings;
@ -276,18 +323,13 @@ struct train_params {
    int mem_compute1_gb;
 };
 // Derives the feed-forward width from the hyper-parameters:
 // 2/3 of 4*n_embd (integer arithmetic), rounded up to the next multiple of n_mult.
 uint32_t get_n_ff(const struct my_llama_hparams* hparams) {
    const uint32_t multiple   = hparams->n_mult;
    const uint32_t two_thirds = 2*(4*hparams->n_embd)/3;
    // Number of n_mult-sized chunks needed to cover two_thirds (ceiling division).
    const uint32_t n_chunks   = (two_thirds + multiple - 1)/multiple;
    return n_chunks*multiple;
 }
 void print_params(struct my_llama_hparams * params) {
    printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
    printf("%s: n_ctx:   %d\n", __func__, params->n_ctx);
    printf("%s: n_embd:  %d\n", __func__, params->n_embd);
    printf("%s: n_mult:  %d\n", __func__, params->n_mult);
    printf("%s: n_head:  %d\n", __func__, params->n_head);
-    printf("%s: n_ff:    %d\n", __func__, get_n_ff(params));
+    printf("%s: n_ff:    %d\n", __func__, params->n_ff);
    printf("%s: n_layer: %d\n", __func__, params->n_layer);
    printf("%s: n_rot:   %d\n", __func__, params->n_rot);
 }
@ -299,7 +341,7 @@ void init_model(struct my_llama_model * model) {
    const uint32_t n_layer = hparams.n_layer;
    const uint32_t n_vocab = hparams.n_vocab;
-    const uint32_t n_ff = get_n_ff(&hparams);
+    const uint32_t n_ff = hparams.n_ff;
    struct ggml_context * ctx = model->ctx;
    model->train_its = 0;
@ -481,21 +523,6 @@ struct llama_file {
        return std::string(chars.data(), len);
    }
    // Writes `size` bytes starting at `ptr` to the file as a single record.
    // A zero-byte request is a no-op. Throws std::runtime_error, with the text
    // for errno, when the write does not complete.
    void write_raw(const void * ptr, size_t size) {
        if (size != 0) {
            // Clear errno first so the message reflects this write only.
            errno = 0;
            const size_t n_written = std::fwrite(ptr, size, 1, fp);
            if (n_written != 1) {
                throw std::runtime_error(format("write error: %s", strerror(errno)));
            }
        }
    }
    // Writes the 4 bytes of `val` exactly as they are laid out in memory,
    // by forwarding them to write_raw().
    void write_u32(std::uint32_t val) {
        write_raw(&val, sizeof(val));
    }
    ~llama_file() {
        if (fp) {
            std::fclose(fp);
@ -503,30 +530,6 @@ struct llama_file {
    }
 };
 // Serializes one tensor record to `file`:
 //   u32 n_dims, u32 name length, u32 type, n_dims x u32 extents, name bytes,
 //   a skip forward to the next 32-byte file offset, then the raw tensor data.
 // A NULL tensor is written as an empty header (0 dims, 0-length name, F32 type)
 // followed by the same skip and no data.
 void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
    if (tensor != NULL) {
        const char * tensor_name = ggml_get_name(tensor);
        uint32_t n_name = strlen(tensor_name);
        uint32_t n_dims = tensor->n_dims;
        // Narrow all four extents to 32 bits; only the first n_dims are written.
        uint32_t shape[4];
        for (int d = 0; d < 4; ++d) {
            shape[d] = (uint32_t)tensor->ne[d];
        }
        file->write_u32(n_dims);
        file->write_u32(n_name);
        file->write_u32(tensor->type);
        file->write_raw(shape, sizeof(shape[0]) * n_dims);
        file->write_raw(tensor_name, n_name);
        // (0 - offset) & 31 is the distance to the next multiple of 32.
        file->seek((0-file->tell()) & 31, SEEK_CUR);
        file->write_raw(tensor->data, ggml_nbytes(tensor));
        return;
    }
    // Placeholder record for a missing tensor.
    file->write_u32(0);
    file->write_u32(0);
    file->write_u32(GGML_TYPE_F32);
    file->seek((0-file->tell()) & 31, SEEK_CUR);
 }
 bool is_ggml_file(const char *filename) {
    llama_file file(filename, "rb");
    if (file.size < 4) {
@ -536,48 +539,96 @@ bool is_ggml_file(const char *filename) {
    return magic == GGUF_MAGIC;
 }
 // Returns a copy of `text` in which every ASCII space is replaced by the three
 // bytes E2 96 81 (the UTF-8 encoding of U+2581); all other bytes are copied unchanged.
 static std::string llama_escape_whitespaces(const std::string& text) {
    static const char escaped_space[] = "\xe2\x96\x81";
    std::string escaped;
    for (size_t i = 0; i < text.size(); ++i) {
        if (text[i] == ' ') {
            escaped += escaped_space;
        } else {
            escaped += text[i];
        }
    }
    return escaped;
 }
 void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
-#pragma message("TODO: implement reading vocabulary using gguf")
+    if (is_ggml_file(filename)) {
-//    // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
+        struct ggml_context * ctx_data = NULL;
-//    if (is_ggml_file(filename)) {
+
-//
+        struct gguf_init_params params = {
-//        struct llama_context_params llama_params = llama_context_default_params();
+            /*.no_alloc = */ false,
-//        llama_params.vocab_only = true;
+            /*.ctx      = */ &ctx_data,
-//
+        };
-//        struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
+
-//        struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
+        struct gguf_context * ctx = gguf_init_from_file(filename, params);
-//
+        GGML_ASSERT(ctx != NULL);
-//        const int n_vocab = llama_n_vocab(lctx);
+
-//        vocab->id_to_token.resize(n_vocab);
+        const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL);
-//        for (int i=0; i<n_vocab; ++i) {
+        GGML_ASSERT(model_idx >= 0);
-//            vocab->id_to_token[i].text  = llama_token_get_text(lctx, i);
+        std::string tokenizer_name = gguf_get_val_str(ctx, model_idx);
-//            vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
+        GGML_ASSERT(tokenizer_name == TOKENIZER_NAME);
-//            vocab->id_to_token[i].type  = llama_token_get_type(lctx, i);
+
-//            vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
+        const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST);
-//        }
+        GGML_ASSERT(token_idx >= 0);
-//        llama_free(lctx);
+
-//        llama_free_model(lmodel);
+        const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES);
-//    } else
+        GGML_ASSERT(score_idx >= 0);
-    { // assume llama2.c vocabulary
+        const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-        printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
+
        const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE);
        GGML_ASSERT(toktype_idx >= 0);
        const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
        const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
        vocab->id_to_token.resize(n_vocab);
        for (uint32_t i = 0; i < n_vocab; i++) {
            std::string word = gguf_get_arr_str(ctx, token_idx, i);
            vocab->token_to_id[word] = i;
            auto & token_data = vocab->id_to_token[i];
            token_data.text  = std::move(word);
            token_data.score = scores[i];
            token_data.type  = (llama_token_type) toktypes[i];
        }
        ggml_free(ctx_data);
        gguf_free(ctx);
    } else {
        // assume llama2.c vocabulary
        printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
        llama_file file(filename, "rb");
        const int  n_vocab = config->vocab_size;
        /* uint32_t max_token_length =  */ file.read_u32(); // unused
        vocab->id_to_token.resize(n_vocab);
-        for (int i=0; i<n_vocab; ++i) {
+        for (llama_vocab::id id=0; id<n_vocab; ++id) {
            float_t score = file.read_f32();
            uint32_t len = file.read_u32();
            std::string text = file.read_string(len);
-            // Special-case handling of <0xXX> single byte tokens.
+
-            char byte_val;
+            unsigned char byte_val;
-            if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
+            llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
-                char cstr[2] = { byte_val, 0 };
+            if (id == UNKNOWN_TOKEN_ID) {
-                text = cstr;
+                text = "<unk>";
                type = LLAMA_TOKEN_TYPE_UNKNOWN;
            } else if (id == BOS_TOKEN_ID) {
                text = "<s>";
                type = LLAMA_TOKEN_TYPE_CONTROL;
            } else if (id == EOS_TOKEN_ID) {
                text = "</s>";
                type = LLAMA_TOKEN_TYPE_CONTROL;
            } else if (text.empty()) {
                type = LLAMA_TOKEN_TYPE_CONTROL;
            } else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
                // Text of byte tokens is already in the expected format.
                type = LLAMA_TOKEN_TYPE_BYTE;
            } else {
                type = LLAMA_TOKEN_TYPE_NORMAL;
            }
-            vocab->id_to_token[i].text = text;
+            text = llama_escape_whitespaces(text);
-            vocab->id_to_token[i].score = score;
+
-            vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
+            vocab->id_to_token[id].text = text;
-            vocab->token_to_id.emplace(text, i);
+            vocab->id_to_token[id].score = score;
            vocab->id_to_token[id].type = type;
            vocab->token_to_id.emplace(text, id);
        }
    }
 }
@ -619,33 +670,6 @@ void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * kar
 }
 void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
    struct llama_file file(filename, "wb");
    if (file.fp == NULL) {
        return;
    }
 #pragma message("TODO: implement file saving using gguf")
    // write_magic
    file.write_u32(LLAMA_FILE_MAGIC_GGJT);   // magic
    file.write_u32(LLAMA_FILE_VERSION_GGJT_V3); // version
    // write_hparams
    file.write_u32(model->hparams.n_vocab);
    file.write_u32(model->hparams.n_embd);
    file.write_u32(model->hparams.n_mult);
    file.write_u32(model->hparams.n_head);
    file.write_u32(model->hparams.n_layer);
    file.write_u32(model->hparams.n_rot);
    file.write_u32(LLAMA_FTYPE_ALL_F32);
    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
    uint32_t n_vocab = model->hparams.n_vocab;
    for (uint32_t i = 0; i < n_vocab; i++) {
        const auto & token_data = vocab->id_to_token.at(i);
        file.write_u32((uint32_t) token_data.text.size());
        file.write_raw(token_data.text.data(), token_data.text.size());
        file.write_raw(&token_data.score, sizeof(token_data.score));
    }
    // stuff AK weights into GG weights one by one.
    // w->token_embedding_table -> model->tok_embeddings
    // float*                   -> struct ggml_tensor
@ -658,8 +682,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
    // for rms-att-weight
    int row_length = model->hparams.n_embd;
    const auto & hparams = model->hparams;
-    //int n_ff = model->hparams.n_embd;
+    int n_ff = model->hparams.n_ff;
    int n_ff = get_n_ff(&hparams);
    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
        auto & layer = model->layers[i];
@ -677,28 +700,91 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
    }
    struct gguf_context * ctx = gguf_init_empty();
    std::vector<const char*> tokens;
    std::vector<float> scores;
    std::vector<llama_token_type> token_types;
    for (const llama_vocab::token_data & token_data : vocab->id_to_token) {
        tokens.push_back(token_data.text.c_str());
        scores.push_back(token_data.score);
        token_types.push_back(token_data.type);
    }
    gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size());
    gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size());
    gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size());
    gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME);
    gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama");
    gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama");
    // special tokens
    gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
    gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
    gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
    gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
    gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
    gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
    // n_head_kv is optional, default to n_head
    // gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
    gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
    gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
    gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
    // write tensors
-    write_tensor(&file, model->tok_embeddings);
+    ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD);
-    write_tensor(&file, model->norm);
+    gguf_add_tensor(ctx, model->tok_embeddings);
-    write_tensor(&file, model->output); // ?
+
    ggml_set_name(model->norm, TN_OUTPUT_NORM);
    gguf_add_tensor(ctx, model->norm);
    ggml_set_name(model->output, TN_OUTPUT);
    gguf_add_tensor(ctx, model->output);
    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
        auto & layer = model->layers[i];
-        write_tensor(&file, layer.attention_norm);
+        ggml_format_name(layer.wq, TN_ATTN_Q, i);
-        write_tensor(&file, layer.wq);
+        gguf_add_tensor(ctx, layer.wq);
-        write_tensor(&file, layer.wk);
+
-        write_tensor(&file, layer.wv);
+        ggml_format_name(layer.wk, TN_ATTN_K, i);
-        write_tensor(&file, layer.wo);
+        gguf_add_tensor(ctx, layer.wk);
-        write_tensor(&file, layer.ffn_norm);
+
-        write_tensor(&file, layer.w1);
+        ggml_format_name(layer.wv, TN_ATTN_V, i);
-        write_tensor(&file, layer.w2);
+        gguf_add_tensor(ctx, layer.wv);
-        write_tensor(&file, layer.w3);
+
        ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i);
        gguf_add_tensor(ctx, layer.wo);
        ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i);
        gguf_add_tensor(ctx, layer.attention_norm);
        ggml_format_name(layer.w1, TN_FFN_GATE, i);
        gguf_add_tensor(ctx, layer.w1);
        ggml_format_name(layer.w2, TN_FFN_DOWN, i);
        gguf_add_tensor(ctx, layer.w2);
        ggml_format_name(layer.w3, TN_FFN_UP, i);
        gguf_add_tensor(ctx, layer.w3);
        ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i);
        gguf_add_tensor(ctx, layer.ffn_norm);
    }
    gguf_write_to_file(ctx, filename, false);
    gguf_free(ctx);
 }
 struct train_params get_default_train_params() {
    struct train_params params;
-    params.fn_vocab_model    = "tokenizer.bin";
+    params.fn_vocab_model    = "models/7B/ggml-model-f16.gguf";
    params.fn_llama2c_output_model = "ak_llama_model.bin";
    params.fn_train_data     = "shakespeare.txt";
    params.fn_checkpoint_in  = "checkpoint.bin";
@ -751,7 +837,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help                       show this help message and exit\n");
-    fprintf(stderr, "  --copy-vocab-from-model FNAME    llama2.c vocabulary or ggmlv3 model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
+    fprintf(stderr, "  --copy-vocab-from-model FNAME    path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model);
    fprintf(stderr, "  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
    fprintf(stderr, "  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
    fprintf(stderr, "\n");
@ -812,6 +898,14 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
    return true;
 }
 // Returns the final component of `path`: everything after the last directory
 // separator. Both '/' and '\\' count as separators so that Windows-style paths
 // are handled too. A path without any separator is returned unchanged, and a
 // path ending in a separator yields an empty string.
 std::string basename(const std::string &path) {
    const size_t pos = path.find_last_of("/\\");
    if (pos == std::string::npos) {
        return path;
    }
    return path.substr(pos + 1);
 }
 int main(int argc, char ** argv) {
    struct train_params params = get_default_train_params();
    if (!params_parse(argc, argv, &params)) {
@ -840,6 +934,7 @@ int main(int argc, char ** argv) {
    model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
    model.hparams.n_ctx   = params.n_ctx;
    model.hparams.n_embd  = config.dim; //params.n_embd;
    model.hparams.n_ff    = config.hidden_dim;
    model.hparams.n_mult  = 32;//params.n_mult;
    model.hparams.n_head  = config.n_heads; //params.n_head;
    model.hparams.n_layer = config.n_layers; //params.n_layer;
@ -853,6 +948,7 @@ int main(int argc, char ** argv) {
    model.ctx = ggml_init(lcparams);
    init_model(&model);
    model.name = basename(params.fn_llama2c_model);
    save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
    printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@ -214,7 +214,7 @@ const char * sampling(struct MyModel * mymodel) {
    if (id == llama_token_eos(ctx)) {
        ret = "</s>";
    } else {
-        ret = llama_token_to_str(ctx, id);
+        ret = llama_token_to_piece(ctx, id);
    }
    eval_id(mymodel, id);
    return ret.c_str();
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -56,9 +56,6 @@ int main(int argc, char ** argv) {
    int n_past = 0;
    // Add a space in front of the first character to match OG llama tokenizer behavior
    params.prompt.insert(0, 1, ' ');
    // tokenize the prompt
    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
@ -67,7 +64,7 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
        }
        fprintf(stderr, "\n");
    }
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@ -30,6 +30,9 @@ bool gguf_ex_write(const std::string & fname) {
    gguf_set_val_u32 (ctx, "some.parameter.uint32",   0x12345678);
    gguf_set_val_i32 (ctx, "some.parameter.int32",   -0x12345679);
    gguf_set_val_f32 (ctx, "some.parameter.float32",  0.123456789f);
    gguf_set_val_u64 (ctx, "some.parameter.uint64",   0x123456789abcdef0ull);
    gguf_set_val_i64 (ctx, "some.parameter.int64",   -0x123456789abcdef1ll);
    gguf_set_val_f64 (ctx, "some.parameter.float64",  0.1234567890123456789);
    gguf_set_val_bool(ctx, "some.parameter.bool",     true);
    gguf_set_val_str (ctx, "some.parameter.string",   "hello world");
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@ -18,9 +18,7 @@
 #include "llama.h"
 #include "common.h"
 #include "build-info.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #endif
 // utils
 static uint64_t get_time_ns() {
@ -443,6 +441,8 @@ struct test {
    static const std::string gpu_info;
    std::string model_filename;
    std::string model_type;
    uint64_t model_size;
    uint64_t model_n_params;
    int n_batch;
    int n_threads;
    bool f32_kv;
@ -459,8 +459,10 @@ struct test {
    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
        model_filename = inst.model;
        char buf[128];
-        llama_model_type(lmodel, buf, sizeof(buf));
+        llama_model_desc(lmodel, buf, sizeof(buf));
        model_type = buf;
        model_size = llama_model_size(lmodel);
        model_n_params = llama_model_n_params(lmodel);
        n_batch = inst.n_batch;
        n_threads = inst.n_threads;
        f32_kv = inst.f32_kv;
@ -504,7 +506,7 @@ struct test {
    static std::string get_backend() {
        if (cuda) {
-            return "CUDA";
+            return GGML_CUDA_NAME;
        }
        if (opencl) {
            return "OpenCL";
@ -526,7 +528,7 @@ struct test {
            "build_commit", "build_number",
            "cuda", "opencl", "metal", "gpu_blas", "blas",
            "cpu_info", "gpu_info",
-            "model_filename", "model_type",
+            "model_filename", "model_type", "model_size", "model_n_params",
            "n_batch", "n_threads", "f16_kv",
            "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
            "n_prompt", "n_gen", "test_time",
@ -540,6 +542,7 @@ struct test {
    static field_type get_field_type(const std::string & field) {
        if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
            field == "model_size" || field == "model_n_params" ||
            field == "n_gpu_layers" || field == "main_gpu" ||
            field == "n_prompt" || field == "n_gen" ||
            field == "avg_ns" || field == "stddev_ns") {
@ -575,7 +578,7 @@ struct test {
            build_commit, std::to_string(build_number),
            std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
            cpu_info, gpu_info,
-            model_filename, model_type,
+            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
            std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
@ -711,8 +714,15 @@ struct markdown_printer : public printer {
            return -30;
        }
        if (field == "t/s") {
-            return 15;
+            return 16;
        }
        if (field == "size" || field == "params") {
            return 10;
        }
        if (field == "n_gpu_layers") {
            return 3;
        }
        int width = std::max((int)field.length(), 10);
        if (test::get_field_type(field) == test::STRING) {
@ -721,9 +731,28 @@ struct markdown_printer : public printer {
        return width;
    }
    // Translate an internal field name into the shorter column title printed in the
    // markdown table header. Fields that have no abbreviation are returned unchanged.
    static std::string get_field_display_name(const std::string & field) {
        struct alias {
            const char * name;  // internal field name
            const char * label; // abbreviated title shown to the user
        };
        static const alias aliases[] = {
            { "n_gpu_layers", "ngl"     },
            { "n_threads",    "threads" },
            { "mul_mat_q",    "mmq"     },
            { "tensor_split", "ts"      },
        };
        for (const alias & a : aliases) {
            if (field == a.name) {
                return a.label;
            }
        }
        return field;
    }
    void print_header(const cmd_params & params) override {
        // select fields to print
-        fields = { "model", "backend" };
+        fields.push_back("model");
        fields.push_back("size");
        fields.push_back("params");
        fields.push_back("backend");
        bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
        if (!is_cpu_backend) {
            fields.push_back("n_gpu_layers");
@ -754,7 +783,7 @@ struct markdown_printer : public printer {
        fprintf(fout, "|");
        for (const auto & field : fields) {
-            fprintf(fout, " %*s |", get_field_width(field), field.c_str());
+            fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str());
        }
        fprintf(fout, "\n");
        fprintf(fout, "|");
@ -771,12 +800,26 @@ struct markdown_printer : public printer {
        fprintf(fout, "|");
        for (const auto & field : fields) {
            std::string value;
            char buf[128];
            if (field == "model") {
                value = t.model_type;
            } else if (field == "size") {
                if (t.model_size < 1024*1024*1024) {
                    snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
                } else {
                    snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
                }
                value = buf;
            } else if (field == "params") {
                if (t.model_n_params < 1000*1000*1000) {
                    snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
                } else {
                    snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
                }
                value = buf;
            } else if (field == "backend") {
                value = test::get_backend();
            } else if (field == "test") {
                char buf[128];
                if (t.n_prompt > 0 && t.n_gen == 0) {
                    snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
                } else if (t.n_gen > 0 && t.n_prompt == 0) {
@ -787,7 +830,6 @@ struct markdown_printer : public printer {
                }
                value = buf;
            } else if (field == "t/s") {
                char buf[128];
                snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
                value = buf;
            } else if (vmap.find(field) != vmap.end()) {
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -189,12 +189,14 @@ int main(int argc, char ** argv) {
        }
    }
-    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    // Add BOS if SPM tokenizer
    const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
    // tokenize the prompt
    std::vector<llama_token> embd_inp;
    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
-        embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
+        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
    } else {
        embd_inp = session_tokens;
    }
@ -209,10 +211,9 @@ int main(int argc, char ** argv) {
    int guidance_offset = 0;
    int original_prompt_len = 0;
    if (ctx_guidance) {
-        params.cfg_negative_prompt.insert(0, 1, ' ');
+        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, is_spm);
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
        original_prompt_len = original_inp.size();
        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
    }
@ -259,7 +260,7 @@ int main(int argc, char ** argv) {
    }
    // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", is_spm);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n",    false);
    // in instruct mode, we inject a prefix and a suffix to each input by the user
@ -278,7 +279,7 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
        }
        if (ctx_guidance) {
@ -286,14 +287,14 @@ int main(int argc, char ** argv) {
            fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
            fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
+                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
            }
        }
        if (params.n_keep > 0) {
        fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
-                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
+                fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
            }
            fprintf(stderr, "'\n");
        }
@ -449,7 +450,7 @@ int main(int argc, char ** argv) {
                //printf("\n---\n");
                //printf("resetting: '");
                //for (int i = 0; i < (int) embd.size(); i++) {
-                //    printf("%s", llama_token_to_str(ctx, embd[i]));
+                //    printf("%s", llama_token_to_piece(ctx, embd[i]));
                //}
                //printf("'\n");
                //printf("\n---\n");
@ -502,7 +503,7 @@ int main(int argc, char ** argv) {
                    input_size = embd_guidance.size();
                    //fprintf(stderr, "\n---------------------\n");
                    //for (int i = 0; i < (int) embd_guidance.size(); i++) {
-                        //fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i]));
+                        //fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_guidance[i]));
                    //}
                    //fprintf(stderr, "\n---------------------\n");
                } else {
@ -597,7 +598,12 @@ int main(int argc, char ** argv) {
                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
                    last_n_repeat, alpha_frequency, alpha_presence);
                if (!penalize_nl) {
-                    logits[llama_token_nl(ctx)] = nl_logit;
+                    for (size_t idx = 0; idx < candidates_p.size; idx++) {
                        if (candidates_p.data[idx].id == llama_token_nl(ctx)) {
                            candidates_p.data[idx].logit = nl_logit;
                            break;
                        }
                    }
                }
                if (grammar != NULL) {
@ -661,7 +667,7 @@ int main(int argc, char ** argv) {
        // display text
        if (input_echo) {
            for (auto id : embd) {
-                printf("%s", llama_token_to_str(ctx, id).c_str());
+                printf("%s", llama_token_to_piece(ctx, id).c_str());
            }
            fflush(stdout);
        }
@ -677,7 +683,7 @@ int main(int argc, char ** argv) {
            if (params.antiprompt.size()) {
                std::string last_output;
                for (auto id : last_n_tokens) {
-                    last_output += llama_token_to_str(ctx, id);
+                    last_output += llama_token_to_piece(ctx, id);
                }
                is_antiprompt = false;
@ -798,7 +804,8 @@ int main(int argc, char ** argv) {
        }
        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
-        if (params.interactive && n_remain <= 0 && params.n_predict != -1) {
+        // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
        if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
            n_remain = params.n_predict;
            is_interacting = true;
        }
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -6,6 +6,8 @@
 #include <ctime>
 #include <sstream>
 #include <cstring>
 #include <thread>
 #include <mutex>
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@ -27,6 +29,40 @@ std::vector<float> softmax(const std::vector<float>& logits) {
    return probs;
 }
 // Log of the softmax probability of entry `tok` over the `n_vocab` values in `logits`.
 // The largest logit is subtracted before exponentiating so that every exponent is <= 0;
 // the exponentials are summed in double precision.
 float log_softmax(int n_vocab, const float * logits, int tok) {
    float peak = logits[0];
    for (int i = 1; i < n_vocab; ++i) {
        if (peak < logits[i]) {
            peak = logits[i];
        }
    }
    double denom = 0.0;
    for (int i = 0; i < n_vocab; ++i) {
        denom += expf(logits[i] - peak);
    }
    const double log_denom = log(denom);
    return logits[tok] - peak - log_denom;
 }
 // Accumulate negative log-likelihood statistics for `n_token` consecutive predictions.
 // Row i of `logits` (n_vocab values per row) is scored against tokens[i+1]; the sum of
 // -log_softmax is added to `nll` and the sum of its square to `nll2`.
 // The rows are shared between the calling thread and one thread per slot of `workers`:
 // each thread claims the next row index from a mutex-protected counter, keeps private
 // partial sums, and folds them into nll/nll2 (still holding the mutex) once no rows remain.
 // Every worker is joined before the function returns.
 void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread>& workers,
        double& nll, double& nll2) {
    std::mutex mutex;
    int counter = 0;
    auto compute = [&mutex, &counter, &nll, &nll2, n_vocab, logits, tokens, n_token] () {
        double partial_nll  = 0;
        double partial_nll2 = 0;
        for (;;) {
            int row;
            {
                std::lock_guard<std::mutex> guard(mutex);
                row = counter++;
                if (row >= n_token) {
                    // out of work: publish this thread's partial sums while the lock is held
                    nll  += partial_nll;
                    nll2 += partial_nll2;
                    return;
                }
            }
            // the expensive part runs outside the lock
            const double neg_logp = -log_softmax(n_vocab, logits + row*n_vocab, tokens[row+1]);
            partial_nll  += neg_logp;
            partial_nll2 += neg_logp*neg_logp;
        }
    };
    for (auto & worker : workers) {
        worker = std::thread(compute);
    }
    compute(); // the calling thread takes part as well
    for (auto & worker : workers) {
        worker.join();
    }
 }
 void perplexity_v2(llama_context * ctx, const gpt_params & params) {
    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
@ -154,10 +190,14 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
    const bool add_bos = is_spm;
    auto tim1 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
    auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
    auto tim2 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
    const int n_chunk_max = tokens.size() / params.n_ctx;
    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
@ -166,9 +206,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
    int count = 0;
    double nll = 0.0;
    double nll2 = 0.0;
    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * params.n_ctx;
        const int end   = start + params.n_ctx;
@ -228,26 +271,32 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
        // Example, we have a context window of 512, we will compute perplexity for each of the
        // last 256 tokens.  Then, we split the input up into context window size chunks to
        // process the entire prompt.
-        for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
+        const int first = std::min(512, params.n_ctx/2);
-            // Calculate probability of next token, given the previous ones.
+        process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, workers, nll, nll2);
-            const std::vector<float> tok_logits(
+        count += params.n_ctx - first - 1;
                logits.begin() + (j + 0) * n_vocab,
                logits.begin() + (j + 1) * n_vocab);
            const float prob = softmax(tok_logits)[tokens[start + j + 1]];
            nll += -std::log(prob);
            ++count;
        }
        // perplexity is e^(average negative log-likelihood)
        if (params.ppl_output_type == 0) {
            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
        } else {
-            printf("%8d  %.4lf\n", i*params.n_ctx, std::exp(nll / count));
+            double av = nll/count;
            double av2 = nll2/count - av*av;
            if (av2 > 0) av2 = sqrt(av2/(count-1));
            printf("%8d  %.4lf  %4lf  %4lf\n", i*params.n_ctx, std::exp(nll / count), av, av2);
        }
        fflush(stdout);
    }
    printf("\n");
    nll2 /= count;
    nll /= count;
    nll2 -= nll * nll;
    if (nll2 > 0) {
        nll2 = sqrt(nll2/(count-1));
        double ppl = exp(nll);
        printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
    } else {
        printf("Unexpected negative standard deviation of log(prob)\n");
    }
 }
 std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
@ -306,6 +355,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
    fprintf(stderr, "================================= is_spm = %d\n", is_spm);
    // This is needed as usual for LLaMA models
    const bool add_bos = is_spm;
@ -346,7 +396,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        hs_data[i].context = prompt_lines[idx*6];
        hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
        for (size_t j=0; j < 4; j++) {
-            hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j];
+            hs_data[i].ending[j] = prompt_lines[idx*6+2+j];
        }
        // Delete the selected random example from the prompt
@ -361,6 +411,8 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    double acc = 0.0f;
    const int n_vocab = llama_n_vocab(ctx);
    std::vector<std::vector<int>> ending_tokens(4);
    std::vector<float> tok_logits(n_vocab);
    for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
@ -368,11 +420,21 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
        size_t context_size = context_embd.size();
        for (int i = 0; i < 4; ++i) {
            ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + " " + hs_data[task_idx].ending[i], add_bos);
            for (int k = 0; k < int(context_size); ++k) {
                if (ending_tokens[i][k] != context_embd[k]) {
                    fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k);
                    break;
                }
            }
        }
        // Do the 1st ending
        // In this case we include the context when evaluating
-        auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
+        //auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
        auto query_embd = ending_tokens[0];
        auto query_size = query_embd.size();
        //printf("First query: %d\n",(int)query_size);
        // Stop if query wont fit the ctx window
        if (query_size > (size_t)params.n_ctx) {
@ -417,7 +479,8 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
            // Tokenize the query
-            query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
+            query_embd.resize(ending_tokens[ending_idx].size() - context_size);
            std::memcpy(query_embd.data(), ending_tokens[ending_idx].data() + context_size, query_embd.size()*sizeof(int));
            query_size = query_embd.size();
            // Stop if query wont fit the ctx window
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@ -87,7 +87,7 @@ int main(int argc, char ** argv) {
        }
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
        auto next_token = llama_sample_token(ctx, &candidates_p);
-        auto next_token_str = llama_token_to_str(ctx, next_token);
+        auto next_token_str = llama_token_to_piece(ctx, next_token);
        last_n_tokens_data.push_back(next_token);
        printf("%s", next_token_str.c_str());
@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
        }
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
        auto next_token = llama_sample_token(ctx2, &candidates_p);
-        auto next_token_str = llama_token_to_str(ctx2, next_token);
+        auto next_token_str = llama_token_to_piece(ctx2, next_token);
        last_n_tokens_data.push_back(next_token);
        printf("%s", next_token_str.c_str());
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -77,34 +77,31 @@ You need to have [Node.js](https://nodejs.org/en) installed.
 ```bash
 mkdir llama-client
 cd llama-client
 npm init
 npm install axios
 ```
 Create an index.js file and put this inside it:
 ```javascript
 const axios = require("axios");
 const prompt = `Building a website can be done in 10 simple steps:`;
 async function Test() {
-    let result = await axios.post("http://127.0.0.1:8080/completion", {
+    let response = await fetch("http://127.0.0.1:8080/completion", {
-        prompt,
+        method: 'POST',
-        n_predict: 512,
+        body: JSON.stringify({
-    });
+            prompt,
-
+            n_predict: 512,
-    // the response is received until completion finish
+        })
-    console.log(result.data.content);
+    })
    console.log((await response.json()).content)
 }
-Test();
+Test()
 ```
 And run it:
 ```bash
-node .
+node index.js
 ```
 ## API Endpoints
@ -167,6 +164,12 @@ node .
    Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
 -   **POST** `/detokenize`: Convert tokens to text.
    *Options:*
    `tokens`: Set the tokens to detokenize.
 -   **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
    *Options:*
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@ -102,6 +102,17 @@
      padding: 0.5em;
    }
    .prob-set {
      padding: 0.3em;
      border-bottom: 1px solid #ccc;
    }
    .popover-content {
      position: absolute;
      background-color: white;
      padding: 0.2em;
      box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
    }
    textarea {
      padding: 5px;
@ -133,11 +144,17 @@
      font-size: 80%;
      color: #888;
    }
    @media (prefers-color-scheme: dark) {
      .popover-content {
        background-color: black;
      }
    }
  </style>
  <script type="module">
    import {
-      html, h, signal, effect, computed, render, useSignal, useEffect, useRef
+      html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
    } from '/index.js';
    import { llama } from '/completion.js';
@ -168,6 +185,7 @@
      mirostat_tau: 5, // target entropy
      mirostat_eta: 0.1, // learning rate
      grammar: '',
      n_probs: 0, // no completion_probabilities
    })
    /* START: Support for storing prompt templates and parameters in browser LocalStorage */
@ -334,10 +352,21 @@
      const prompt = template(session.value.template, {
        message: msg,
-        history: session.value.transcript.flatMap(([name, message]) => template(session.value.historyTemplate, {name, message})).join("\n"),
+        history: session.value.transcript.flatMap(
          ([name, data]) =>
            template(
              session.value.historyTemplate,
              {
                name,
                message: Array.isArray(data) ?
                  data.map(msg => msg.content).join('').replace(/^\s/, '') :
                  data,
              }
            )
        ).join("\n"),
      });
-      let currentMessage = '';
+      const currentMessages = [];
      const history = session.value.transcript
      const llamaParams = {
@ -347,15 +376,19 @@
      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
        const data = chunk.data;
        currentMessage += data.content;
        // remove leading whitespace
        currentMessage = currentMessage.replace(/^\s+/, "")
        transcriptUpdate([...history, ["{{char}}", currentMessage]])
        if (data.stop) {
-          console.log("Completion finished: '", currentMessage, "', summary: ", data);
+          while (
            currentMessages.length > 0 &&
            currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
          ) {
            currentMessages.pop();
          }
          transcriptUpdate([...history, ["{{char}}", currentMessages]])
          console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
        } else {
          currentMessages.push(data);
          transcriptUpdate([...history, ["{{char}}", currentMessages]])
        }
        if (data.timings) {
@ -420,8 +453,18 @@
        }
      }, [messages])
-      const chatLine = ([user, msg]) => {
+      const chatLine = ([user, data], index) => {
-        return html`<p key=${msg}><strong>${template(user)}:</strong> <${Markdownish} text=${template(msg)} /></p>`
+        let message
        const isArrayMessage = Array.isArray(data)
        if (params.value.n_probs > 0 && isArrayMessage) {
          message = html`<${Probabilities} data=${data} />`
        } else {
          const text = isArrayMessage ?
            data.map(msg => msg.content).join('').replace(/^\s+/, '') :
            data;
          message = html`<${Markdownish} text=${template(text)} />`
        }
        return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
      };
      return html`
@ -568,10 +611,71 @@
              ${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
              ${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
            </fieldset>
            <fieldset>
              ${IntField({label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs})}
            </fieldset>
          </details>
        </form>
      `
    }
    // Background colour for a probability p: the red channel shrinks and the green channel
    // grows linearly with p (both capped at 192); alpha is fixed at 0.3.
    const probColor = (p) => {
      const green = Math.floor(192 * p);
      const red = Math.floor(192 * (1 - p));
      return `rgba(${red},${green},0,0.3)`;
    }
    // Renders every entry of params.data:
    //  - an entry without completion_probabilities is returned as its plain content;
    //  - an entry with several completion_probabilities is split into one entry per
    //    probability and rendered recursively;
    //  - an entry with exactly one is wrapped in a Popover whose background colour comes
    //    from probColor() and whose popup lists all candidates in `probs`.
    const Probabilities = (params) => {
      return params.data.map(msg => {
        const { completion_probabilities } = msg;
        // nothing to visualise for this entry: fall back to the raw text
        if (
          !completion_probabilities ||
          completion_probabilities.length === 0
        ) return msg.content
        if (completion_probabilities.length > 1) {
          // Not for byte pair: entries whose first content starts with 'byte: \' stay plain text
          if (completion_probabilities[0].content.startsWith('byte: \\')) return msg.content
          // re-shape each probability into its own message-like object and recurse
          const splitData = completion_probabilities.map(prob => ({
            content: prob.content,
            completion_probabilities: [prob]
          }))
          return html`<${Probabilities} data=${splitData} />`
        }
        const { probs, content } = completion_probabilities[0]
        // candidate whose text equals the emitted content; none found => transparent background
        const found = probs.find(p => p.tok_str === msg.content)
        const pColor = found ? probColor(found.prob) : 'transparent'
        // popup body: one row per candidate, the row matching `content` is highlighted
        const popoverChildren = html`
          <div class="prob-set">
            ${probs.map((p, index) => {
              return html`
                <div
                  key=${index}
                  title=${`prob: ${p.prob}`}
                  style=${{
                    padding: '0.3em',
                    backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
                  }}
                >
                  <span>${p.tok_str}: </span>
                  <span>${Math.floor(p.prob * 100)}%</span>
                </div>
              `
            })}
          </div>
        `
        // content containing a newline is shown as a <br /> instead of the raw text
        return html`
          <${Popover} style=${{ backgroundColor: pColor }} popoverChildren=${popoverChildren}>
            ${msg.content.match(/\n/gim) ? html`<br />` : msg.content}
          </>
        `
      });
    }
    // poor mans markdown replacement
    const Markdownish = (params) => {
      const md = params.text
@ -600,10 +704,121 @@
      `
    }
    // simple popover impl
    // Clickable <span> (props.children, styled with props.style) that toggles a popup.
    // While open, props.popoverChildren is rendered through Portal into "#portal",
    // absolutely positioned at the span's bottom-left corner.
    const Popover = (props) => {
      const isOpen = useSignal(false);
      const position = useSignal({ top: '0px', left: '0px' });
      const buttonRef = useRef(null);   // the clickable span
      const popoverRef = useRef(null);  // the popup <div>
      // recompute the popup position from the span's bounding box (plus page scroll), then flip the open state
      const togglePopover = () => {
        if (buttonRef.current) {
          const rect = buttonRef.current.getBoundingClientRect();
          position.value = {
            top: `${rect.bottom + window.scrollY}px`,
            left: `${rect.left + window.scrollX}px`,
          };
        }
        isOpen.value = !isOpen.value;
      };
      // close the popup when a mousedown lands outside both the popup and the span
      const handleClickOutside = (event) => {
        if (popoverRef.current && !popoverRef.current.contains(event.target) && !buttonRef.current.contains(event.target)) {
          isOpen.value = false;
        }
      };
      // document-level listener; the function returned from the effect removes it again
      useEffect(() => {
        document.addEventListener('mousedown', handleClickOutside);
        return () => {
          document.removeEventListener('mousedown', handleClickOutside);
        };
      }, []);
      return html`
        <span style=${props.style} ref=${buttonRef} onClick=${togglePopover}>${props.children}</span>
        ${isOpen.value && html`
          <${Portal} into="#portal">
            <div
              ref=${popoverRef}
              class="popover-content"
              style=${{
                top: position.value.top,
                left: position.value.left,
              }}
            >
              ${props.popoverChildren}
            </div>
          </${Portal}>
        `}
      `;
    };
    // Source: preact-portal (https://github.com/developit/preact-portal/blob/master/src/preact-portal.js)
    /** Redirect rendering of descendants into the given CSS selector */
    class Portal extends Component {
      // Schedule a re-render of the remote layer (deferred with setTimeout) as soon as one
      // entry of the received `props` argument differs from this.props.
      // NOTE(review): `props` is presumably the previous props supplied by the framework — confirm.
      componentDidUpdate(props) {
        for (let i in props) {
          if (props[i] !== this.props[i]) {
            return setTimeout(this.renderLayer);
          }
        }
      }
      // Mark the component as mounted, bind renderLayer so it can be passed as a bare
      // callback (see setTimeout above), and render the layer for the first time.
      componentDidMount() {
        this.isMounted = true;
        this.renderLayer = this.renderLayer.bind(this);
        this.renderLayer();
      }
      // Render the layer once more without children, then detach the remote node if it is
      // still attached to a parent.
      componentWillUnmount() {
        this.renderLayer(false);
        this.isMounted = false;
        if (this.remote && this.remote.parentNode) this.remote.parentNode.removeChild(this.remote);
      }
      // Accepts either a CSS selector string (resolved with document.querySelector) or a node.
      findNode(node) {
        return typeof node === 'string' ? document.querySelector(node) : node;
      }
      // Render this.props.children (or nothing when `show` is false) into the node named by
      // this.props.into, wrapped in a PortalProxy that receives this.context.
      renderLayer(show = true) {
        if (!this.isMounted) return;
        // clean up old node if moving bases:
        if (this.props.into !== this.intoPointer) {
          this.intoPointer = this.props.into;
          if (this.into && this.remote) {
            // render an empty proxy into the previous target before switching
            this.remote = render(html`<${PortalProxy} />`, this.into, this.remote);
          }
          this.into = this.findNode(this.props.into);
        }
        this.remote = render(html`
          <${PortalProxy} context=${this.context}>
            ${show && this.props.children || null}
          </${PortalProxy}>
        `, this.into, this.remote);
      }
      // The component itself renders nothing in place; all output goes through renderLayer().
      render() {
        return null;
      }
    }
    // high-order component that renders its first child if it exists.
    // used as a conditional rendering proxy.
    class PortalProxy extends Component {
      // Hand the `context` prop (set by Portal.renderLayer) on as the child context.
      getChildContext() {
        return this.props.context;
      }
      // Render the children as-is, or nothing when there are none.
      render({ children }) {
        return children || null;
      }
    }
    function App(props) {
      return html`
-        <div id="container">
+        <div>
          <header>
            <h1>llama.cpp</h1>
          </header>
@ -624,11 +839,13 @@
      `;
    }
-    render(h(App), document.body);
+    render(h(App), document.querySelector('#container'));
  </script>
 </head>
 <body>
  <div id="container"></div>
  <div id="portal"></div>
 </body>
 </html>
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -94,7 +94,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
    std::string ret;
    for (; begin != end; ++begin)
    {
-        ret += llama_token_to_str(ctx, *begin);
+        ret += llama_token_to_piece(ctx, *begin);
    }
    return ret;
 }
@ -123,9 +123,10 @@ static void server_log(const char *level, const char *function, int line,
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
-    std::string out = token == -1 ? "" : llama_token_to_str(ctx, token);
+    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
-    // if first bit is 1, meaning it's a partial character
+    // if the size is 1 and first bit is 1, meaning it's a partial character
-    if (out.size() > 0 && (out[0] & 0x80) == 0x80)
+    //   (size > 1 meaning it's already a known token)
    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
    {
        std::stringstream ss;
        ss << std::hex << (out[0] & 0xff);
@ -285,7 +286,6 @@ struct llama_server_context
                    std::vector<llama_token> p;
                    if (first)
                    {
                        s.insert(0, 1, ' '); // add a space if it's the first
                        p = ::llama_tokenize(ctx, s, add_bos);
                        first = false;
                    }
@ -308,7 +308,6 @@ struct llama_server_context
        else
        {
            auto s = json_prompt.template get<std::string>();
            s.insert(0, 1, ' '); // always add a first space
            prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
        }
@ -565,7 +564,7 @@ struct llama_server_context
        if (!embd.empty() && embd.back() == llama_token_eos(ctx))
        {
-            // stopping_word = llama_token_to_str(ctx, embd.back());
+            // stopping_word = llama_token_to_piece(ctx, embd.back());
            has_next_token = false;
            stopped_eos = true;
            LOG_VERBOSE("eos token found", {});
@ -612,7 +611,7 @@ struct llama_server_context
    {
        const completion_token_output token_with_probs = nextToken();
-        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(ctx, token_with_probs.tok);
+        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
        generated_text += token_text;
        if (params.n_probs > 0)
@ -1103,6 +1102,12 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
        {"tokens", tokens}};
 }
 // Builds the JSON body for a detokenize reply: an object whose single
 // "content" key holds the detokenized text.
 static json format_detokenized_response(std::string content)
 {
    json response{
        {"content", content}};
    return response;
 }
 template <typename T>
 static T json_value(const json &body, const std::string &key, const T &default_value)
 {
@ -1208,6 +1213,62 @@ static void log_server_request(const Request &req, const Response &res)
                           });
 }
 // Returns true when the token sequence is non-empty and its last token equals
 // the end-of-sequence token reported by llama_token_eos() for the server's context.
 bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) {
    // An empty sequence can never be at end-of-beam (and must not be indexed).
    if (n_tokens == 0) {
        return false;
    }
    const llama_token last_token = tokens[n_tokens - 1];
    return last_token == llama_token_eos(server_context.ctx);
 }
 // Function matching type llama_beam_search_callback_fn_t.
 // Custom callback example, called each time the beam lengths increase and also
 // when the stop condition is met:
 //  * Marks every beam whose last token is EOS as end-of-beam (eob).
 //  * Shows progress by printing ',' followed by the number of convergent beam tokens, if any.
 //  * When all beams converge to a common prefix, that prefix is available in
 //    beams_state.beam_views[0]; its tokens are appended to generated_token_probs
 //    of the llama_server_context that callback_data points to.
 //
 // callback_data: must point to a llama_server_context (it is cast unconditionally).
 // beams_state:   current beam views plus the length of their common prefix.
 void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
    auto & llama = *static_cast<llama_server_context*>(callback_data);
    // Mark beams as EOS as needed.
    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
        llama_beam_view& beam_view = beams_state.beam_views[i];
        if (!beam_view.eob && is_at_eob(llama, beam_view.tokens, beam_view.n_tokens)) {
            beam_view.eob = true;
        }
    }
    printf(",");  // Show progress
    if (const size_t n = beams_state.common_prefix_length) {
        // Grow the vector first so the transform below can write into the n new slots.
        llama.generated_token_probs.resize(llama.generated_token_probs.size() + n);
        assert(0u < beams_state.n_beams);
        const llama_token * tokens = beams_state.beam_views[0].tokens;
        // Only the token id is filled in; the first member is left value-initialized.
        const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
        std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map);
        // %zu is the portable conversion for size_t; %lu is wrong wherever
        // size_t is not unsigned long.
        printf("%zu", n);
    }
    fflush(stdout);
 #if 0 // DEBUG: print current beams for this iteration
    // NOTE(review): ostream_beam_view is not defined in the visible part of this
    // file - confirm it is available before enabling this block.
    std::cout << "\n\nCurrent beams:\n";
    for (size_t i=0 ; i < beams_state.n_beams ; ++i) {
        std::cout << "beams["<<i<<"]: " << ostream_beam_view{llama.ctx,beams_state.beam_views[i]} << std::endl;
    }
 #endif
 }
 struct token_translator {
    llama_context * ctx;
    std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
    std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); }
 };
 // Appends the text piece of every entry in llama.generated_token_probs to
 // llama.generated_text, growing the string's capacity once up front.
 void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
    const token_translator to_piece{llama.ctx};
    auto & text = llama.generated_text;

    // First pass: total number of bytes that are going to be appended.
    size_t extra_len = 0;
    for (const completion_token_output & entry : llama.generated_token_probs) {
        extra_len += to_piece(entry).size();
    }

    // Reserve only when growth is required, so a larger existing capacity is left untouched.
    const size_t needed = text.size() + extra_len;
    if (text.capacity() < needed) {
        text.reserve(needed);
    }

    // Second pass: append each piece in order.
    for (const completion_token_output & entry : llama.generated_token_probs) {
        text += to_piece(entry);
    }
 }
 int main(int argc, char **argv)
 {
    // own arguments required by this example
@ -1290,22 +1351,30 @@ int main(int argc, char **argv)
        llama.beginCompletion();
        if (!llama.stream) {
-            size_t stop_pos = std::string::npos;
+            if (llama.params.n_beams) {
                // Fill llama.generated_token_probs vector with final beam.
                llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams,
                                  llama.n_past, llama.n_remain, llama.params.n_threads);
                // Translate llama.generated_token_probs to llama.generated_text.
                append_to_generated_text_from_generated_token_probs(llama);
            } else {
                size_t stop_pos = std::string::npos;
-            while (llama.has_next_token) {
+                while (llama.has_next_token) {
-                const completion_token_output token_with_probs = llama.doCompletion();
+                    const completion_token_output token_with_probs = llama.doCompletion();
-                const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
+                    const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(llama.ctx, token_with_probs.tok);
-                stop_pos = llama.findStoppingStrings(llama.generated_text,
+                    stop_pos = llama.findStoppingStrings(llama.generated_text,
-                    token_text.size(), STOP_FULL);
+                        token_text.size(), STOP_FULL);
-            }
+                }
-            if (stop_pos == std::string::npos) {
+                if (stop_pos == std::string::npos) {
-                stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
+                    stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
-            }
+                }
-            if (stop_pos != std::string::npos) {
+                if (stop_pos != std::string::npos) {
-                llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
+                    llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
-                    llama.generated_text.end());
+                        llama.generated_text.end());
                }
            }
            const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs);
@ -1321,59 +1390,86 @@ int main(int argc, char **argv)
                while (llama.has_next_token) {
                    const completion_token_output token_with_probs = llama.doCompletion();
-                    const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
+                    if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
                    if (llama.multibyte_pending > 0) {
                        continue;
                    }
                    const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok);
                    size_t pos = std::min(sent_count, llama.generated_text.size());
                    const std::string str_test = llama.generated_text.substr(pos);
                    bool is_stop_full = false;
                    size_t stop_pos =
                        llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
                    if (stop_pos != std::string::npos) {
                        is_stop_full = true;
                        llama.generated_text.erase(
                            llama.generated_text.begin() + pos + stop_pos,
                            llama.generated_text.end());
                        pos = std::min(sent_count, llama.generated_text.size());
                    } else {
                        is_stop_full = false;
                        stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
                            STOP_PARTIAL);
                    }
-                    const std::string to_send = llama.generated_text.substr(pos, stop_pos);
+                    if (
-                    sent_count += to_send.size();
+                        stop_pos == std::string::npos ||
                        // Send rest of the text if we are at the end of the generation
                        (!llama.has_next_token && !is_stop_full && stop_pos > 0)
                    ) {
                        const std::string to_send = llama.generated_text.substr(pos, std::string::npos);
-                    std::vector<completion_token_output> probs_output = {};
+                        sent_count += to_send.size();
-                    if (llama.params.n_probs > 0) {
+                        std::vector<completion_token_output> probs_output = {};
-                        const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
+
-                        size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
+                        if (llama.params.n_probs > 0) {
-                        size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
+                            const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
-                        if (probs_pos < probs_stop_pos) {
+                            size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
-                            probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
+                            size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
                            if (probs_pos < probs_stop_pos) {
                                probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
                            }
                            sent_token_probs_index = probs_stop_pos;
                        }
                        const json data = format_partial_response(llama, to_send, probs_output);
                        const std::string str =
                            "data: " +
                            data.dump(-1, ' ', false, json::error_handler_t::replace) +
                            "\n\n";
                        LOG_VERBOSE("data stream", {
                            { "to_send", str }
                        });
                        if (!sink.write(str.data(), str.size())) {
                            LOG_VERBOSE("stream closed", {});
                            llama_print_timings(llama.ctx);
                            return false;
                        }
                        sent_token_probs_index = probs_stop_pos;
                    }
-                    const json data = llama.has_next_token
+                    if (!llama.has_next_token) {
-                                          ? format_partial_response(llama, to_send, probs_output)
+                        // Generation is done, send extra information.
-                                          // Generation is done, send extra information.
+                        const json data = format_final_response(llama, "", llama.generated_token_probs);
                                          : format_final_response(llama, to_send, llama.generated_token_probs);
-                    const std::string str =
+                        const std::string str =
-                        "data: " +
+                            "data: " +
-                        data.dump(-1, ' ', false, json::error_handler_t::replace) +
+                            data.dump(-1, ' ', false, json::error_handler_t::replace) +
-                        "\n\n";
+                            "\n\n";
-                    LOG_VERBOSE("data stream", {
+                        LOG_VERBOSE("data stream", {
-                        { "to_send", str }
+                            { "to_send", str }
-                    });
+                        });
-                    if (!sink.write(str.data(), str.size())) {
+                        if (!sink.write(str.data(), str.size())) {
-                        LOG_VERBOSE("stream closed", {});
+                            LOG_VERBOSE("stream closed", {});
-                        llama_print_timings(llama.ctx);
+                            llama_print_timings(llama.ctx);
-                        return false;
+                            return false;
                        }
                    }
                }
@ -1409,6 +1505,21 @@ int main(int argc, char **argv)
        const json data = format_tokenizer_response(tokens);
        return res.set_content(data.dump(), "application/json"); });
    svr.Post("/detokenize", [&llama](const Request &req, Response &res)
             {
        auto lock = llama.lock();
        const json body = json::parse(req.body);
        std::string content;
        if (body.count("tokens") != 0)
        {
            const std::vector<llama_token> tokens = body["tokens"];
            content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
        }
        const json data = format_detokenized_response(content);
        return res.set_content(data.dump(), "application/json"); });
    svr.Post("/embedding", [&llama](const Request &req, Response &res)
             {
        auto lock = llama.lock();
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@ -63,7 +63,7 @@ int main(int argc, char ** argv) {
    fprintf(stderr, "\n\n");
    for (auto id : tokens_list) {
-        fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
+        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
    }
    fflush(stderr);
@ -112,7 +112,7 @@ int main(int argc, char ** argv) {
        }
        // print the new token :
-        printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
+        printf("%s", llama_token_to_piece(ctx, new_token_id).c_str());
        fflush(stdout);
        // push this new token for next evaluation
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@ -1150,7 +1150,7 @@ void print_matrix(struct ggml_tensor * probs) {
 void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token).c_str());
+    printf("%s", llama_token_to_piece(ctx, token).c_str());
 }
 void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@ -1331,7 +1331,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
        const char * in  = buf.data();
        const char * end = buf.data() + buf.size();
        for (int i = 0; i < (int) out.size(); ++i) {
-            std::string s = llama_token_to_str(lctx, out[i]);
+            std::string s = llama_token_to_piece(lctx, out[i]);
            int len = s.length();
            if (in >= end) {
                printf("%s: unexpected end of original text.\n", __func__);
--- a/flake.lock
+++ b/flake.lock
@ -5,11 +5,11 @@
        "systems": "systems"
      },
      "locked": {
-        "lastModified": 1685518550,
+        "lastModified": 1692799911,
-        "narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
+        "narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=",
        "owner": "numtide",
        "repo": "flake-utils",
-        "rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
+        "rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44",
        "type": "github"
      },
      "original": {
@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1685931219,
+        "lastModified": 1692913444,
-        "narHash": "sha256-8EWeOZ6LKQfgAjB/USffUSELPRjw88A+xTcXnOUvO5M=",
+        "narHash": "sha256-1SvMQm2DwofNxXVtNWWtIcTh7GctEVrS/Xel/mdc6iY=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "7409480d5c8584a1a83c422530419efe4afb0d19",
+        "rev": "18324978d632ffc55ef1d928e81630c620f4f447",
        "type": "github"
      },
      "original": {
--- a/flake.nix
+++ b/flake.nix
@ -6,6 +6,9 @@
  outputs = { self, nixpkgs, flake-utils }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        name = "llama.cpp";
        src = ./.;
        meta.mainProgram = "llama";
        inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
        buildInputs = with pkgs; [ openmpi ];
        osSpecific = with pkgs; buildInputs ++
@ -21,11 +24,17 @@
              CoreGraphics
              CoreVideo
            ]
          else if isDarwin then
            with pkgs.darwin.apple_sdk.frameworks; [
              Accelerate
              CoreGraphics
              CoreVideo
            ]
          else
            with pkgs; [ openblas ]
        );
        pkgs = import nixpkgs { inherit system; };
-        nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
+        nativeBuildInputs = with pkgs; [ cmake ninja pkgconfig ];
        llama-python =
          pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
        postPatch = ''
@ -38,35 +47,35 @@
          mv $out/bin/server $out/bin/llama-server
        '';
        cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
-      in {
+      in
      {
        packages.default = pkgs.stdenv.mkDerivation {
-          name = "llama.cpp";
+          inherit name src meta postPatch nativeBuildInputs buildInputs postInstall;
          src = ./.;
          postPatch = postPatch;
          nativeBuildInputs = nativeBuildInputs;
          buildInputs = osSpecific;
          cmakeFlags = cmakeFlags
            ++ (if isAarch64 && isDarwin then [
-              "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
+            "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
-              "-DLLAMA_METAL=ON"
+            "-DLLAMA_METAL=ON"
-            ] else [
+          ] else [
-              "-DLLAMA_BLAS=ON"
+            "-DLLAMA_BLAS=ON"
-              "-DLLAMA_BLAS_VENDOR=OpenBLAS"
+            "-DLLAMA_BLAS_VENDOR=OpenBLAS"
          ]);
          postInstall = postInstall;
          meta.mainProgram = "llama";
        };
        packages.opencl = pkgs.stdenv.mkDerivation {
-          name = "llama.cpp";
+          inherit name src meta postPatch nativeBuildInputs postInstall;
          src = ./.;
          postPatch = postPatch;
          nativeBuildInputs = nativeBuildInputs;
          buildInputs = with pkgs; buildInputs ++ [ clblast ];
          cmakeFlags = cmakeFlags ++ [
            "-DLLAMA_CLBLAST=ON"
          ];
-          postInstall = postInstall;
+        };
-          meta.mainProgram = "llama";
+        packages.rocm = pkgs.stdenv.mkDerivation {
          inherit name src meta postPatch nativeBuildInputs postInstall;
          buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ];
          cmakeFlags = cmakeFlags ++ [
            "-DLLAMA_HIPBLAS=1"
            "-DCMAKE_C_COMPILER=hipcc"
            "-DCMAKE_CXX_COMPILER=hipcc"
            "-DCMAKE_POSITION_INDEPENDENT_CODE=ON"
          ];
        };
        apps.llama-server = {
          type = "app";
@ -80,8 +89,13 @@
          type = "app";
          program = "${self.packages.${system}.default}/bin/llama";
        };
        apps.quantize = {
          type = "app";
          program = "${self.packages.${system}.default}/bin/quantize";
        };
        apps.default = self.apps.${system}.llama;
        devShells.default = pkgs.mkShell {
          buildInputs = [ llama-python ];
          packages = nativeBuildInputs ++ osSpecific;
        };
      });
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@ -8,6 +8,7 @@
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
 //#define GGML_ALLOCATOR_DEBUG
@ -67,8 +68,8 @@ struct ggml_allocr {
    struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
    size_t max_size;
    bool measure;
-    int parse_seq[GGML_MAX_NODES];
+    int parse_seq[GGML_MAX_CONCUR];
-    bool has_parse_seq;
+    int parse_seq_len;
 #ifdef GGML_ALLOCATOR_DEBUG
    struct ggml_tensor * allocated_tensors[1024];
@ -243,14 +244,10 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
 }
 void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
    int pos = 0;
    for (int i = 0; i < n; i++) {
-        if (list[i] != -1) {
+        alloc->parse_seq[i] = list[i];
            alloc->parse_seq[pos] = list[i];
            pos++;
        }
    }
-    alloc->has_parse_seq = true;
+    alloc->parse_seq_len = n;
 }
 void ggml_allocr_reset(struct ggml_allocr * alloc) {
@ -273,7 +270,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
        /*.max_size      = */ 0,
        /*.measure       = */ false,
        /*.parse_seq     = */ {0},
-        /*.has_parse_seq = */ false,
+        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
        /*.allocated_tensors = */ = {0},
 #endif
@ -302,7 +299,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
        /*.max_size      = */ 0,
        /*.measure       = */ true,
        /*.parse_seq     = */ {0},
-        /*.has_parse_seq = */ false,
+        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
        /*.allocated_tensors = */ = {0},
 #endif
@ -449,8 +446,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                        else {
                            AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                            node->data = parent->data;
                            return;
                        }
                        return;
                    }
                }
            }
@ -501,69 +498,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                allocate_node(alloc, input);
            }
        }
-        for (int ind = 0; ind < gf->n_nodes; ind++) {
+        // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
-            int i;
+        int last_barrier_pos = 0;
-            if (alloc->has_parse_seq) {
+        int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
                i = alloc->parse_seq[ind];
            } else {
                i = ind;
            }
            struct ggml_tensor * node = gf->nodes[i];
-            // allocate parents (leafs)
+        for (int ind = 0; ind < n_nodes; ind++) {
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
+            // allocate a node if there is no parse_seq or this is not a barrier
-                struct ggml_tensor * parent = node->src[j];
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
-                if (parent == NULL) {
+                int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
-                    break;
+                struct ggml_tensor * node = gf->nodes[i];
                // allocate parents (leafs)
                for (int j = 0; j < GGML_MAX_SRC; j++) {
                    struct ggml_tensor * parent = node->src[j];
                    if (parent == NULL) {
                        break;
                    }
                    allocate_node(alloc, parent);
                }
-                allocate_node(alloc, parent);
+
                // allocate node
                allocate_node(alloc, node);
                AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
                for (int j = 0; j < GGML_MAX_SRC; j++) {
                    struct ggml_tensor * parent = node->src[j];
                    if (parent == NULL) {
                        break;
                    }
                    AT_PRINTF("%s", parent->name);
                    if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
                        AT_PRINTF(", ");
                    }
                }
                AT_PRINTF("\n");
            }
            // allocate node
            allocate_node(alloc, node);
            AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * parent = node->src[j];
                if (parent == NULL) {
                    break;
                }
                AT_PRINTF("%s", parent->name);
                if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
                    AT_PRINTF(", ");
                }
            }
            AT_PRINTF("\n");
            // update parents
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
+            // update immediately if there is no parse_seq
-                struct ggml_tensor * parent = node->src[j];
+            // update only at barriers if there is parse_seq
-                if (parent == NULL) {
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
-                    break;
+                int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
                int update_end   = alloc->parse_seq_len ? ind              : ind + 1;
                for (int i = update_start; i < update_end; i++) {
                    int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
                    struct ggml_tensor * node = gf->nodes[node_i];
                    for (int j = 0; j < GGML_MAX_SRC; j++) {
                        struct ggml_tensor * parent = node->src[j];
                        if (parent == NULL) {
                            break;
                        }
                        struct hash_node * p_hn = hash_get(ht, parent);
                        p_hn->n_children -= 1;
                        //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
                        if (p_hn->n_children == 0 && p_hn->n_views == 0) {
                            if (ggml_is_view(parent)) {
                                struct ggml_tensor * view_src = get_view_source(parent);
                                struct hash_node * view_src_hn = hash_get(ht, view_src);
                                view_src_hn->n_views -= 1;
                                AT_PRINTF("view_src %s\n", view_src->name);
                                if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
                                    ggml_allocator_free_tensor(alloc, view_src);
                                }
                            }
                            else {
                                if (parent->data != node->data) {
                                    ggml_allocator_free_tensor(alloc, parent);
                                }
                            }
                        }
                    }
                }
-                struct hash_node * p_hn = hash_get(ht, parent);
+                AT_PRINTF("\n");
-                p_hn->n_children -= 1;
+                if (alloc->parse_seq_len) {
-
+                    last_barrier_pos = ind + 1;
                //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
                if (p_hn->n_children == 0 && p_hn->n_views == 0) {
                    if (ggml_is_view(parent)) {
                        struct ggml_tensor * view_src = get_view_source(parent);
                        struct hash_node * view_src_hn = hash_get(ht, view_src);
                        view_src_hn->n_views -= 1;
                        AT_PRINTF("view_src %s\n", view_src->name);
                        if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
                            ggml_allocator_free_tensor(alloc, view_src);
                        }
                    }
                    else {
                        if (parent->data != node->data) {
                            ggml_allocator_free_tensor(alloc, parent);
                        }
                    }
                }
            }
            AT_PRINTF("\n");
        }
        // free graph outputs here that wouldn't be freed otherwise because they have no children
        if (outputs != NULL && outputs[g] != NULL) {
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -6,15 +6,116 @@
 #include <atomic>
 #include <assert.h>
 // Backend selection: when GGML_USE_HIPBLAS is defined, include the HIP/hipBLAS
 // headers and map the CUDA/cuBLAS identifiers used below onto their HIP/hipBLAS
 // names; otherwise include the CUDA headers.
 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
 #ifdef __HIP_PLATFORM_AMD__
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
 #endif
 // Both cuBLAS compute types map to the same hipBLAS 32-bit float type.
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N HIPBLAS_OP_N
 #define CUBLAS_OP_T HIPBLAS_OP_T
 #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
 #define CUBLAS_TF32_TENSOR_OP_MATH 0
 #define CUDA_R_16F  HIPBLAS_R_16F
 #define CUDA_R_32F  HIPBLAS_R_32F
 // The mask argument is dropped: __shfl_xor takes only (var, laneMask, width).
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasCreate hipblasCreate
 #define cublasGemmEx hipblasGemmEx
 #define cublasHandle_t hipblasHandle_t
 // Expands to a success status without calling anything; both arguments are discarded.
 #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
 #define cudaDeviceProp hipDeviceProp_t
 #define cudaDeviceSynchronize hipDeviceSynchronize
 #define cudaError_t hipError_t
 #define cudaEventCreateWithFlags hipEventCreateWithFlags
 #define cudaEventDisableTiming hipEventDisableTiming
 #define cudaEventRecord hipEventRecord
 #define cudaEvent_t hipEvent_t
 #define cudaEventDestroy hipEventDestroy
 #define cudaFree hipFree
 #define cudaFreeHost hipHostFree
 #define cudaGetDevice hipGetDevice
 #define cudaGetDeviceCount hipGetDeviceCount
 #define cudaGetDeviceProperties hipGetDeviceProperties
 #define cudaGetErrorString hipGetErrorString
 #define cudaGetLastError hipGetLastError
 #define cudaMalloc hipMalloc
 // hipHostMalloc takes an extra flags argument; hipHostMallocDefault is passed.
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpy2DAsync hipMemcpy2DAsync
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamSynchronize hipStreamSynchronize
 // hipStreamWaitEvent takes an extra flags argument; 0 is passed.
 #define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #else
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
 #endif
 #include "ggml-cuda.h"
 #include "ggml.h"
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #ifndef CC_TURING
 #define CC_TURING   700
 #endif
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 // HIP-build definition of __vsubss4 (this code is only compiled under GGML_USE_HIPBLAS).
 // Reinterprets a and b as vectors of four int8 lanes, subtracts them lane by lane
 // with __builtin_elementwise_sub_sat (saturating subtraction), and returns the
 // four result lanes repacked into a single int.
 static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
    // View each 32-bit argument as four packed 8-bit lanes.
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
    // Repack the four lanes into the int return value.
    return reinterpret_cast<const int&>(c);
 }
 // HIP-build definition of __dp4a (this code is only compiled under GGML_USE_HIPBLAS).
 // Treats a and b as four packed 8-bit lanes each, and returns c plus the sum of the
 // four lane-wise products (see the generic fallback branch for the reference form).
 // The implementation is selected at compile time from the target architecture macro.
 static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
    // These targets use the sdot4 builtin.
    c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(__gfx1100__)
    // gfx1100 uses the sudot4 builtin instead.
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
 #elif defined(__gfx1010__) || defined(__gfx900__)
    // Inline-asm path: byte pairs 0/1 and then 2/3 are sign-extended and multiplied
    // into the two temporaries, and each pair of products is added into c with v_add3_u32.
    int tmp1;
    int tmp2;
    asm("\n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        "
        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
        : "v"(a), "v"(b)
    );
 #else
    // Generic fallback for every other target: plain lane-by-lane multiply-accumulate.
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
 #endif
    return c;
 }
 #endif
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@ -205,11 +306,11 @@ typedef struct {
 #define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
-    half    d[2];              // super-block scales/mins
+    half    dm[2];             // super-block scales/mins
    uint8_t scales[2];         // 4-bit block scales/mins
    uint8_t qs[QK_K/2];        // 4--bit quants
 } block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
 #else
 typedef struct {
    half2 dm;                  // super-block scale for quantized scales/mins
@ -424,8 +525,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q4_1 * x = (const block_q4_1 *) vx;
-    const dfloat d = x[ib].dm.x;
+    const dfloat d = __low2half(x[ib].dm);
-    const dfloat m = x[ib].dm.y;
+    const dfloat m = __high2half(x[ib].dm);
    const int vui = x[ib].qs[iqs];
@ -467,8 +568,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q5_1 * x = (const block_q5_1 *) vx;
-    const dfloat d = x[ib].dm.x;
+    const dfloat d = __low2half(x[ib].dm);
-    const dfloat m = x[ib].dm.y;
+    const dfloat m = __high2half(x[ib].dm);
    uint32_t qh;
    memcpy(&qh, x[ib].qh, sizeof(qh));
@ -520,8 +621,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
    const uint8_t q = x[i].qs[32*n + l];
    float * y = yy + i*QK_K + 128*n;
-    float dall = x[i].dm.x;
+    float dall = __low2half(x[i].dm);
-    float dmin = x[i].dm.y;
+    float dmin = __high2half(x[i].dm);
    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@ -531,8 +632,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
    const int il = tid%16;  // 0...15
    const uint8_t q = x[i].qs[il] >> (2*is);
    float * y = yy + i*QK_K + 16*is + il;
-    float dall = x[i].dm.x;
+    float dall = __low2half(x[i].dm);
-    float dmin = x[i].dm.y;
+    float dmin = __high2half(x[i].dm);
    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
 #endif
@ -618,8 +719,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
    float * y = yy + i*QK_K + 64*il + n*ir;
-    const float dall = x[i].dm.x;
+    const float dall = __low2half(x[i].dm);
-    const float dmin = x[i].dm.y;
+    const float dmin = __high2half(x[i].dm);
    const uint8_t * q = x[i].qs + 32*il + n*ir;
@ -636,8 +737,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
    const int tid = threadIdx.x;
    const uint8_t * q = x[i].qs;
    float * y = yy + i*QK_K;
-    const float d = (float)x[i].d[0];
+    const float d = (float)x[i].dm[0];
-    const float m = (float)x[i].d[1];
+    const float m = (float)x[i].dm[1];
    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
 #endif
@ -657,8 +758,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
    float * y = yy + i*QK_K + 64*il + 2*ir;
-    const float dall = x[i].dm.x;
+    const float dall = __low2half(x[i].dm);
-    const float dmin = x[i].dm.y;
+    const float dmin = __high2half(x[i].dm);
    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
    const uint8_t * qh = x[i].qh + 2*ir;
@ -770,8 +871,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
        const float   * y = yy + i * QK_K + y_offset;
        const uint8_t * q = x[i].qs + q_offset;
-        const float dall = x[i].dm.x;
+        const float dall = __low2half(x[i].dm);
-        const float dmin = x[i].dm.y;
+        const float dmin = __high2half(x[i].dm);
        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
        aux[0] = a[0] & 0x0f0f0f0f;
@ -991,8 +1092,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
        const float   * y1 = yy + i*QK_K + y_offset;
        const float   * y2 = y1 + 128;
-        const float dall = x[i].dm.x;
+        const float dall = __low2half(x[i].dm);
-        const float dmin = x[i].dm.y;
+        const float dmin = __high2half(x[i].dm);
        const uint16_t * a = (const uint16_t *)x[i].scales;
        aux[0] = a[im+0] & kmask1;
@ -1054,8 +1155,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
        const uint16_t * a = (const uint16_t *)x[i].scales;
        aux16[0] = a[0] & 0x0f0f;
        aux16[1] = (a[0] >> 4) & 0x0f0f;
-        const float d = (float)x[i].d[0];
+        const float d = (float)x[i].dm[0];
-        const float m = (float)x[i].d[1];
+        const float m = (float)x[i].dm[1];
        float sum = 0.f;
        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
@ -1124,8 +1225,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
        const float   * y1  = yy + i*QK_K + y_offset;
        const float   * y2  = y1 + 128;
-        const float dall = x[i].dm.x;
+        const float dall = __low2half(x[i].dm);
-        const float dmin = x[i].dm.y;
+        const float dmin = __high2half(x[i].dm);
        const uint16_t * a = (const uint16_t *)x[i].scales;
        aux[0] = a[im+0] & kmask1;
@ -1348,8 +1449,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
        return;
    }
-    y[ib].ds.x = d;
+    reinterpret_cast<half&>(y[ib].ds.x) = d;
-    y[ib].ds.y = sum;
+    reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@ -2346,7 +2447,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
    }
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
 }
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@ -2432,7 +2533,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 #pragma unroll
    for (int i = 0; i < QR2_K; ++ i) {
        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
    }
    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@ -2551,7 +2652,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
 #pragma unroll
    for (int i = 0; i < QR3_K; ++i) {
        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
    }
    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@ -2720,7 +2821,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
    for (int i = 0; i < QR4_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds.x;
+        d8[i] = __low2half(bq8i->ds);
        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
        u[2*i+0] = q8[0];
@ -2744,11 +2845,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
    aux16[0] = a[0] & 0x0f0f;
    aux16[1] = (a[0] >> 4) & 0x0f0f;
-    const float dall = bq4_K->d[0];
+    const float dall = bq4_K->dm[0];
-    const float dmin = bq4_K->d[1];
+    const float dmin = bq4_K->dm[1];
-    const float d8_1 = bq8_1[0].ds.x;
+    const float d8_1 = __low2float(bq8_1[0].ds);
-    const float d8_2 = bq8_1[1].ds.x;
+    const float d8_2 = __low2float(bq8_1[1].ds);
    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@ -2828,7 +2929,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
 #if QK_K == 256
        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
 #else
        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
 #endif
    }
 #pragma unroll
@ -2901,7 +3006,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 #pragma unroll
    for (int i = 0; i < QR5_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds.x;
+        d8[i] = __low2float(bq8i->ds);
        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
        u[2*i+0] = q8[0];
@ -2919,8 +3024,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
    const float d = bq5_K->d;
-    const float d8_1 = bq8_1[0].ds.x;
+    const float d8_1 = __low2half(bq8_1[0].ds);
-    const float d8_2 = bq8_1[1].ds.x;
+    const float d8_2 = __low2half(bq8_1[1].ds);
    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@ -3018,7 +3123,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
 #if QK_K == 256
        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
 #endif
    }
 #pragma unroll
@ -3075,7 +3182,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 #pragma unroll
    for (int i = 0; i < QR6_K; ++i) {
        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
    }
    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@ -3243,7 +3350,7 @@ static __device__ __forceinline__ void mul_mat_q(
                    *dsi_dst = *dsi_src;
                } else {
                    float * dfi_dst = (float *) dsi_dst;
-                    *dfi_dst = (*dsi_src).x;
+                    *dfi_dst = __low2half(*dsi_src);
                }
            }
@ -3907,28 +4014,27 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
    dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }
-// TODO: this implementation is wrong!
+static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
-//static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
+                                const float p_delta, const int p_delta_rows, const float theta_scale) {
-//                                const float p_delta, const int p_delta_rows, const float theta_scale) {
+    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
-//    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
-//
+    if (col >= ncols) {
-//    if (col >= ncols) {
+        return;
-//        return;
+    }
-//    }
+
-//
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
-//    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i = row*ncols + col/2;
-//    const int i = row*ncols + col/2;
+
-//
+    const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
-//    const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+    const float sin_theta = sinf(theta);
-//    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);
-//    const float cos_theta = cosf(theta);
+
-//
+    const float x0 = x[i + 0];
-//    const float x0 = x[i + 0];
+    const float x1 = x[i + ncols/2];
-//    const float x1 = x[i + ncols/2];
+
-//
+    dst[i + 0]       = x0*cos_theta - x1*sin_theta;
-//    dst[i + 0]       = x0*cos_theta - x1*sin_theta;
+    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
-//    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+}
 //}
 static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
    const int col = blockDim.x*blockIdx.x + threadIdx.x;
@ -4609,6 +4715,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 #if QK_K == 256
    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = g_compute_capabilities[id];
@ -4640,6 +4748,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
 #endif
 }
 static void ggml_mul_mat_q4_K_q8_1_cuda(
@ -4799,13 +4908,21 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
 static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                          const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-    GGML_ASSERT(nrows % 2 == 0);
+    GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
    const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    const dim3 block_nums(nrows, num_blocks_x, 1);
    rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }
 static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                          const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
    const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    const dim3 block_nums(nrows, num_blocks_x, 1);
    rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }
 static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
    GGML_ASSERT(nrows % 4 == 0);
    const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
@ -4937,10 +5054,18 @@ void ggml_init_cublas() {
    static bool initialized = false;
    if (!initialized) {
 #ifdef __HIP_PLATFORM_AMD__
        // Workaround for a rocBLAS bug when using multiple graphics cards:
        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
        rocblas_initialize();
        CUDA_CHECK(cudaDeviceSynchronize());
 #endif
        CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
        GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
        int64_t total_vram = 0;
-        fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
        for (int id = 0; id < g_device_count; ++id) {
            cudaDeviceProp prop;
            CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
@ -5548,8 +5673,9 @@ inline void ggml_cuda_op_rope(
        const float block_p = max(p - (n_ctx - 2.f), 0.f);
        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
    } else if (is_neox) {
-        GGML_ASSERT(false && "RoPE NeoX not implemented yet");
+        GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
-#pragma message("TODO: implement RoPE NeoX for CUDA")
+        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
        rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
    } else {
        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
@ -6211,9 +6337,11 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
 // Runs the RoPE op by handing ggml_cuda_op_rope to ggml_cuda_op.
 // Requires F32 src0 and dst, and a contiguous src0. The mode is read from dst->op_params[2];
 // its bit 4 selects the GLM variant, for which flattening is disabled.
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
    const int32_t * params = (int32_t *) dst->op_params;
    const int  mode          = params[2];
    const bool is_glm        = (mode & 4) != 0;
    const bool allow_flatten = !is_glm; // flatten support not implemented for glm
    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, allow_flatten);
 }
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@ -2,6 +2,14 @@
 #include "ggml.h"
 #ifdef GGML_USE_HIPBLAS
 #define GGML_CUDA_NAME "ROCm"
 #define GGML_CUBLAS_NAME "hipBLAS"
 #else
 #define GGML_CUDA_NAME "CUDA"
 #define GGML_CUBLAS_NAME "cuBLAS"
 #endif
 #ifdef  __cplusplus
 extern "C" {
 #endif
--- a/ggml.c
+++ b/ggml.c
@ -19397,7 +19397,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
 ////////////////////////////////////////////////////////////////////////////////
 struct gguf_str {
-    uint32_t n;
+    uint64_t n;  // GGUFv2
    char * data;
 };
@ -19411,9 +19411,12 @@ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
    [GGUF_TYPE_FLOAT32] = sizeof(float),
    [GGUF_TYPE_BOOL]    = sizeof(bool),
    [GGUF_TYPE_STRING]  = sizeof(struct gguf_str),
    [GGUF_TYPE_UINT64]  = sizeof(uint64_t),
    [GGUF_TYPE_INT64]   = sizeof(int64_t),
    [GGUF_TYPE_FLOAT64] = sizeof(double),
    [GGUF_TYPE_ARRAY]   = 0, // undefined
 };
-static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
+static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
 static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
    [GGUF_TYPE_UINT8]   = "u8",
@ -19426,8 +19429,11 @@ static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
    [GGUF_TYPE_BOOL]    = "bool",
    [GGUF_TYPE_STRING]  = "str",
    [GGUF_TYPE_ARRAY]   = "arr",
    [GGUF_TYPE_UINT64]  = "u64",
    [GGUF_TYPE_INT64]   = "i64",
    [GGUF_TYPE_FLOAT64] = "f64",
 };
-static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
+static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
 union gguf_value {
    uint8_t  uint8;
@ -19437,6 +19443,9 @@ union gguf_value {
    uint32_t uint32;
    int32_t  int32;
    float    float32;
    uint64_t uint64;
    int64_t  int64;
    double   float64;
    bool     bool_;
    struct gguf_str str;
@ -19444,7 +19453,7 @@ union gguf_value {
    struct {
        enum gguf_type type;
-        uint32_t n;
+        uint64_t n;  // GGUFv2
        void * data;
    } arr;
 };
@ -19452,8 +19461,6 @@ union gguf_value {
 struct gguf_kv {
    struct gguf_str key;
    uint32_t n_bytes; // TODO: is this actually needed?
    enum  gguf_type  type;
    union gguf_value value;
 };
@ -19461,15 +19468,15 @@ struct gguf_kv {
 struct gguf_header {
    uint32_t magic;
    uint32_t version;
-    uint32_t n_tensors;
+    uint64_t n_tensors; // GGUFv2
-    uint32_t n_kv;
+    uint64_t n_kv;      // GGUFv2
 };
 struct gguf_tensor_info {
    struct gguf_str name;
    uint32_t n_dims;
-    uint32_t ne[GGML_MAX_DIMS];
+    uint64_t ne[GGML_MAX_DIMS];
    enum ggml_type type;
@ -19500,19 +19507,32 @@ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset)
    return n == size;
 }
-static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
+// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
 // Reads one length-prefixed string in the current (64-bit length) layout.
 //
 // The length field (sizeof(p->n) bytes) is read first, then that many bytes of string
 // data are read into a freshly calloc'ed buffer with one extra zeroed byte, so the
 // result is always NUL-terminated. `offset` is forwarded to gguf_fread_el.
 //
 // Returns false if either read fails, if the length cannot be represented as an
 // allocation size, or if the allocation fails. When failure happens before the data
 // read, p->n is 0 and p->data is NULL. If only the data read fails, the buffer stays
 // in p->data and is still owned by the caller.
 static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) {
    p->n    = 0;
    p->data = NULL;

    if (!gguf_fread_el(file, &p->n, sizeof(p->n), offset)) {
        p->n = 0;
        return false;
    }

    // the length comes straight from the file: make sure that len + 1 neither truncates
    // (size_t narrower than 64 bits) nor wraps to 0 before it is used as an allocation size
    const size_t len = (size_t) p->n;
    if ((uint64_t) len != p->n || len + 1 == 0) {
        p->n = 0;
        return false;
    }

    // TODO: how to avoid mallocs for strings?
    p->data = calloc(len + 1, 1);
    if (p->data == NULL) {
        p->n = 0;
        return false;
    }

    return gguf_fread_el(file, p->data, len, offset);
 }
 // Reads one length-prefixed string in the GGUFv1 layout (32-bit length field).
 //
 // The 32-bit length is read first, then that many bytes of string data are read into a
 // freshly calloc'ed buffer with one extra zeroed byte, so the result is always
 // NUL-terminated. `offset` is forwarded to gguf_fread_el.
 //
 // Returns false if either read fails, if length + 1 is not a valid allocation size, or
 // if the allocation fails. When failure happens before the data read, p->n is 0 and
 // p->data is NULL. If only the data read fails, the buffer stays in p->data and is
 // still owned by the caller.
 static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) {
    p->n    = 0;
    p->data = NULL;

    uint32_t n = 0;
    if (!gguf_fread_el(file, &n, sizeof(n), offset)) {
        return false;
    }

    // compute the allocation size in size_t: `n + 1` evaluated in uint32_t wraps to 0 for
    // n == UINT32_MAX, which would leave a zero-sized buffer for a 4 GiB read
    const size_t len = (size_t) n;
    if (len + 1 == 0) {
        return false;
    }

    p->data = calloc(len + 1, 1);
    if (p->data == NULL) {
        return false;
    }

    p->n = n;
    return gguf_fread_el(file, p->data, len, offset);
 }
 struct gguf_context * gguf_init_empty(void) {
    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
@ -19568,8 +19588,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
        ctx->data  = NULL;
        ok = ok && gguf_fread_el(file, &ctx->header.version,   sizeof(ctx->header.version),   &offset);
-        ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
+
-        ok = ok && gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);
+        if (ctx->header.version == 1) {
            // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
            uint32_t n_tensors = 0;
            uint32_t n_kv      = 0;
            ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset);
            ok = ok && gguf_fread_el(file, &n_kv,      sizeof(n_kv),      &offset);
            ctx->header.n_tensors = n_tensors;
            ctx->header.n_kv      = n_kv;
        } else {
            ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
            ok = ok && gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);
        }
        if (!ok) {
            fprintf(stderr, "%s: failed to read header\n", __func__);
@ -19579,6 +19612,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
        }
    }
    // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
    bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur;
    if (ctx->header.version == 1) {
        gguf_fread_str = gguf_fread_str_v1;
    }
    // read the kv pairs
    {
        ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
@ -19588,9 +19627,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
            //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
-            ok = ok && gguf_fread_str(file, &kv->key,                          &offset);
+            ok = ok && gguf_fread_str(file, &kv->key,                    &offset);
-          //ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset);
+            ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
            ok = ok && gguf_fread_el (file, &kv->type,    sizeof(kv->type),    &offset);
            //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
@ -19602,12 +19640,23 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                case GGUF_TYPE_UINT32:  ok = ok && gguf_fread_el (file, &kv->value.uint32,  sizeof(kv->value.uint32),  &offset); break;
                case GGUF_TYPE_INT32:   ok = ok && gguf_fread_el (file, &kv->value.int32,   sizeof(kv->value.int32),   &offset); break;
                case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
                case GGUF_TYPE_UINT64:  ok = ok && gguf_fread_el (file, &kv->value.uint64,  sizeof(kv->value.uint64),  &offset); break;
                case GGUF_TYPE_INT64:   ok = ok && gguf_fread_el (file, &kv->value.int64,   sizeof(kv->value.int64),   &offset); break;
                case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
                case GGUF_TYPE_BOOL:    ok = ok && gguf_fread_el (file, &kv->value.bool_,   sizeof(kv->value.bool_),   &offset); break;
                case GGUF_TYPE_STRING:  ok = ok && gguf_fread_str(file, &kv->value.str,                                &offset); break;
                case GGUF_TYPE_ARRAY:
                    {
                        ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
-                        ok = ok && gguf_fread_el(file, &kv->value.arr.n,    sizeof(kv->value.arr.n),    &offset);
+
                        if (ctx->header.version == 1) {
                            // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
                            uint32_t n = 0;
                            ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
                            kv->value.arr.n = n;
                        } else {
                            ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
                        }
                        switch (kv->value.arr.type) {
                            case GGUF_TYPE_UINT8:
@ -19617,6 +19666,9 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                            case GGUF_TYPE_UINT32:
                            case GGUF_TYPE_INT32:
                            case GGUF_TYPE_FLOAT32:
                            case GGUF_TYPE_UINT64:
                            case GGUF_TYPE_INT64:
                            case GGUF_TYPE_FLOAT64:
                            case GGUF_TYPE_BOOL:
                                {
                                    kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
@ -19663,7 +19715,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
            ok = ok && gguf_fread_str(file, &info->name,                          &offset);
            ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims),  &offset);
            for (uint32_t j = 0; j < info->n_dims; ++j) {
-                ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
+                if (ctx->header.version == 1) {
                    // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
                    uint32_t t = 0;
                    ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset);
                    info->ne[j] = t;
                } else {
                    ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
                }
            }
            ok = ok && gguf_fread_el (file, &info->type,   sizeof(info->type),    &offset);
            ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset),  &offset);
@ -19957,6 +20016,18 @@ float gguf_get_val_f32(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.float32;
 }
 uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.uint64;
 }
 int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.int64;
 }
 double gguf_get_val_f64(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.float64;
 }
 bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.bool_;
 }
@ -20003,7 +20074,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
    const int n_kv = gguf_get_n_kv(ctx);
    ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
-    ctx->kv[n_kv].key.n    = strlen(key) + 1;
+    ctx->kv[n_kv].key.n    = strlen(key);
    ctx->kv[n_kv].key.data = strdup(key);
    ctx->header.n_kv++;
@ -20059,6 +20130,27 @@ void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
    ctx->kv[idx].value.float32 = val;
 }
 void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
    const int idx = gguf_get_or_add_key(ctx, key);
    ctx->kv[idx].type         = GGUF_TYPE_UINT64;
    ctx->kv[idx].value.uint64 = val;
 }
 void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
    const int idx = gguf_get_or_add_key(ctx, key);
    ctx->kv[idx].type        = GGUF_TYPE_INT64;
    ctx->kv[idx].value.int64 = val;
 }
 void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
    const int idx = gguf_get_or_add_key(ctx, key);
    ctx->kv[idx].type          = GGUF_TYPE_FLOAT64;
    ctx->kv[idx].value.float64 = val;
 }
 void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
    const int idx = gguf_get_or_add_key(ctx, key);
@ -20070,7 +20162,7 @@ void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char *
    const int idx = gguf_get_or_add_key(ctx, key);
    ctx->kv[idx].type           = GGUF_TYPE_STRING;
-    ctx->kv[idx].value.str.n    = strlen(val) + 1;
+    ctx->kv[idx].value.str.n    = strlen(val);
    ctx->kv[idx].value.str.data = strdup(val);
 }
@ -20093,7 +20185,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
    ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
    for (int i = 0; i < n; i++) {
        struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
-        str->n    = strlen(data[i]) + 1;
+        str->n    = strlen(data[i]);
        str->data = strdup(data[i]);
    }
 }
@ -20109,6 +20201,9 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
            case GGUF_TYPE_UINT32:  gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32);   break;
            case GGUF_TYPE_INT32:   gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32);    break;
            case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32);  break;
            case GGUF_TYPE_UINT64:  gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64);   break;
            case GGUF_TYPE_INT64:   gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64);    break;
            case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64);  break;
            case GGUF_TYPE_BOOL:    gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_);    break;
            case GGUF_TYPE_STRING:  gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
            case GGUF_TYPE_ARRAY:
@ -20137,7 +20232,7 @@ void gguf_add_tensor(
    const int idx = ctx->header.n_tensors;
    ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
-    ctx->infos[idx].name.n    = strlen(tensor->name) + 1;
+    ctx->infos[idx].name.n    = strlen(tensor->name);
    ctx->infos[idx].name.data = strdup(tensor->name);
    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
@ -20270,6 +20365,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
            case GGUF_TYPE_UINT32:  gguf_bwrite_el (buf, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
            case GGUF_TYPE_INT32:   gguf_bwrite_el (buf, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
            case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
            case GGUF_TYPE_UINT64:  gguf_bwrite_el (buf, &kv->value.uint64,  sizeof(kv->value.uint64) ); break;
            case GGUF_TYPE_INT64:   gguf_bwrite_el (buf, &kv->value.int64,   sizeof(kv->value.int64)  ); break;
            case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
            case GGUF_TYPE_BOOL:    gguf_bwrite_el (buf, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
            case GGUF_TYPE_STRING:  gguf_bwrite_str(buf, &kv->value.str                               ); break;
            case GGUF_TYPE_ARRAY:
@ -20285,6 +20383,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
                        case GGUF_TYPE_UINT32:
                        case GGUF_TYPE_INT32:
                        case GGUF_TYPE_FLOAT32:
                        case GGUF_TYPE_UINT64:
                        case GGUF_TYPE_INT64:
                        case GGUF_TYPE_FLOAT64:
                        case GGUF_TYPE_BOOL:
                            {
                                gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
@ -20519,6 +20620,14 @@ int ggml_cpu_has_sse3(void) {
 #endif
 }
 // Reports whether this translation unit was compiled with SSSE3 enabled:
 // returns 1 when the compiler defines __SSSE3__, 0 otherwise.
 int ggml_cpu_has_ssse3(void) {
 #ifdef __SSSE3__
    const int has_ssse3 = 1;
 #else
    const int has_ssse3 = 0;
 #endif
    return has_ssse3;
 }
 int ggml_cpu_has_vsx(void) {
 #if defined(__POWER9_VECTOR__)
    return 1;
--- a/ggml.h
+++ b/ggml.h
@ -216,7 +216,7 @@
 #define GGML_EXIT_ABORTED 1
 #define GGUF_MAGIC   0x46554747 // "GGUF"
-#define GGUF_VERSION 1
+#define GGUF_VERSION 2
 #define GGUF_DEFAULT_ALIGNMENT 32
@ -1832,6 +1832,9 @@ extern "C" {
        GGUF_TYPE_BOOL    = 7,
        GGUF_TYPE_STRING  = 8,
        GGUF_TYPE_ARRAY   = 9,
        GGUF_TYPE_UINT64  = 10,
        GGUF_TYPE_INT64   = 11,
        GGUF_TYPE_FLOAT64 = 12,
        GGUF_TYPE_COUNT,       // marks the end of the enum
    };
@ -1872,6 +1875,9 @@ extern "C" {
    GGML_API uint32_t     gguf_get_val_u32 (struct gguf_context * ctx, int i);
    GGML_API int32_t      gguf_get_val_i32 (struct gguf_context * ctx, int i);
    GGML_API float        gguf_get_val_f32 (struct gguf_context * ctx, int i);
    GGML_API uint64_t     gguf_get_val_u64 (struct gguf_context * ctx, int i);
    GGML_API int64_t      gguf_get_val_i64 (struct gguf_context * ctx, int i);
    GGML_API double       gguf_get_val_f64 (struct gguf_context * ctx, int i);
    GGML_API bool         gguf_get_val_bool(struct gguf_context * ctx, int i);
    GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
    GGML_API int          gguf_get_arr_n   (struct gguf_context * ctx, int i);
@ -1891,6 +1897,9 @@ extern "C" {
    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
    GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
    GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t  val);
    GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double   val);
    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
@ -1949,6 +1958,7 @@ extern "C" {
    GGML_API int ggml_cpu_has_clblast    (void);
    GGML_API int ggml_cpu_has_gpublas    (void);
    GGML_API int ggml_cpu_has_sse3       (void);
    GGML_API int ggml_cpu_has_ssse3      (void);
    GGML_API int ggml_cpu_has_vsx        (void);
    //
--- a/gguf-py/LICENSE
+++ b/gguf-py/LICENSE
@ -0,0 +1,21 @@
 MIT License
 Copyright (c) 2023 Georgi Gerganov
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
--- a/gguf-py/README.md
+++ b/gguf-py/README.md
@ -0,0 +1,55 @@
 ## gguf
 This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302)
 (GGML Universal File) format.
 See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-hf-to-gguf.py)
 as an example for its usage.
 ## Installation
 ```sh
 pip install gguf
 ```
 ## Development
 Maintainers who participate in development of this package are advised to install it in editable mode:
 ```sh
 cd /path/to/llama.cpp/gguf-py
 pip install --editable .
 ```
 **Note**: This may require you to upgrade your Pip installation; Pip signals this with a message saying that editable installation currently requires `setup.py`.
 In this case, upgrade Pip to the latest version:
 ```sh
 pip install --upgrade pip
 ```
 ## Publishing
 To publish the package, you need to have `twine` and `build` installed:
 ```sh
 pip install build twine
 ```
 Then, follow these steps to release a new version:
 1. Update the version in `pyproject.toml`.
 2. Build the package:
 ```sh
 python -m build
 ```
 3. Upload the generated distribution archives:
 ```sh
 python -m twine upload dist/*
 ```
 ## TODO
 - [ ] Add tests
 - [ ] Include conversion scripts as command line entry points in this package.
 - [ ] Add CI workflow for releasing the package.
--- a/gguf-py/gguf/__init__.py
+++ b/gguf-py/gguf/__init__.py
@ -0,0 +1 @@
 from .gguf import *
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@ -13,7 +13,7 @@ from typing import Any, IO, List, Optional
 #
 GGUF_MAGIC             = 0x46554747
-GGUF_VERSION           = 1
+GGUF_VERSION           = 2
 GGUF_DEFAULT_ALIGNMENT = 32
 # general
@ -47,6 +47,7 @@ KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
 # RoPE
 KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
 KEY_ROPE_FREQ_BASE       = "{arch}.rope.freq_base"
 KEY_ROPE_SCALE_LINEAR    = "{arch}.rope.scale_linear"
 # tokenization
@ -364,6 +365,9 @@ class GGUFValueType(IntEnum):
    BOOL    = 7
    STRING  = 8
    ARRAY   = 9
    UINT64  = 10
    INT64   = 11
    FLOAT64 = 12
    @staticmethod
    def get_type(val):
@ -377,6 +381,7 @@ class GGUFValueType(IntEnum):
            return GGUFValueType.BOOL
        elif isinstance(val, int):
            return GGUFValueType.INT32
        # TODO: need help with 64-bit types in Python
        else:
            print("Unknown type: "+str(type(val)))
            sys.exit()
@ -399,8 +404,8 @@ class GGUFWriter:
    def write_header_to_file(self):
        self.fout.write(struct.pack("<I", GGUF_MAGIC))
        self.fout.write(struct.pack("<I", GGUF_VERSION))
-        self.fout.write(struct.pack("<I", self.ti_data_count))
+        self.fout.write(struct.pack("<Q", self.ti_data_count))
-        self.fout.write(struct.pack("<I", self.kv_data_count))
+        self.fout.write(struct.pack("<Q", self.kv_data_count))
        self.flush()
 #        print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
@ -443,6 +448,18 @@ class GGUFWriter:
        self.add_key(key)
        self.add_val(val, GGUFValueType.FLOAT32)
    def add_uint64(self, key: str, val: int):
        """Add `key`, then `val` tagged as GGUFValueType.UINT64 (add_val packs this type with struct format "<Q")."""
        self.add_key(key)
        self.add_val(val, GGUFValueType.UINT64)
    def add_int64(self, key: str, val: int):
        """Add `key`, then `val` tagged as GGUFValueType.INT64 (add_val packs this type with struct format "<q")."""
        self.add_key(key)
        self.add_val(val, GGUFValueType.INT64)
    def add_float64(self, key: str, val: float):
        """Add `key`, then `val` tagged as GGUFValueType.FLOAT64 (add_val packs this type with struct format "<d")."""
        self.add_key(key)
        self.add_val(val, GGUFValueType.FLOAT64)
    def add_bool(self, key: str, val: bool):
        """Add `key`, then `val` tagged as GGUFValueType.BOOL (add_val packs this type with struct format "?")."""
        self.add_key(key)
        self.add_val(val, GGUFValueType.BOOL)
@ -482,17 +499,23 @@ class GGUFWriter:
            self.kv_data += struct.pack("<i", val)
        elif vtype == GGUFValueType.FLOAT32:
            self.kv_data += struct.pack("<f", val)
        elif vtype == GGUFValueType.UINT64:
            self.kv_data += struct.pack("<Q", val)
        elif vtype == GGUFValueType.INT64:
            self.kv_data += struct.pack("<q", val)
        elif vtype == GGUFValueType.FLOAT64:
            self.kv_data += struct.pack("<d", val)
        elif vtype == GGUFValueType.BOOL:
            self.kv_data += struct.pack("?", val)
        elif vtype == GGUFValueType.STRING:
            encoded_val = val.encode("utf8") if isinstance(val, str) else val
-            self.kv_data += struct.pack("<I", len(encoded_val))
+            self.kv_data += struct.pack("<Q", len(encoded_val))
            self.kv_data += encoded_val
        elif vtype == GGUFValueType.ARRAY:
            ltype = set([GGUFValueType.get_type(item) for item in val])
            assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
            self.kv_data += struct.pack("<I", list(ltype)[0])
-            self.kv_data += struct.pack("<I", len(val))
+            self.kv_data += struct.pack("<Q", len(val))
            for item in val:
                self.add_val(item, add_vtype=False)
        else:
@ -506,12 +529,12 @@ class GGUFWriter:
        assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
        encoded_name = name.encode("utf8")
-        self.ti_data += struct.pack("<I", len(encoded_name))
+        self.ti_data += struct.pack("<Q", len(encoded_name))
        self.ti_data += encoded_name
        n_dims = len(tensor_shape)
        self.ti_data += struct.pack("<I", n_dims)
        for i in range(n_dims):
-            self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
+            self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
        if raw_dtype is None:
            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
        else:
@ -663,7 +686,10 @@ class GGUFWriter:
        self.add_uint32(
            KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)
-    def add_rope_scale_linear(self, value:  float):
+    def add_rope_freq_base(self, value: float):
        self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value)
    def add_rope_scale_linear(self, value: float):
        """Store `value` via add_float32 under KEY_ROPE_SCALE_LINEAR, with `{arch}` filled in from self.arch."""
        self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)
    def add_tokenizer_model(self, model: str):
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@ -0,0 +1,28 @@
 [tool.poetry]
 name = "gguf"
 version = "0.2.1"
 description = "Write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
    {include = "gguf"},
 ]
 readme = "README.md"
 homepage = "https://ggml.ai"
 repository = "https://github.com/ggerganov/llama.cpp"
 keywords = ["ggml", "gguf", "llama.cpp"]
 classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
 ]
 [tool.poetry.dependencies]
 python = ">=3.8"
 numpy = ">=1.17"
 [tool.poetry.dev-dependencies]
 pytest = "^5.2"
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
--- a/gguf-py/tests/test_gguf.py
+++ b/gguf-py/tests/test_gguf.py
@ -0,0 +1,7 @@
 import gguf
 # TODO: add tests
 def test_write_gguf():
    pass
--- a/llama.cpp
+++ b/llama.cpp
@ -1,9 +1,6 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #endif
 #include "llama.h"
@ -62,6 +59,9 @@
 #include <cinttypes>
 #include <climits>
 #include <cstdarg>
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
 #include <fstream>
@ -114,12 +114,17 @@ static size_t utf8_len(char src) {
 }
 // Replaces every non-overlapping occurrence of `search` in `s` with `replace`,
 // scanning left to right. The result is built in a separate buffer, so inserted
 // replacement text is never rescanned (`replace` may safely contain `search`).
 //
 // An empty `search` leaves `s` unchanged: std::string::find matches the empty
 // string at every position, so without this guard the scan would never advance
 // and the loop would append `replace` forever.
 void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
    }
    std::string result;
    size_t pos = 0;
    for (;;) {
        const size_t hit = s.find(search, pos);
        if (hit == std::string::npos) {
            break;
        }
        // copy the untouched text before the match, then the replacement
        result.append(s, pos, hit - pos);
        result += replace;
        // resume scanning right after the matched text
        pos = hit + search.length();
    }
    // copy whatever follows the last match (pos <= s.size() always holds here)
    result.append(s, pos, std::string::npos);
    s = std::move(result);
 }
 static void zeros(std::ofstream & file, size_t n) {
@ -195,6 +200,7 @@ enum llm_kv {
    LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
    LLM_KV_ROPE_DIMENSION_COUNT,
    LLM_KV_ROPE_FREQ_BASE,
    LLM_KV_ROPE_SCALE_LINEAR,
    LLM_KV_TOKENIZER_MODEL,
@ -238,6 +244,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,   "%s.attention.layer_norm_rms_epsilon" },
    { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count" },
    { LLM_KV_ROPE_FREQ_BASE,                "%s.rope.freq_base"       },
    { LLM_KV_ROPE_SCALE_LINEAR,             "%s.rope.scale_linear"    },
    { LLM_KV_TOKENIZER_MODEL,               "tokenizer.ggml.model"              },
@ -794,12 +801,12 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
    (void) tensor;
 }
-static std::string llama_token_to_text(const struct llama_context * ctx, llama_token token) {
+static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
    if (n_tokens < 0) {
        result.resize(-n_tokens);
-        int check = llama_token_to_str(ctx, token, result.data(), result.size());
+        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
@ -827,6 +834,7 @@ enum e_model {
    MODEL_7B,
    MODEL_13B,
    MODEL_30B,
    MODEL_34B,
    MODEL_40B,
    MODEL_65B,
    MODEL_70B,
@ -952,10 +960,10 @@ struct llama_vocab {
    id linefeed_id = 13;
    int find_bpe_rank(std::string token_left, std::string token_right) const {
-        replace_all(token_left,  " ",  "Ġ");
+        replace_all(token_left,  " ",  "\u0120");
-        replace_all(token_left,  "\n", "Ċ");
+        replace_all(token_left,  "\n", "\u010A");
-        replace_all(token_right, " ",  "Ġ");
+        replace_all(token_right, " ",  "\u0120");
-        replace_all(token_right, "\n", "Ċ");
+        replace_all(token_right, "\n", "\u010A");
        auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
        if (it == bpe_ranks.end()) {
@ -1141,11 +1149,13 @@ static bool llama_kv_cache_init(
 // GGUF file format versions; each enumerator's value equals its version number.
 enum llama_fver {
    GGUF_FILE_VERSION_V1 = 1,
    GGUF_FILE_VERSION_V2 = 2,
 };
 static const char * llama_file_version_name(llama_fver version) {
    switch (version) {
-        case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
+        case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
        case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)";
    }
    return "unknown";
@ -1518,6 +1528,7 @@ static const char * llama_model_type_name(e_model type) {
        case MODEL_7B:  return "7B";
        case MODEL_13B: return "13B";
        case MODEL_30B: return "30B";
        case MODEL_34B: return "34B";
        case MODEL_40B: return "40B";
        case MODEL_65B: return "65B";
        case MODEL_70B: return "70B";
@ -1559,12 +1570,26 @@ static void llm_load_hparams(
    hparams.n_head_kv = hparams.n_head;
    GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
-    // TODO: manually setting rope scale should override this
+    // TODO: manually setting rope freq base and scale should override this
    // FIXME: partial fix when the param specified is not the default value, but
    //        will not work for overriding the model value to the params default
    llama_context_params defaults = llama_context_default_params();
    // rope_freq_base
    {
        float ropebase = 10000.0f;
        GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
        if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
            rope_freq_base = ropebase;
        }
    }
    // rope_freq_scale (inverse of the kv) is optional
    {
        float ropescale = 1.0f;
        GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-        if (ropescale != 1.0f) {
+        if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
            rope_freq_scale = 1.0f/ropescale;
        }
    }
@ -1590,6 +1615,7 @@ static void llm_load_hparams(
                    case 26: model.type = e_model::MODEL_3B; break;
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_13B; break;
                    case 48: model.type = e_model::MODEL_34B; break;
                    case 60: model.type = e_model::MODEL_30B; break;
                    case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
@ -1616,7 +1642,8 @@ static void llm_load_hparams(
 }
 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape);
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
 static void llm_load_vocab(
        llama_model_loader & ml,
@ -1718,7 +1745,11 @@ static void llm_load_vocab(
    }
    // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
-    vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false, false)[0];
+    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
        vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
    } else {
        vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
    }
    // special tokens
    GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
@ -1817,7 +1848,7 @@ static void llm_load_tensors(
    (void) main_gpu;
    (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
-    LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
+    LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
    ggml_cuda_set_main_device(main_gpu);
    ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
@ -1939,6 +1970,14 @@ static void llm_load_tensors(
                        model.output_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
                        model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd},          backend_norm);
                        model.output        = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);
                        if (backend_norm == GGML_BACKEND_GPU) {
                            vram_weights += ggml_nbytes(model.output_norm);
                            vram_weights += ggml_nbytes(model.output_norm_b);
                        }
                        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
                            vram_weights += ggml_nbytes(model.output);
                        }
                    }
                    const uint32_t n_ff = hparams.n_ff;
@ -1948,7 +1987,7 @@ static void llm_load_tensors(
                    model.layers.resize(n_layer);
                    for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                        const ggml_backend backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
                        const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
                        auto & layer = model.layers[i];
@ -1959,6 +1998,11 @@ static void llm_load_tensors(
                        if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
                            layer.attn_norm_2   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
                            layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, backend);
                            if (backend == GGML_BACKEND_GPU) {
                                vram_weights += ggml_nbytes(layer.attn_norm_2);
                                vram_weights += ggml_nbytes(layer.attn_norm_2_b);
                            }
                        }
                        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
@ -1966,6 +2010,13 @@ static void llm_load_tensors(
                        layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
                        layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
                        if (backend == GGML_BACKEND_GPU) {
                            vram_weights +=
                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
                                ggml_nbytes(layer.wqkv)      + ggml_nbytes(layer.wo)          +
                                ggml_nbytes(layer.w2)        + ggml_nbytes(layer.w3);
                        }
                    }
                } break;
            default:
@ -2596,18 +2647,20 @@ static struct ggml_cgraph * llm_build_falcon(
            const size_t wsize = ggml_type_size(cur->type);
-            struct ggml_tensor * tmpq = ggml_view_3d(
+            // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
            //       non-contiguous views is added for the rope operator
            struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
                ctx0, cur, n_embd_head, n_head, N,
                wsize * n_embd_head,
                wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                0);
+                0));
            offload_func_kq(tmpq);
-            struct ggml_tensor * tmpk = ggml_view_3d(
+            struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
                ctx0, cur, n_embd_head, n_head_kv, N,
                wsize * n_embd_head,
                wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                wsize * n_embd_head *  n_head);
+                wsize * n_embd_head *  n_head));
            offload_func_kq(tmpk);
            struct ggml_tensor * tmpv = ggml_view_3d(
@ -2704,11 +2757,6 @@ static struct ggml_cgraph * llm_build_falcon(
            struct ggml_tensor * inpFF = attn_norm;
            cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
            // TODO: this is temporary needed to introduce artificial dependency between FF and ATTN
            //       adding this, because there seems to be a bug in the Metal concurrency optimization
            //       without this line, the results are non-deterministic and wrong
            cur->src[2] = attn_out;
            offload_func(cur);
            cur = ggml_gelu(ctx0, cur);
@ -2797,7 +2845,6 @@ static bool llama_eval_internal(
    GGML_ASSERT(n_tokens > 0);
    GGML_ASSERT(n_past >= 0);
    GGML_ASSERT(n_threads > 0);
    // TODO: keep the values of n_batch and n_ctx
    // GGML_ASSERT(n_tokens <= n_batch);
    // GGML_ASSERT(n_past + n_tokens <= n_ctx);
@ -2808,6 +2855,8 @@ static bool llama_eval_internal(
    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
 #endif
    GGML_ASSERT(n_threads > 0);
    const int N = n_tokens;
    const auto & model   = lctx.model;
@ -2992,16 +3041,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
    return vocab.token_to_id.at(buf);
 }
-static std::string llama_escape_whitespace(const std::string& text) {
+static void llama_escape_whitespace(std::string & text) {
-    std::string result = "\xe2\x96\x81";
+    replace_all(text, " ", "\xe2\x96\x81");
    for (size_t offs = 0; offs < text.length(); ++offs) {
        if (text[offs] == ' ') {
            result += "\xe2\x96\x81";
        } else {
            result += text[offs];
        }
    }
    return result;
 }
 static void llama_unescape_whitespace(std::string & word) {
@ -3185,7 +3226,7 @@ struct llm_bigram_bpe {
 };
 struct llm_tokenizer_bpe {
-    llm_tokenizer_bpe(const llama_vocab & vocab, bool g2ws): vocab(vocab) { flag_g2ws = g2ws; }
+    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}
    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
        int final_prev_index = -1;
@ -3337,8 +3378,6 @@ private:
        return words;
    }
    bool flag_g2ws = false;
    const llama_vocab & vocab;
    std::vector<llm_symbol> symbols;
@ -3347,9 +3386,18 @@ private:
    llm_bigram_bpe::queue work_queue;
 };
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) {
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
    std::vector<llama_vocab::id> output;
    // OG tokenizer behavior:
    //
    // tokenizer.encode('', add_bos=True)  returns [1]
    // tokenizer.encode('', add_bos=False) returns []
    if (bos && vocab.special_bos_id != -1) {
        output.push_back(vocab.special_bos_id);
    }
    if (raw_text.empty()) {
        return output;
    }
@ -3357,29 +3405,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
    switch (vocab.type) {
        case LLAMA_VOCAB_TYPE_SPM:
            {
                // without adding this leading whitespace, we do not get the same results as the original tokenizer
                raw_text = " " + raw_text;
                llm_tokenizer_spm tokenizer(vocab);
-
+                llama_escape_whitespace(raw_text);
-                if (bos) {
+                tokenizer.tokenize(raw_text, output);
                    output.push_back(vocab.special_bos_id);
                }
                std::string text;
                if (escape) {
                    text = llama_escape_whitespace(raw_text);
                } else {
                    text = raw_text;
                }
                tokenizer.tokenize(text, output);
            } break;
        case LLAMA_VOCAB_TYPE_BPE:
            {
-                llm_tokenizer_bpe tokenizer(vocab, escape);
+                llm_tokenizer_bpe tokenizer(vocab);
                if (bos && vocab.special_bos_id != -1) {
                    output.push_back(vocab.special_bos_id);
                }
                tokenizer.tokenize(raw_text, output);
            } break;
    };
@ -3874,7 +3909,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
    // Calculate absolute value of second derivatives
    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = abs(second_derivatives[i]);
+        second_derivatives[i] = std::abs(second_derivatives[i]);
    }
    // Normalize the second derivatives
@ -4065,16 +4100,16 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
    std::vector<llama_grammar_candidate>                              candidates_grammar;
    for (size_t i = 0; i < candidates->size; ++i) {
-        const llama_token id   = candidates->data[i].id;
+        const llama_token id    = candidates->data[i].id;
-        const std::string text = llama_token_to_text(ctx, id);
+        const std::string piece = llama_token_to_str(ctx, id);
        if (id == eos) {
            if (!allow_eos) {
                candidates->data[i].logit = -INFINITY;
            }
-        } else if (text.empty() || text[0] == 0) {
+        } else if (piece.empty() || piece[0] == 0) {
            candidates->data[i].logit = -INFINITY;
        } else {
-            candidates_decoded.push_back(decode_utf8(text.c_str(), grammar->partial_utf8));
+            candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
        }
    }
@ -4278,10 +4313,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
        GGML_ASSERT(false);
    }
-    const std::string text = llama_token_to_text(ctx, token);
+    const std::string piece = llama_token_to_str(ctx, token);
    // Note terminating 0 in decoded string
-    const auto   decoded     = decode_utf8(text.c_str(), grammar->partial_utf8);
+    const auto   decoded     = decode_utf8(piece.c_str(), grammar->partial_utf8);
    const auto & code_points = decoded.first;
    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@ -4292,6 +4327,257 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 //
 // Beam search
 //
 struct llama_beam {
    std::vector<llama_token> tokens;
    float p;  // Cumulative beam probability (renormalized relative to all beams)
    bool eob; // Initialize end-of-beam to false. Callback sets this to true.
    // Sort beams by probability. In case of ties, prefer beams at eob.
    bool operator<(const llama_beam & rhs) const {
        return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
    }
    // Shift off first n tokens and discard them.
    void shift_tokens(const size_t n) {
        if (n) {
            std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
            tokens.resize(tokens.size() - n);
        }
    }
    llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
 };
 // A struct for calculating logit-related info.
 // It captures the logits pointer and vocabulary size of a context once, together with
 // the maximum logit and the normalizer needed to turn a logit into a probability.
 struct llama_logit_info {
    const float * const logits;  // Result of llama_get_logits(ctx); not owned by this struct.
    const int n_vocab;           // Result of llama_n_vocab(ctx); number of logits read.
    const float max_l;           // Largest logit; subtracted from every logit before std::exp().
    const float normalizer;      // 1 / sum over all tokens of exp(logit - max_l)
    // Binary operation for std::accumulate(): adds exp(l - max_l) to the running sum.
    struct sum_exp {
        float max_l;
        float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
    };
    // Members are initialized in declaration order, so logits, n_vocab and max_l
    // are already set when normalizer is computed from them.
    llama_logit_info(llama_context * ctx)
      : logits(llama_get_logits(ctx))
      , n_vocab(llama_n_vocab(ctx))
      , max_l(*std::max_element(logits, logits + n_vocab))
      , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
      { }
    // Build the token_data for token_id. Only id and logit are meaningful; p is set to NaN.
    llama_token_data get_token_data(const llama_token token_id) const {
        constexpr auto p = std::numeric_limits<float>::quiet_NaN();  // never used
        return {token_id, logits[token_id], p};
    }
    // Return top k token_data by logit.
    // The result holds min(k, n_vocab) entries arranged as a min-heap by logit
    // (smallest of the selected logits at front()); it is not sorted.
    // Returns an empty vector when k == 0 or the vocabulary is empty.
    std::vector<llama_token_data> top_k(size_t k) const {
        std::vector<llama_token_data> min_heap;  // min-heap by logit
        // Clamp while still in size_t so that a huge k is never narrowed to llama_token unclamped.
        const size_t vocab_size = n_vocab > 0 ? static_cast<size_t>(n_vocab) : 0;
        const llama_token k_min = static_cast<llama_token>(std::min(k, vocab_size));
        if (k_min == 0) {
            return min_heap;  // Nothing to select; front() on the empty heap below would be invalid.
        }
        min_heap.reserve(k_min);
        // Stage 1: seed the heap with the first k_min tokens.
        for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
            min_heap.push_back(get_token_data(token_id));
        }
        auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
        std::make_heap(min_heap.begin(), min_heap.end(), comp);
        // Stage 2: each remaining token replaces the current minimum if its logit is larger.
        for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
            if (min_heap.front().logit < logits[token_id]) {
                std::pop_heap(min_heap.begin(), min_heap.end(), comp);
                min_heap.back().id = token_id;
                min_heap.back().logit = logits[token_id];
                std::push_heap(min_heap.begin(), min_heap.end(), comp);
            }
        }
        return min_heap;
    }
    // Convert a logit into a probability: exp(logit - max_l) scaled by normalizer.
    float probability_from_logit(float logit) const {
        return normalizer * std::exp(logit - max_l);
    }
 };
 struct llama_beam_search_data {
    llama_context * ctx;
    size_t n_beams;
    int n_past;
    int n_predict;
    int n_threads;
    std::vector<llama_beam> beams;
    std::vector<llama_beam> next_beams;
    // Re-calculated on each loop iteration
    size_t common_prefix_length;
    // Used to communicate to/from callback on beams state.
    std::vector<llama_beam_view> beam_views;
    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
      : ctx(ctx)
      , n_beams(n_beams)
      , n_past(n_past)
      , n_predict(n_predict)
      , n_threads(n_threads)
      , beam_views(n_beams) {
        beams.reserve(n_beams);
        next_beams.reserve(n_beams);
    }
    // Collapse beams to a single beam given by index.
    void collapse_beams(const size_t beam_idx) {
        if (0u < beam_idx) {
            std::swap(beams[0], beams[beam_idx]);
        }
        beams.resize(1);
    }
    // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
    // The repetative patterns below reflect the 2 stages of heaps:
    //  * Gather elements until the vector is full, then call std::make_heap() on it.
    //  * If the heap is full and a new element is found that should be included, pop the
    //    least element to the back(), replace it with the new, then push it into the heap.
    void fill_next_beams_by_top_probabilities(llama_beam & beam) {
        // Min-heaps use a greater-than comparator.
        const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
        if (beam.eob) {
            // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
            if (next_beams.size() < n_beams) {
                next_beams.push_back(std::move(beam));
                if (next_beams.size() == n_beams) {
                    std::make_heap(next_beams.begin(), next_beams.end(), comp);
                }
            } else if (next_beams.front().p < beam.p) {
                std::pop_heap(next_beams.begin(), next_beams.end(), comp);
                next_beams.back() = std::move(beam);
                std::push_heap(next_beams.begin(), next_beams.end(), comp);
            }
        } else {
            // beam is not at end-of-sentence, so branch with next top_k tokens.
            if (!beam.tokens.empty()) {
                llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
            }
            llama_logit_info logit_info(ctx);
            std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
            size_t i=0;
            if (next_beams.size() < n_beams) {
                for (; next_beams.size() < n_beams ; ++i) {
                    llama_beam next_beam = beam;
                    next_beam.tokens.push_back(next_tokens[i].id);
                    next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
                    next_beams.push_back(std::move(next_beam));
                }
                std::make_heap(next_beams.begin(), next_beams.end(), comp);
            } else {
                for (; next_beams.front().p == 0.0f ; ++i) {
                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
                    next_beams.back() = beam;
                    next_beams.back().tokens.push_back(next_tokens[i].id);
                    next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
                }
            }
            for (; i < n_beams ; ++i) {
                const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
                if (next_beams.front().p < next_p) {
                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
                    next_beams.back() = beam;
                    next_beams.back().tokens.push_back(next_tokens[i].id);
                    next_beams.back().p = next_p;
                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
                }
            }
        }
    }
    // Find common_prefix_length based on beams.
    // Requires beams is not empty.
    size_t find_common_prefix_length() {
        size_t common_prefix_length = beams[0].tokens.size();
        for (size_t i = 1 ; i < beams.size() ; ++i) {
            common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
            for (size_t j = 0 ; j < common_prefix_length ; ++j) {
                if (beams[0].tokens[j] != beams[i].tokens[j]) {
                    common_prefix_length = j;
                    break;
                }
            }
        }
        return common_prefix_length;
    }
    // Construct beams_state to send back to caller via the callback function.
    // Side effect: set common_prefix_length = find_common_prefix_length();
    llama_beams_state get_beams_state(const bool last_call) {
        for (size_t i = 0 ; i < beams.size() ; ++i) {
            beam_views[i] = beams[i].view();
        }
        common_prefix_length = find_common_prefix_length();
        return {beam_views.data(), beams.size(), common_prefix_length, last_call};
    }
    // Loop:
    //  * while i < n_predict, AND
    //  * any of the beams have not yet reached end-of-beam (eob), AND
    //  * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
    //    (since all other beam probabilities can only decrease)
    void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
        beams.push_back({{}, 1.0f, false});  // Start with one empty beam w/ probability = 1.0 and !eob.
        const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
        for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
                       !beams[top_beam_index()].eob ; ++i) {
            callback(callback_data, get_beams_state(false));  // Sets common_prefix_length
            update_beams_from_beam_views();   // Update values (p,eob) that callback may have changed.
            if (common_prefix_length) {
                llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
                n_past += common_prefix_length;
            }
            // Zero-out next_beam probabilities to place them last in following min-heap.
            std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
            for (llama_beam & beam : beams) {
                beam.shift_tokens(common_prefix_length);
                fill_next_beams_by_top_probabilities(beam);
            }
            // next_beams become the beams of next/final iteration. Swap them to re-use memory.
            beams.swap(next_beams);
            renormalize_beam_probabilities(beams);
        }
        collapse_beams(top_beam_index());
        callback(callback_data, get_beams_state(true));
    }
    // As beams grow, the cumulative probabilities decrease.
    // Renormalize them to avoid floating point underflow.
    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
        const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
        const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
        std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
    }
    // Assumes beams is non-empty.  Uses llama_beam::operator<() for ordering.
    size_t top_beam_index() {
        return std::max_element(beams.begin(), beams.end()) - beams.begin();
    }
    // Copy (p,eob) for each beam which may have been changed by the callback.
    void update_beams_from_beam_views() {
        for (size_t i = 0 ; i < beams.size() ; ++i) {
            beams[i].p = beam_views[i].p;
            beams[i].eob = beam_views[i].eob;
        }
    }
 };
 // Run a beam search on ctx, reporting progress through callback.
 // The elapsed wall time is added to ctx->t_sample_us and ctx->n_sample is incremented by one.
 void llama_beam_search(llama_context * ctx,
                       llama_beam_search_callback_fn_t callback, void * callback_data,
                       size_t n_beams, int n_past, int n_predict, int n_threads) {
    assert(ctx);
    const int64_t t_begin_us = ggml_time_us();
    llama_beam_search_data search(ctx, n_beams, n_past, n_predict, n_threads);
    search.loop(callback, callback_data);
    const int64_t t_elapsed_us = ggml_time_us() - t_begin_us;
    ctx->t_sample_us += t_elapsed_us;
    ++ctx->n_sample;
 }
 //
 // quantization
 //
@ -4389,6 +4675,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
    llama_model model;
    llm_load_arch(*ml, model);
    llm_load_hparams(*ml, model, 0, 0, 0);
    const size_t align = GGUF_DEFAULT_ALIGNMENT;
    struct gguf_context * ctx_out = gguf_init_empty();
@ -4414,6 +4704,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            ++n_feed_forward_w2;
        }
    }
    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
    }
    int i_attention_wv = 0;
    int i_feed_forward_w2 = 0;
@ -4490,8 +4784,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
                int nx = tensor->ne[0];
-                int ny = tensor->ne[1];
+                if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-                if (nx % QK_K == 0 && ny % QK_K == 0) {
+                    new_type = GGML_TYPE_Q8_0;
                }
                else if (new_type != GGML_TYPE_Q8_0) {
                    new_type = GGML_TYPE_Q6_K;
                }
            } else if (name.find("attn_v.weight") != std::string::npos) {
@ -4505,21 +4801,49 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
                else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
                if (model.type == MODEL_70B) {
                    // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
                    // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
                    // nearly negligible increase in model size by quantizing this tensor with more bits:
                    if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
                }
                ++i_attention_wv;
            } else if (name.find("ffn_down.weight") != std::string::npos) {
                if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
                             : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
                             : GGML_TYPE_Q3_K;
                }
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
                    new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
                }
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
                    if (model.arch == LLM_ARCH_FALCON) {
                        new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
                                   use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
                    } else {
                        if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
                    }
                }
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
                    new_type = GGML_TYPE_Q5_K;
                }
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                         use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K;
                ++i_feed_forward_w2;
            } else if (name.find("attn_output.weight") != std::string::npos) {
-                if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
+                if (model.arch != LLM_ARCH_FALCON) {
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+                    if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                } else {
                    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
                }
            }
            else if (name.find("attn_qkv.weight") != std::string::npos) {
                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
            }
            else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@ -4534,8 +4858,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
                int nx = tensor->ne[0];
                int ny = tensor->ne[1];
-                if (nx % QK_K != 0 || ny % QK_K != 0) {
+                if (nx % QK_K != 0) {
-                    LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+                    LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
                    convert_incompatible_tensor = true;
                }
            }
@ -5263,13 +5587,29 @@ int llama_model_n_embd(const struct llama_model * model) {
    return model->hparams.n_embd;
 }
-int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
+int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
    return snprintf(buf, buf_size, "%s %s %s",
            model->name.c_str(),
            llama_model_type_name(model->type),
            llama_model_ftype_name(model->ftype).c_str());
 }
 // Sum of ggml_nbytes() over every tensor listed in model->tensors_by_name.
 uint64_t llama_model_size(const struct llama_model * model) {
    const auto & tensors = model->tensors_by_name;
    uint64_t total_bytes = 0;
    for (auto entry = tensors.begin(); entry != tensors.end(); ++entry) {
        total_bytes += ggml_nbytes(entry->second);
    }
    return total_bytes;
 }
 // Sum of ggml_nelements() over every tensor listed in model->tensors_by_name.
 uint64_t llama_model_n_params(const struct llama_model * model) {
    const auto & tensors = model->tensors_by_name;
    uint64_t total_elements = 0;
    for (auto entry = tensors.begin(); entry != tensors.end(); ++entry) {
        total_elements += ggml_nelements(entry->second);
    }
    return total_elements;
 }
 int llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
@ -5794,8 +6134,7 @@ int llama_tokenize_with_model(
                 llama_token * tokens,
                         int   n_max_tokens,
                        bool   add_bos) {
-    auto escape = llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM;
+    auto res = llama_tokenize_internal(model->vocab, text, add_bos);
    auto res = llama_tokenize_internal(model->vocab, text, add_bos, escape);
    if (n_max_tokens < (int) res.size()) {
        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@ -5809,12 +6148,12 @@ int llama_tokenize_with_model(
    return res.size();
 }
-int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * buf, int length) {
+int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
-    return llama_token_to_str_with_model(&ctx->model, token, buf, length);
+    return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
 }
-// does not write null-terminator to str
+// does not write null-terminator to buf
-int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
+int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
    if (0 <= token && token < llama_model_n_vocab(model)) {
        if (llama_is_normal_token(model->vocab, token)) {
            std::string result = model->vocab.id_to_token[token].text;
@ -5902,6 +6241,7 @@ const char * llama_print_system_info(void) {
    s += "WASM_SIMD = "   + std::to_string(ggml_cpu_has_wasm_simd())   + " | ";
    s += "BLAS = "        + std::to_string(ggml_cpu_has_blas())        + " | ";
    s += "SSE3 = "        + std::to_string(ggml_cpu_has_sse3())        + " | ";
    s += "SSSE3 = "       + std::to_string(ggml_cpu_has_ssse3())       + " | ";
    s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
    return s.c_str();
--- a/llama.h
+++ b/llama.h
@ -254,7 +254,11 @@ extern "C" {
    LLAMA_API int llama_model_n_embd (const struct llama_model * model);
    // Get a string describing the model type
-    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
    // Returns the total size of all the tensors in the model in bytes
    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
    // Returns the total number of parameters in the model
    LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
    // Returns 0 on success
    LLAMA_API int llama_model_quantize(
@ -348,7 +352,7 @@ extern "C" {
    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
-    LLAMA_API llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
    // Special tokens
    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx);  // beginning-of-sentence
@ -377,15 +381,17 @@ extern "C" {
                             int   n_max_tokens,
                            bool   add_bos);
-    // Token Id -> String. Uses the vocabulary in the provided context
+    // Token Id -> Piece.
-    // Does not write null terminator to the buffer
+    // Uses the vocabulary in the provided context.
-    LLAMA_API int llama_token_to_str(
+    // Does not write null terminator to the buffer.
    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
    LLAMA_API int llama_token_to_piece(
            const struct llama_context * ctx,
                           llama_token   token,
                                  char * buf,
                                  int    length);
-    LLAMA_API int llama_token_to_str_with_model(
+    LLAMA_API int llama_token_to_piece_with_model(
              const struct llama_model * model,
                           llama_token   token,
                                  char * buf,
@ -465,6 +471,43 @@ extern "C" {
    /// @details Accepts the sampled token into the grammar
    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
    //
    // Beam search
    //
    // View of a single beam as handed to the beam search callback.
    // The view does not own the token array it points to.
    struct llama_beam_view {
        const llama_token * tokens;  // First of n_tokens tokens belonging to this beam.
        size_t n_tokens;             // Number of elements in tokens[].
        float p;   // Cumulative beam probability (renormalized relative to all beams)
        bool eob;  // Callback should set this to true when a beam is at end-of-beam.
    };
    // Passed to beam_search_callback function.
    // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
    // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
    // These pointers are valid only during the synchronous callback, so should not be saved.
    // The callback may modify p and eob of the individual beam views; the search reads both
    // values back after the callback returns.
    struct llama_beams_state {
        struct llama_beam_view * beam_views;  // Array of n_beams beam views.
        size_t n_beams;               // Number of elements in beam_views[].
        size_t common_prefix_length;  // Current max length of prefix tokens shared by all beams.
        bool last_call;               // True iff this is the last callback invocation.
    };
    // Type of pointer to the beam_search_callback function.
    // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
    // passed back to beam_search_callback. This avoids having to use global variables in the callback.
    // The state is passed by value; the `struct` keyword is required so that this header also
    // compiles as C, where llama_beams_state is only a struct tag and not a type name.
    typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
    /// @details Deterministically returns entire sentence constructed by a beam search.
    /// @param ctx Pointer to the llama_context.
    /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
    /// @param callback_data A pointer that is simply passed back to callback.
    /// @param n_beams Number of beams to use.
    /// @param n_past Number of tokens already evaluated.
    /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
    /// @param n_threads Number of threads as passed to llama_eval().
    LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
    // Performance information
    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
    LLAMA_API void llama_print_timings(struct llama_context * ctx);
--- a/requirements.txt
+++ b/requirements.txt
@ -1,2 +1,3 @@
 numpy==1.24
 sentencepiece==0.1.98
 gguf>=0.1.0
--- a/scripts/convert-gg.sh
+++ b/scripts/convert-gg.sh
@ -0,0 +1,26 @@
 #!/bin/bash
 #
 # Convert a set of locally stored models to f16 GGUF files under models/.
 # All paths are relative to the current directory; the source models are read from
 # ../llama1, ../llama2, ../codellama and ../falcon.
 set -e
 # With `set -e` the first failing command aborts the whole run. mv does not create
 # missing directories (and the converters are not assumed to), so create every
 # output directory up front.
 mkdir -p models/llama-7b models/llama-13b models/llama-30b models/llama-65b
 mkdir -p models/llama-7b-v2 models/llama-13b-v2 models/llama-70b-v2
 mkdir -p models/codellama-7b models/codellama-13b models/codellama-34b
 mkdir -p models/falcon-7b models/falcon-40b
 # LLaMA v1
 python3 convert.py ../llama1/7B  --outfile models/llama-7b/ggml-model-f16.gguf  --outtype f16
 python3 convert.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16
 python3 convert.py ../llama1/30B --outfile models/llama-30b/ggml-model-f16.gguf --outtype f16
 python3 convert.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16
 # LLaMA v2
 python3 convert.py ../llama2/llama-2-7b  --outfile models/llama-7b-v2/ggml-model-f16.gguf  --outtype f16
 python3 convert.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16
 python3 convert.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16
 # Code Llama
 python3 convert.py ../codellama/CodeLlama-7b/  --outfile models/codellama-7b/ggml-model-f16.gguf  --outtype f16
 python3 convert.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16
 python3 convert.py ../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16
 # Falcon
 # The converted file is expected inside the source model directory and is then moved into models/.
 # NOTE(review): the trailing "1" presumably selects f16 output - confirm against the converter.
 python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-7b  1
 mv -v ../falcon/falcon-7b/ggml-model-f16.gguf models/falcon-7b/ggml-model-f16.gguf
 python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-40b 1
 mv -v ../falcon/falcon-40b/ggml-model-f16.gguf models/falcon-40b/ggml-model-f16.gguf
--- a/scripts/perf-run-all.sh
+++ b/scripts/perf-run-all.sh
@ -1,93 +0,0 @@
 #!/bin/bash
 #
 # Measure the performance (time per token) of the various quantization techniques
 #
 QUANTIZE=0
 if [ "$1" != "" ]; then
    echo "Quantizing"
    QUANTIZE=1
 fi
 if [ "$QUANTIZE" != "0" ]; then
    #
    # quantize
    #
    # 7B
    time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-7b-q4_0.txt
    time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-7b-q4_1.txt
    time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-7b-q5_0.txt
    time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-7b-q5_1.txt
    time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-7b-q8_0.txt
    # 13B
    time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-13b-q4_0.txt
    time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-13b-q4_1.txt
    time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-13b-q5_0.txt
    time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-13b-q5_1.txt
    time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-13b-q8_0.txt
 fi
 #
 # perf
 # run each command twice
 #
 set -x
 # 7B - 4 threads
     ./bin/main -m ../models/7B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
 time ./bin/main -m ../models/7B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-f16.txt  | grep llama_print_timings
     ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
 time ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q4_0.txt | grep llama_print_timings
     ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
 time ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q4_1.txt | grep llama_print_timings
     ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
 time ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q5_0.txt | grep llama_print_timings
     ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
 time ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q5_1.txt | grep llama_print_timings
     ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
 time ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q8_0.txt | grep llama_print_timings
 # 7B - 8 threads
     ./bin/main -m ../models/7B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
 time ./bin/main -m ../models/7B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-f16.txt  | grep llama_print_timings
     ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
 time ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q4_0.txt | grep llama_print_timings
     ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
 time ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q4_1.txt | grep llama_print_timings
     ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
 time ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q5_0.txt | grep llama_print_timings
     ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
 time ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q5_1.txt | grep llama_print_timings
     ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
 time ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q8_0.txt | grep llama_print_timings
 # 13B - 4 threads
     ./bin/main -m ../models/13B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
 time ./bin/main -m ../models/13B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-f16.txt  | grep llama_print_timings
     ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
 time ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q4_0.txt | grep llama_print_timings
     ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
 time ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q4_1.txt | grep llama_print_timings
     ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
 time ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q5_0.txt | grep llama_print_timings
     ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
 time ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q5_1.txt | grep llama_print_timings
     ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
 time ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q8_0.txt | grep llama_print_timings
 # 13B - 8 threads
     ./bin/main -m ../models/13B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
 time ./bin/main -m ../models/13B/ggml-model-f16.bin  -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-f16.txt  | grep llama_print_timings
     ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
 time ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q4_0.txt | grep llama_print_timings
     ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
 time ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q4_1.txt | grep llama_print_timings
     ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
 time ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q5_0.txt | grep llama_print_timings
     ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
 time ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q5_1.txt | grep llama_print_timings
     ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
 time ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q8_0.txt | grep llama_print_timings
--- a/scripts/ppl-run-all.sh
+++ b/scripts/ppl-run-all.sh
@ -1,39 +0,0 @@
 #!/bin/bash
 #
 # Quantizes the 7B and 13B f16 models to every listed type and then measures
 # the perplexity of each model file. Every run is timed and its combined
 # stdout/stderr is mirrored into a log file one directory up.
 #
 # Each entry below is "<model dir>:<log tag>".
 #
 # quantize
 #
 for spec in 7B:7b 13B:13b; do
    dir="${spec%%:*}"
    tag="${spec##*:}"
    for q in q4_0 q4_1 q5_0 q5_1 q8_0; do
        time ./bin/quantize ../models/${dir}/ggml-model-f16.bin ../models/${dir}/ggml-model-${q}.bin ${q} 2>&1 | tee ../qnt-${tag}-${q}.txt
    done
 done
 #
 # perplexity
 #
 for spec in 7B:7b 13B:13b; do
    dir="${spec%%:*}"
    tag="${spec##*:}"
    for q in f16 q4_0 q4_1 q5_0 q5_1 q8_0; do
        time ./bin/perplexity -m ../models/${dir}/ggml-model-${q}.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-${tag}-${q}.txt
    done
 done
--- a/scripts/qnt-all.sh
+++ b/scripts/qnt-all.sh
@ -0,0 +1,29 @@
 #!/bin/bash
 #
 # Quantizes ../models/<model>/ggml-model-f16.gguf to each requested type.
 #
 # usage: qnt-all.sh <model> [qnt] [args]
 #   model  name of the model directory under ../models
 #   qnt    whitespace-separated list of quantization types (default: the list below)
 #   args   extra arguments appended to every ./bin/quantize invocation
 #
 # The output of each run is mirrored to ../tmp/results-<model>/qnt-<type>.txt
 qnt=(q8_0 q6_k q5_k q5_1 q5_0 q4_k q4_1 q4_0 q3_k q2_k)
 args=""
 if [ -z "$1" ]; then
    echo "usage: $0 <model> [qnt] [args]"
    echo "default: $0 <model> \"${qnt[*]}\" \"${args}\""
    exit 1
 fi
 if [ -n "$2" ]; then
    # intentionally unquoted: the list is split on whitespace into array elements
    qnt=($2)
 fi
 if [ -n "$3" ]; then
    args="$3"
 fi
 model="$1"
 out="../tmp/results-${model}"
 # Stop at the first failure. Without pipefail the status of the pipeline below
 # would be the status of tee, and a failing quantize would go unnoticed.
 set -e
 set -o pipefail
 mkdir -p "${out}"
 for q in "${qnt[@]}"; do
    # ${args} is intentionally unquoted so that it expands to separate arguments
    time ./bin/quantize "../models/${model}/ggml-model-f16.gguf" "../models/${model}/ggml-model-${q}.gguf" "${q}" ${args} 2>&1 | tee "${out}/qnt-${q}.txt"
 done
--- a/scripts/run-all-perf.sh
+++ b/scripts/run-all-perf.sh
@ -0,0 +1,33 @@
 #!/bin/bash
 #
 # Benchmarks all requested quantizations of a model in one llama-bench run.
 #
 # usage: run-all-perf.sh <model> [qnt] [args]
 #   model  name of the model directory under ../models
 #   qnt    whitespace-separated list of quantization types (default: the list below)
 #   args   arguments passed to ./bin/llama-bench (replaces the default below)
 qnt=(f16 q8_0 q6_k q5_k q5_1 q5_0 q4_k q4_1 q4_0 q3_k q2_k)
 args="-ngl 999 -n 64 -p 512"
 if [ -z "$1" ]; then
    echo "usage: $0 <model> [qnt] [args]"
    echo "default: $0 <model> \"${qnt[*]}\" \"${args}\""
    exit 1
 fi
 if [ -n "$2" ]; then
    # intentionally unquoted: the list is split on whitespace into array elements
    qnt=($2)
 fi
 if [ -n "$3" ]; then
    args="$3"
 fi
 model="$1"
 out="../tmp/results-${model}"
 set -e
 # NOTE: the results directory is created, but nothing in this script writes to it;
 # the llama-bench report goes to stdout.
 mkdir -p "${out}"
 # One "-m <file>" pair per quantization. An array keeps each path a single
 # argument even when the model name contains whitespace.
 models=()
 for q in "${qnt[@]}"; do
    models+=(-m "../models/${model}/ggml-model-${q}.gguf")
 done
 # ${args} is intentionally unquoted so that it expands to separate arguments;
 # stderr of llama-bench is discarded.
 ./bin/llama-bench "${models[@]}" ${args} 2> /dev/null
--- a/scripts/run-all-ppl.sh
+++ b/scripts/run-all-ppl.sh
@ -0,0 +1,29 @@
 #!/bin/bash
 #
 # Measures the perplexity of every requested quantization of a model on
 # ./wiki.test.raw.
 #
 # usage: run-all-ppl.sh <model> [qnt] [args]
 #   model  name of the model directory under ../models
 #   qnt    whitespace-separated list of quantization types (default: the list below)
 #   args   arguments passed to ./bin/perplexity (replaces the default below)
 #
 # The output of each run is mirrored to ../tmp/results-<model>/ppl-<type>.txt
 qnt=(f16 q8_0 q6_k q5_k q5_1 q5_0 q4_k q4_1 q4_0 q3_k q2_k)
 args="-ngl 999 -t 8"
 if [ -z "$1" ]; then
    echo "usage: $0 <model> [qnt] [args]"
    echo "default: $0 <model> \"${qnt[*]}\" \"${args}\""
    exit 1
 fi
 if [ -n "$2" ]; then
    # intentionally unquoted: the list is split on whitespace into array elements
    qnt=($2)
 fi
 if [ -n "$3" ]; then
    args="$3"
 fi
 # Stop at the first failure. Without pipefail the status of the pipeline below
 # would be the status of tee, and a failing perplexity run would go unnoticed.
 set -e
 set -o pipefail
 model="$1"
 out="../tmp/results-${model}"
 mkdir -p "${out}"
 for q in "${qnt[@]}"; do
    # Evaluate the model file of the current quantization type. The f16 file was
    # previously used on every iteration, so all logs held the same f16 result.
    # ${args} is intentionally unquoted so that it expands to separate arguments.
    time ./bin/perplexity -m "../models/${model}/ggml-model-${q}.gguf" -f ./wiki.test.raw ${args} 2>&1 | tee "${out}/ppl-${q}.txt"
 done
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -25,8 +25,10 @@ endfunction()
 llama_build_and_test_executable(test-quantize-fns.cpp)
 llama_build_and_test_executable(test-quantize-perf.cpp)
 llama_build_and_test_executable(test-sampling.cpp)
 # llama tokenizer test: built, then registered as a test that runs against ggml-vocab-llama.gguf
-llama_build_executable(test-tokenizer-0.cpp)
+llama_build_executable(test-tokenizer-0-llama.cpp)
-llama_test_executable (test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
 # falcon tokenizer test: only built for now; its test registration on the next line is commented out
 llama_build_executable(test-tokenizer-0-falcon.cpp)
 #llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 llama_build_executable(test-tokenizer-1.cpp)
 # test-tokenizer-1 requires a BPE vocab. re-enable when we have one.
 #llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
--- a/tests/test-tokenizer-0-falcon.cpp
+++ b/tests/test-tokenizer-0-falcon.cpp
@ -0,0 +1,178 @@
 #include "llama.h"
 #include "common.h"
 #include <cstdio>
 #include <string>
 #include <map>
 #include <vector>
 #include <fstream>
 // Expected tokenizations for the BPE tokenizer test.
 //
 // Returns a map from input text to the token ids the tokenizer is expected to
 // produce for it. main() compares each entry with the result of
 // llama_tokenize(ctx, text, false). The table is a function-local static that
 // is handed out by const reference.
 //
 // The rows are generated with test-tokenizer-0-falcon.py; regenerate them with
 // that script instead of editing the ids by hand.
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
    static std::map<std::string, std::vector<llama_token>> _k_tests = {
        // whitespace-only inputs (the empty string expects no tokens at all)
        { ""                      , {  }, },
        { " "                     , {     204, }, },
        { "  "                    , {     258, }, },
        { "   "                   , {     466, }, },
        { "\t"                    , {     192, }, },
        { "\n"                    , {     193, }, },
        { "\t\n"                  , {   19125, }, },
        // plain ASCII text, with and without a leading space
        { "Hello world"           , {    9856,   1079, }, },
        { " Hello world"          , {   23090,   1079, }, },
        { "Hello World"           , {    9856,   2889, }, },
        { " Hello World"          , {   23090,   2889, }, },
        { " Hello World!"         , {   23090,   2889,     12, }, },
        { "Hello, world!"         , {    9856,     23,   1079,     12, }, },
        { " Hello, world!"        , {   23090,     23,   1079,     12, }, },
        // non-ASCII input: emoji, Cyrillic, Khmer
        { " this is 🦙.cpp"        , {     414,    304,   3346,    111,    231,     25,  29247, }, },
        { "w048 7tuijk dsdfhu"    , {      98,  55866,    204,     34,  16682,   7149,  36190,   6869,  11481, }, },
        { "нещо на Български"     , {     150,    133,   6207,    151,    215,    150,    134,   5052,    133,   6279,   5052,    223,    151,    216,  49679,    123,  53110,  47043,   7795, }, },
        { "កាន់តែពិសេសអាចខលចេញ"   , {   38154,    206,  38154,    126,  38154,    225,    167,    237,    217,  38154,    221,    167,    237,    208,  38154,    228,  38154,    127,  38154,    237,    167,    237,    207,  38154,    237,  38154,    107,  38154,    126,  38154,    211,  38154,    207,  38154,    233,  38154,    211,    167,    237,    207,  38154,    215, }, },
        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", {    2571,    232,    206,    204,     19,  11003,     20,   8196,    126,    283,    219,  48778,    116,  13392,    204,     19,  51831,    732,  63209,   1741,   7955,    522,     20,  22438,    211,    204,     19,   7927,  53360,    325,    504,    701,    946,  10930,     20, }, },
        // increasing amounts of leading whitespace in front of the same word
        { "Hello"                 , {    9856, }, },
        { " Hello"                , {   23090, }, },
        { "  Hello"               , {     204,  23090, }, },
        { "   Hello"              , {     258,  23090, }, },
        { "    Hello"             , {     466,  23090, }, },
        { "    Hello\n    Hello"  , {     466,  23090,    742,  23090, }, },
    };
    return _k_tests;
 }
 int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
        return 1;
    }
    const std::string fname = argv[1];
    std::string fname_text;
    if (argc > 2) {
        fname_text = argv[2];
    }
    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
    llama_model * model;
    llama_context * ctx;
    llama_backend_init(false);
    // load the vocab
    {
        auto lparams = llama_context_default_params();
        lparams.vocab_only = true;
        model = llama_load_model_from_file(fname.c_str(), lparams);
        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            return 1;
        }
        ctx = llama_new_context_with_model(model, lparams);
        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            llama_free_model(model);
            return 1;
        }
    }
    if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_BPE) {
        fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
        llama_free_model(model);
        llama_free(ctx);
        return 2;
    }
    bool success = true;
    for (const auto & test_kv : k_tests()) {
        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, false);
        printf("\n");
        printf("src: '%s'\n", test_kv.first.c_str());
        printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
        printf("tok: ");
        for (const auto & tok : res) {
            printf("%d ", tok);
        }
        printf("\n");
        bool correct = res.size() == test_kv.second.size();
        for (int i = 0; i < (int) res.size() && correct; ++i) {
            if (test_kv.second[i] != res[i]) {
                correct = false;
            }
        }
        if (!correct) {
            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
                llama_detokenize_bpe(ctx, res).c_str(),
                llama_detokenize_bpe(ctx, test_kv.second).c_str());
            fprintf(stderr, "%s : expected tokens: ", __func__);
            for (const auto & t : test_kv.second) {
                fprintf(stderr, "%6d, ", t);
            }
            fprintf(stderr, "\n");
            fprintf(stderr, "%s : got tokens:      ", __func__);
            for (const auto & t : res) {
                fprintf(stderr, "%6d, ", t);
            }
            fprintf(stderr, "\n");
            success = false;
        }
    }
    if (!fname_text.empty()) {
        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
        std::string text;
        {
            std::ifstream ifs(fname_text);
            if (!ifs) {
                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
                return 1;
            }
            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
        }
        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
        {
            const std::string fname_out = fname_text + ".tokcpp";
            std::ofstream ofs(fname_out);
            if (!ofs) {
                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
                return 1;
            }
            for (const auto & tok : res) {
                ofs << tok << " ";
            }
            ofs << "\n";
        }
        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
    }
    llama_free_model(model);
    llama_free(ctx);
    llama_backend_free();
    return success ? 0 : 3;
 }
--- a/tests/test-tokenizer-0-falcon.py
+++ b/tests/test-tokenizer-0-falcon.py
@ -0,0 +1,83 @@
 # tests with BPE tokenizer
 import os
 import sys
 import argparse
 from transformers import AutoTokenizer
 # Command line: a tokenizer directory and, optionally, a text file to tokenize.
 parser = argparse.ArgumentParser()
 parser.add_argument("dir_tokenizer", help="directory containing the tokenizer files, as accepted by AutoTokenizer.from_pretrained")
 parser.add_argument("--fname-tok",   help="path to a text file to tokenize")
 args = parser.parse_args()
 dir_tokenizer = args.dir_tokenizer
 tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
 # Test inputs: whitespace-only strings, plain ASCII, non-ASCII scripts, emoji
 # and varying amounts of leading whitespace.
 tests = [
        "",
        " ",
        "  ",
        "   ",
        "\t",
        "\n",
        "\t\n",
        "Hello world",
        " Hello world",
        "Hello World",
        " Hello World",
        " Hello World!",
        "Hello, world!",
        " Hello, world!",
        " this is 🦙.cpp",
        "w048 7tuijk dsdfhu",
        "нещо на Български",
        "កាន់តែពិសេសអាចខលចេញ",
        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
        "Hello",
        " Hello",
        "  Hello",
        "   Hello",
        "    Hello",
        "    Hello\n    Hello",
    ]
 # Print each text with its token ids and the text decoded back from them.
 # Each text is encoded once and the ids are reused.
 for text in tests:
    ids = tokenizer.encode(text)
    print('text: ', text)
    print(ids)
    print(tokenizer.decode(ids))
 # Emit the same data as rows of a C++ initializer list, ready to be pasted
 # into the k_tests() table of the C++ test.
 print("\n\ntests for C++:\n")
 for text in tests:
    res = tokenizer.encode(text)
    # newlines and tabs must appear as escape sequences inside the C++ literal
    k = text.replace('\n', '\\n')
    k = k.replace('\t', '\\t')
    k = '"' + k + '"'
    print("{ %-24s, { " % k, end='')
    for x in res:
        print("%7d," % x, end='')
    print(" }, },")
 # a few ad-hoc encodings for manual inspection
 print(tokenizer.encode('hello'))
 print(tokenizer.encode('world'))
 print(tokenizer.encode(' world'))
 print(tokenizer.encode('hello world'))
 # Optionally tokenize a whole file and write the ids, separated by spaces,
 # to "<file>.tok".
 fname_tok = args.fname_tok
 if fname_tok:
    print('tokenizing file: ', fname_tok)
    fname_out = fname_tok + '.tok'
    # Read as UTF-8 explicitly: the platform default encoding is not guaranteed
    # to be UTF-8 and the inputs contain non-ASCII text.
    with open(fname_tok, 'r', encoding='utf-8') as f_in:
        lines = f_in.readlines()
    s = ''.join(lines)
    res = tokenizer.encode(s)
    # write to file
    with open(fname_out, 'w', encoding='utf-8') as f_out:
        for x in res:
            f_out.write(str(x) + ' ')
        f_out.write('\n')
    print('len(res): ', len(res))
    print('len(lines): ', len(lines))
    print('results written to: ', fname_out)
--- a/tests/test-tokenizer-0-llama.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@ -0,0 +1,182 @@
 #include "llama.h"
 #include "common.h"
 #include <cstdio>
 #include <string>
 #include <map>
 #include <vector>
 #include <fstream>
 // Expected tokenizations for the SPM tokenizer test.
 //
 // Returns a map from input text to the token ids the tokenizer is expected to
 // produce for it. main() compares each entry with the result of
 // llama_tokenize(ctx, text, false), and with the result of
 // llama_tokenize(ctx, text, true) after skipping its first token, so the ids
 // listed here do not include that leading token. The table is a function-local
 // static that is handed out by const reference.
 //
 // The rows are generated with test-tokenizer-0-llama.py; regenerate them with
 // that script instead of editing the ids by hand.
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
    static std::map<std::string, std::vector<llama_token>> _k_tests = {
        // whitespace-only inputs (the empty string expects no tokens at all)
        { ""                      , {  }, },
        { " "                     , {     259, }, },
        { "  "                    , {    1678, }, },
        { "   "                   , {     268, }, },
        { "\t"                    , {   29871,     12, }, },
        { "\n"                    , {   29871,     13, }, },
        { "\t\n"                  , {   29871,     12,     13, }, },
        // plain ASCII text, with and without a leading space
        { "Hello world"           , {   15043,   3186, }, },
        { " Hello world"          , {   29871,  15043,   3186, }, },
        { "Hello World"           , {   15043,   2787, }, },
        { " Hello World"          , {   29871,  15043,   2787, }, },
        { " Hello World!"         , {   29871,  15043,   2787,  29991, }, },
        { "Hello, world!"         , {   15043,  29892,   3186,  29991, }, },
        { " Hello, world!"        , {   29871,  15043,  29892,   3186,  29991, }, },
        // non-ASCII input: emoji, Cyrillic, Khmer
        { " this is 🦙.cpp"        , {   29871,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, },
        { "w048 7tuijk dsdfhu"    , {     281,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, },
        { "нещо на Български"     , {    1538,   4851,    665,   1386,  29713,   1305, }, },
        { "កាន់តែពិសេសអាចខលចេញ"   , {   29871,  31849,  31324,  31934,    228,    162,    142,    228,    161,    146,    228,    162,    133,    228,    161,    153,    228,    161,    186,  31708,    228,    162,    132,  31708,    228,    161,    165,  31324,    228,    161,    136,    228,    161,    132,    228,    161,    158,    228,    161,    136,    228,    162,    132,    228,    161,    140, }, },
        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", {   29871,    243,    162,    157,    131,    313,   8945,  29897,  29871,    243,    162,    155,    185,  30722,    243,    162,    143,    174,  30598,    313,  20787,    953,   3848,    275,  16125,    630,  29897,  29871,  31681,    313,   6194,    953,  29877,   2397,    393,    756,    967,   1914,   5993,  29897, }, },
        // increasing amounts of leading whitespace in front of the same word
        { "Hello"                 , {   15043, }, },
        { " Hello"                , {   29871,  15043, }, },
        { "  Hello"               , {     259,  15043, }, },
        { "   Hello"              , {    1678,  15043, }, },
        { "    Hello"             , {     268,  15043, }, },
        { "    Hello\n    Hello"  , {     268,  15043,     13,   1678,  15043, }, },
    };
    return _k_tests;
 }
 int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
        return 1;
    }
    const std::string fname = argv[1];
    std::string fname_text;
    if (argc > 2) {
        fname_text = argv[2];
    }
    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
    llama_model * model;
    llama_context * ctx;
    llama_backend_init(false);
    // load the vocab
    {
        auto lparams = llama_context_default_params();
        lparams.vocab_only = true;
        model = llama_load_model_from_file(fname.c_str(), lparams);
        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            return 1;
        }
        ctx = llama_new_context_with_model(model, lparams);
        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            llama_free_model(model);
            return 1;
        }
    }
    if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_SPM) {
        fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
        llama_free_model(model);
        llama_free(ctx);
        return 2;
    }
    bool success = true;
    for (const auto & test_kv : k_tests()) {
        const std::vector<llama_token> res_bos   = llama_tokenize(ctx, test_kv.first, true);
        const std::vector<llama_token> res_nobos = llama_tokenize(ctx, test_kv.first, false);
        printf("\n");
        printf("src: '%s'\n", test_kv.first.c_str());
        printf("res: '%s'\n", llama_detokenize_spm(ctx, res_bos).c_str());
        printf("tok: ");
        for (const auto & tok : res_bos) {
            printf("%d ", tok);
        }
        printf("\n");
        bool correct = res_nobos.size() == test_kv.second.size() && res_bos.size() == res_nobos.size() + 1 && res_bos[0] == 1;
        for (int i = 0; i < (int) res_nobos.size() && correct; ++i) {
            if (test_kv.second[i] != res_bos[i + 1]) {
                correct = false;
            }
            if (test_kv.second[i] != res_nobos[i]) {
                correct = false;
            }
        }
        if (!correct) {
            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
                llama_detokenize_spm(ctx, res_nobos).c_str(),
                llama_detokenize_spm(ctx, test_kv.second).c_str());
            fprintf(stderr, "%s : expected tokens: ", __func__);
            for (const auto & t : test_kv.second) {
                fprintf(stderr, "%6d, ", t);
            }
            fprintf(stderr, "\n");
            fprintf(stderr, "%s : got tokens:      ", __func__);
            for (const auto & t : res_nobos) {
                fprintf(stderr, "%6d, ", t);
            }
            fprintf(stderr, "\n");
            success = false;
        }
    }
    if (!fname_text.empty()) {
        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
        std::string text;
        {
            std::ifstream ifs(fname_text);
            if (!ifs) {
                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
                return 1;
            }
            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
        }
        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
        {
            const std::string fname_out = fname_text + ".tokcpp";
            std::ofstream ofs(fname_out);
            if (!ofs) {
                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
                return 1;
            }
            for (const auto & tok : res) {
                ofs << tok << " ";
            }
            ofs << "\n";
        }
        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
    }
    llama_free_model(model);
    llama_free(ctx);
    llama_backend_free();
    return success ? 0 : 3;
 }
--- a/tests/test-tokenizer-0-llama.py
+++ b/tests/test-tokenizer-0-llama.py
@ -0,0 +1,95 @@
 # tests with SPM tokenizer
 import os
 import sys
 import argparse
 from sentencepiece import SentencePieceProcessor
 # Command line: a tokenizer directory and, optionally, a text file to tokenize.
 parser = argparse.ArgumentParser()
 parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
 parser.add_argument("--fname-tok",   help="path to a text file to tokenize")
 args = parser.parse_args()
 dir_tokenizer = args.dir_tokenizer
 tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')
 # Test inputs: whitespace-only strings, plain ASCII, non-ASCII scripts, emoji
 # and varying amounts of leading whitespace.
 tests = [
        "",
        " ",
        "  ",
        "   ",
        "\t",
        "\n",
        "\t\n",
        "Hello world",
        " Hello world",
        "Hello World",
        " Hello World",
        " Hello World!",
        "Hello, world!",
        " Hello, world!",
        " this is 🦙.cpp",
        "w048 7tuijk dsdfhu",
        "нещо на Български",
        "កាន់តែពិសេសអាចខលចេញ",
        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
        "Hello",
        " Hello",
        "  Hello",
        "   Hello",
        "    Hello",
        "    Hello\n    Hello",
    ]
 # Print each text with its token ids and the text decoded back from them,
 # once with and once without BOS. Each variant is encoded only once.
 for text in tests:
    ids_bos = tokenizer.encode(text, add_bos=True)
    ids_nobos = tokenizer.encode(text, add_bos=False)
    print('text: ', text)
    print('\nwith bos:')
    print(ids_bos)
    print(tokenizer.decode(ids_bos))
    print('\nwithout bos:')
    print(ids_nobos)
    print(tokenizer.decode(ids_nobos))
 # Spot checks of how individual pieces decode; the trailing comments give the
 # output the original author recorded.
 print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello'
 print("'" + tokenizer.id_to_piece(29871) + "'") # '_'
 print("'" + tokenizer.decode([15043]) + "'")        # 'Hello'
 print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello'
 print("'" + tokenizer.decode([29871, 15043]) + "'")               # ' Hello'
 print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello  Hello'
 # Emit the BOS-free ids as rows of a C++ initializer list, ready to be pasted
 # into the k_tests() table of the C++ test.
 print("\n\ntests for C++:\n")
 for text in tests:
    res = tokenizer.encode(text, add_bos=False)
    # newlines and tabs must appear as escape sequences inside the C++ literal
    k = text.replace('\n', '\\n')
    k = k.replace('\t', '\\t')
    k = '"' + k + '"'
    print("{ %-24s, { " % k, end='')
    for x in res:
        print("%7d," % x, end='')
    print(" }, },")
 # a few ad-hoc encodings for manual inspection
 print(tokenizer.encode('hello'))
 print(tokenizer.encode('world'))
 print(tokenizer.encode(' world'))
 print(tokenizer.encode('hello world'))
 # Optionally tokenize a whole file (with BOS) and write the ids, separated by
 # spaces, to "<file>.tok".
 fname_tok = args.fname_tok
 if fname_tok:
    print('tokenizing file: ', fname_tok)
    fname_out = fname_tok + '.tok'
    # Read as UTF-8 explicitly: the platform default encoding is not guaranteed
    # to be UTF-8 and the inputs contain non-ASCII text.
    with open(fname_tok, 'r', encoding='utf-8') as f_in:
        lines = f_in.readlines()
    s = ''.join(lines)
    res = tokenizer.encode(s, add_bos=True)
    # write to file
    with open(fname_out, 'w', encoding='utf-8') as f_out:
        for x in res:
            f_out.write(str(x) + ' ')
        f_out.write('\n')
    print('len(res): ', len(res))
    print('len(lines): ', len(lines))
    print('results written to: ', fname_out)
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@ -1,140 +0,0 @@
 #include "llama.h"
 #include "common.h"
 #include <cstdio>
 #include <string>
 #include <map>
 #include <vector>
 // Builds a single string by appending, in order, the text that
 // llama_token_to_str reports for each token in `tokens`.
 static std::string unescape_whitespace(llama_context* ctx, const std::vector<llama_token>& tokens) {
    std::string text;
    for (const auto & token : tokens) {
        text += llama_token_to_str(ctx, token);
    }
    return text;
 }
 // Reference tokenizations checked by main(): maps an input string to the
 // token ids the tokenizer is expected to produce for it.
 //
 // Every expected sequence starts with id 1; main() calls llama_tokenize with
 // its last argument set to true (presumably "add BOS" - confirm against llama.h).
 //
 // The table is a function-local static, so it is constructed once on the first
 // call and afterwards handed out by const reference.
 // Note: std::map iterates in key order, so the tests run sorted by input
 // string rather than in the order they are listed here.
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
    static std::map<std::string, std::vector<llama_token>> _k_tests = {
        // whitespace-only inputs
        { " ",                      {1,    259, }, },
        { "  ",                     { 1,    1678, }, },
        { "   ",                    { 1,     268, }, },
        { "\t",                     { 1,    29871,   12, }, },
        { "\n",                     { 1,    29871,   13, }, },
        { "\t\n",                   { 1,    29871,   12,     13, }, },
        // plain ASCII text, with and without a leading space
        { "Hello world",            { 1,  15043,   3186, }, },
        { " Hello world",           { 1,  29871,  15043,   3186, }, },
        { "Hello World",            { 1,  15043,   2787, }, },
        { " Hello World",           { 1,  29871,  15043,   2787, }, },
        { " Hello World!",          { 1,  29871,  15043,   2787,  29991, }, },
        // non-ASCII input: emoji, Cyrillic, Khmer
        { " this is 🦙.cpp",        { 1,  29871,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, },
        { "w048 7tuijk dsdfhu",     { 1,    281,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, },
        { "нещо на Български",      { 1,   1538,   4851,    665,   1386,  29713,   1305, }, },
        { "កាន់តែពិសេសអាចខលចេញ",   { 1,  29871,  31849,  31324,  31934,    228,    162,    142,    228,    161,
                                     146,    228,    162,    133,    228,    161,    153,    228,    161,    186,
                                     31708,    228,    162,    132,  31708,    228,    161,    165,  31324,    228,
                                     161,    136,    228,    161,    132,    228,    161,    158,    228,    161,
                                     136,    228,    162,    132,    228,    161,    140, }, },
        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
            { 1,  29871,    243,    162,    157,    131,    313,   8945,  29897,  29871,
                243,    162,    155,    185,  30722,    243,    162,    143,    174,  30598,
                313,  20787,    953,   3848,    275,  16125,    630,  29897,  29871,  31681,
                313,   6194,    953,  29877,   2397,    393,    756,    967,   1914,   5993,  29897, }, },
        // the same word behind an increasing number of leading spaces
        { "Hello",                  { 1,    15043 }, },
        { " Hello",                 { 1,    29871,  15043 }, },
        { "  Hello",                { 1,    259,    15043 }, },
        { "   Hello",               { 1,    1678,   15043 }, },
        { "    Hello",              { 1,    268,    15043 }, },
        { "    Hello\n    Hello",   { 1,    268,    15043,  13,     1678,   15043 }, },
    };
    return _k_tests;
 }
 // Loads only the vocabulary of the model file given as argv[1] and checks that
 // llama_tokenize yields the expected token ids for every entry of k_tests().
 //
 // Exit codes:
 //   0 - every test passed
 //   1 - missing argument, or the model / context could not be created
 //   2 - the vocabulary does not have exactly 32000 entries
 //   3 - at least one tokenization differed from the expected ids
 //
 // Every path taken after llama_backend_init() releases what it acquired, in
 // reverse order of acquisition: context, then model, then backend.
 int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
        return 1;
    }
    const std::string fname = argv[1];
    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
    llama_model * model = nullptr;
    llama_context * ctx = nullptr;
    llama_backend_init(false);
    // load the vocab
    {
        auto lparams = llama_context_default_params();
        lparams.vocab_only = true;
        model = llama_load_model_from_file(fname.c_str(), lparams);
        if (model == nullptr) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            llama_backend_free();
            return 1;
        }
        ctx = llama_new_context_with_model(model, lparams);
        if (ctx == nullptr) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            llama_free_model(model);
            llama_backend_free();
            return 1;
        }
    }
    const int n_vocab = llama_n_vocab(ctx);
    if (n_vocab != 32000) {
        fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
        // the context was created from the model, so it is released first
        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 2;
    }
    bool success = true;
    for (const auto & test_kv : k_tests()) {
        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true);
        fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
            __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());
        // vector equality compares the sizes and then every element
        const bool correct = res == test_kv.second;
        if (!correct) {
            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
                unescape_whitespace(ctx, res).c_str(), unescape_whitespace(ctx, test_kv.second).c_str());
            fprintf(stderr, "%s : expected tokens: ", __func__);
            for (const auto & t : test_kv.second) {
                fprintf(stderr, "%6d, ", t);
            }
            fprintf(stderr, "\n");
            fprintf(stderr, "%s : got tokens:      ", __func__);
            for (const auto & t : res) {
                fprintf(stderr, "%6d, ", t);
            }
            fprintf(stderr, "\n");
            // keep going so that every failing entry gets reported
            success = false;
        }
    }
    // the context was created from the model, so it is released first
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return success ? 0 : 3;
 }
--- a/tests/test-tokenizer-1.cpp
+++ b/tests/test-tokenizer-1.cpp
@ -22,14 +22,6 @@ static std::string escape_whitespace(const std::string& text) {
    return result;
 }
 // Joins the pieces that llama_token_to_str returns for each entry of
 // `tokens`, preserving their order, into one string.
 static std::string unescape_whitespace(llama_context * ctx, const std::vector<llama_token> & tokens) {
    std::string joined;
    for (const auto & tok : tokens) {
        joined += llama_token_to_str(ctx, tok);
    }
    return joined;
 }
 int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
@ -72,13 +64,13 @@ int main(int argc, char **argv) {
    const int n_vocab = llama_n_vocab(ctx);
    for (int i = 0; i < n_vocab; ++i) {
-        std::string forward = llama_token_to_str(ctx, i);
+        std::string forward = llama_token_to_piece(ctx, i);
        std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
        if (tokens.size() == 1) {
            if (i != tokens[0]) {
-                std::string backward = llama_token_to_str(ctx, tokens[0]);
+                std::string backward = llama_token_to_piece(ctx, tokens[0]);
                fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
-                    __func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str());
+                    __func__, i, llama_token_to_piece(ctx, i).c_str(), tokens[0], backward.c_str());
                return 2;
            }
        }