Merge branch 'master' of https://github.com/JoanFM/llama.cpp into feat-jina-v2-base-code

Author: Joan Martinez
Date:   2024-05-31 15:22:31 +02:00
Commit: 96a6f55222

59 changed files with 1176 additions and 1019 deletions


@@ -31,6 +31,6 @@ ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1
-RUN make
+RUN make -j$(nproc)
 ENTRYPOINT ["/app/.devops/tools.sh"]
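As a quick check of this change, the image can be rebuilt locally; the Dockerfile path and tag below are assumptions, since the diff view does not name the file:

```sh
# rebuild the CUDA "full" image; the build stage now runs make -j$(nproc)
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
```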


@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev
-RUN make
+RUN make -j$(nproc)
 ENTRYPOINT ["/app/.devops/tools.sh"]


@@ -18,7 +18,7 @@ COPY . .
 ENV LLAMA_CURL=1
-RUN make
+RUN make -j$(nproc)
 ENV LC_ALL=C.utf8


@@ -23,7 +23,7 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
 ENV LLAMA_CUDA=1
-RUN make
+RUN make -j$(nproc)
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime


@@ -2,6 +2,14 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+    rm /etc/apt/sources.list.d/intel-graphics.list && \
+    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+    chmod 644 /usr/share/keyrings/intel-graphics.gpg
 ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
     apt-get install -y git


@@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
-RUN make
+RUN make -j$(nproc)
 ENTRYPOINT [ "/app/main" ]


@@ -9,7 +9,7 @@ WORKDIR /app
 COPY . .
-RUN make
+RUN make -j$(nproc)
 FROM ubuntu:$UBUNTU_VERSION as runtime


@@ -25,7 +25,7 @@ ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1
-RUN make
+RUN make -j$(nproc)
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime


@@ -2,6 +2,14 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+    rm /etc/apt/sources.list.d/intel-graphics.list && \
+    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+    chmod 644 /usr/share/keyrings/intel-graphics.gpg
 ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
     apt-get install -y git libcurl4-openssl-dev

@@ -19,6 +27,14 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+    rm /etc/apt/sources.list.d/intel-graphics.list && \
+    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+    chmod 644 /usr/share/keyrings/intel-graphics.gpg
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev


@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev
-RUN make
+RUN make -j$(nproc)
 ENTRYPOINT [ "/app/server" ]


@@ -11,7 +11,7 @@ COPY . .
 ENV LLAMA_CURL=1
-RUN make
+RUN make -j$(nproc)
 FROM ubuntu:$UBUNTU_VERSION as runtime


@@ -8,7 +8,7 @@ arg1="$1"
 shift

 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert.py "$@"
+    python3 ./convert-hf-to-gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
     ./quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then


@@ -1,38 +0,0 @@
name: Question
description: Used to ask questions about llama.cpp
title: "Question: "
labels: ["question"]
body:
  - type: markdown
    attributes:
      value: |
        [Please search your question first in Discussion if you got a common general question.](https://github.com/ggerganov/llama.cpp/discussions/categories/q-a)
  - type: checkboxes
    id: prerequisites
    attributes:
      label: Prerequisites
      description: Please confirm the following before submitting your question.
      options:
        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
          required: true
        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new useful question to share that cannot be answered within Discussions.
          required: true
  - type: textarea
    id: background-description
    attributes:
      label: Background Description
      description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an question.
      placeholder: Detailed description of your question
    validations:
      required: true
  - type: textarea
    id: possible-answer
    attributes:
      label: Possible Answer
      description: If you have some idea of possible answers you want to confirm, that would also be appreciated.
      placeholder: Your idea of possible answers
    validations:
      required: false

.github/ISSUE_TEMPLATE/06-research.yml (new file, 52 lines)

@@ -0,0 +1,52 @@
name: Research
description: Track new technical research area
title: "Research: "
labels: ["research 🔬"]
body:
  - type: markdown
    attributes:
      value: |
        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
  - type: checkboxes
    id: research-stage
    attributes:
      label: Research Stage
      description: Track general state of this research ticket
      options:
        - label: Background Research (Let's try to avoid reinventing the wheel)
        - label: Hypothesis Formed (How do you think this will work and it's effect?)
        - label: Strategy / Implementation Forming
        - label: Analysis of results
        - label: Debrief / Documentation (So people in the future can learn from us)
  - type: textarea
    id: background
    attributes:
      label: Previous existing literature and research
      description: Whats the current state of the art and whats the motivation for this research?
  - type: textarea
    id: hypothesis
    attributes:
      label: Hypothesis
      description: How do you think this will work and it's effect?
  - type: textarea
    id: implementation
    attributes:
      label: Implementation
      description: Got an approach? e.g. a PR ready to go?
  - type: textarea
    id: analysis
    attributes:
      label: Analysis
      description: How does the proposed implementation behave?
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell

.github/ISSUE_TEMPLATE/config.yml (new file, 13 lines)

@@ -0,0 +1,13 @@
blank_issues_enabled: true
contact_links:
  - name: Got an idea?
    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
    about: Pop it there. It may then become an enhancement ticket.
  - name: Got a question?
    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
    about: Ask a question there!
  - name: Want to contribute?
    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
    about: Head to the contribution guide page of the wiki for areas you can help with


@@ -42,9 +42,8 @@ jobs:
           - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          # TODO: Disabled due to build issues https://github.com/ggerganov/llama.cpp/issues/7507
-          #- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
-          #- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
     steps:
       - name: Check out the repo
         uses: actions/checkout@v4


@@ -628,6 +628,10 @@ if (LLAMA_SYCL)
         add_compile_definitions(GGML_SYCL_F16)
     endif()

+    if (LLAMA_CUDA_FORCE_MMQ)
+        add_compile_definitions(GGML_SYCL_FORCE_MMQ)
+    endif()
+
     add_compile_options(-I./) #include DPCT
     add_compile_options(-I/${SYCL_INCLUDE_DIR})

@@ -1310,7 +1314,7 @@ set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}
 install(TARGETS llama LIBRARY PUBLIC_HEADER)

 install(
-    FILES convert.py
+    FILES convert-hf-to-gguf.py
     PERMISSIONS
         OWNER_READ
         OWNER_WRITE
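A sketch of how the new toggle could be exercised in a SYCL build; the compiler choice mirrors the SYCL setup used elsewhere in the repo, and the exact flags here are assumptions rather than a documented invocation:

```sh
source /opt/intel/oneapi/setvars.sh
# configure a SYCL build and force the MMQ kernels via the new option
cmake -B build -DLLAMA_SYCL=ON -DLLAMA_CUDA_FORCE_MMQ=ON \
      -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build build --config Release -j
```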


@@ -54,10 +54,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
 ## OS

 | OS | Status | Verified |
-|---------|---------|------------------------------------|
-| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39 |
+|---------|---------|------------------------------------------------|
+| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39, Arch Linux |
 | Windows | Support | Windows 11 |

 ## Hardware

@@ -70,7 +70,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
 |-------------------------------|---------|---------------------------------------|
 | Intel Data Center Max Series | Support | Max 1550, 1100 |
 | Intel Data Center Flex Series | Support | Flex 170 |
-| Intel Arc Series | Support | Arc 770, 730M |
+| Intel Arc Series | Support | Arc 770, 730M, Arc A750 |
 | Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
 | Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |


@@ -2,7 +2,9 @@
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

-[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
+[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

@@ -20,7 +22,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 ### Hot topics

-- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
+- **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`, please use `convert-hf-to-gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
+- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
 - BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
 - MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
 - Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
@@ -200,6 +203,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
 - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
+- [AIKit](https://github.com/sozercan/aikit) (MIT)

 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

@@ -315,8 +319,6 @@ In order to build llama.cpp you have four different options.
     make
     ```

-    **Note**: for `Debug` builds, run `make LLAMA_DEBUG=1`
-
 - On Windows:

   1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
@@ -328,23 +330,32 @@ In order to build llama.cpp you have four different options.
     make
     ```

+  - Notes:
+    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
+    - For faster repeated compilation, install [ccache](https://ccache.dev/).
+    - For debug builds, run `make LLAMA_DEBUG=1`
+
 - Using `CMake`:

   ```bash
   cmake -B build
   cmake --build build --config Release
   ```

-  **Note**: for `Debug` builds, there are two cases:
-
-  - Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
+  **Notes**:
+
+  - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
+  - For faster repeated compilation, install [ccache](https://ccache.dev/).
+  - For debug builds, there are two cases:
+
+    1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):

    ```bash
    cmake -B build -DCMAKE_BUILD_TYPE=Debug
    cmake --build build
    ```

-  - Multi-config generators (`-G` param set to Visual Studio, XCode...):
+    2. Multi-config generators (`-G` param set to Visual Studio, XCode...):

    ```bash
    cmake -B build -G "Xcode"
@@ -379,6 +390,14 @@ In order to build llama.cpp you have four different options.
     CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read
     the instructions for use and activate this options in this document below.

+### Homebrew
+
+On Mac and Linux, the homebrew package manager can be used via
+```
+brew install llama.cpp
+```
+The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
+
 ### Metal Build

 On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
@@ -477,7 +496,8 @@ Building the program with BLAS support may lead to some performance improvements
 |--------------------------------|------------------------|---------|---------------------------------------------------------------------------------------------------------------------|
 | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
 | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
+| LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. |
 | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
 | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
 | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
@@ -696,7 +716,8 @@ Building the program with BLAS support may lead to some performance improvements
 To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.

-Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
+Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derievatives.
+It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.

 ```bash
 # obtain the official LLaMA model weights and place them in ./models

@@ -713,10 +734,10 @@ ls ./models
 python3 -m pip install -r requirements.txt

 # convert the model to ggml FP16 format
-python3 convert.py models/mymodel/
+python3 convert-hf-to-gguf.py models/mymodel/

 # [Optional] for models using BPE tokenizers
-python convert.py models/mymodel/ --vocab-type bpe
+python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe

 # quantize the model to 4-bits (using Q4_K_M method)
 ./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M


@@ -287,7 +287,7 @@ function gg_run_open_llama_7b_v2 {
     (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
     (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

     model_f16="${path_models}/ggml-model-f16.gguf"
     model_q8_0="${path_models}/ggml-model-q8_0.gguf"


@@ -25,8 +25,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf

-from convert import LlamaHfVocab
-
 logger = logging.getLogger("hf-to-gguf")

@@ -631,7 +629,7 @@ class Model:
         special_vocab.add_to_gguf(self.gguf_writer)

     def _set_vocab_llama_hf(self):
-        vocab = LlamaHfVocab(self.dir_model)
+        vocab = gguf.LlamaHfVocab(self.dir_model)
         tokens = []
         scores = []
         toktypes = []

@@ -1314,6 +1312,17 @@ class LlamaModel(Model):
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                 self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+        # Apply to granite small models only
+        if self.hparams.get("vocab_size", 32000) == 49152:
+            self.gguf_writer.add_add_bos_token(False)
+
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:

@@ -1328,9 +1337,9 @@ class LlamaModel(Model):
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")

-        if name.endswith("q_proj.weight"):
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith("k_proj.weight"):
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

         # process the experts separately
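To illustrate the new tokenizer handling, a conversion of a Granite-style checkpoint might look like the following; the model path is hypothetical, while `--outfile` and `--outtype` are existing options of the script:

```sh
# add_prefix_space is picked up from tokenizer_config.json;
# a vocab_size of 49152 additionally disables add_bos_token
python3 convert-hf-to-gguf.py models/granite-3b-code-base/ \
    --outfile models/granite-3b-code-base/ggml-model-f16.gguf --outtype f16
```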


@@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
 ### 1. Convert the model to GGUF

 This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
-Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).
+Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format).
 The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.


@@ -24,14 +24,16 @@ from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
+from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional

 import numpy as np
-from sentencepiece import SentencePieceProcessor

 if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+    # use .parent.parent since we are in "examples" directory
+    sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py'))
 import gguf
+from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab

 if TYPE_CHECKING:
     from typing_extensions import Self, TypeAlias
@@ -380,306 +382,6 @@ class Metadata:
         return metadata

-#
-# vocab
-#
-
-@runtime_checkable
-class BaseVocab(Protocol):
-    tokenizer_model: ClassVar[str]
-    name: ClassVar[str]
-
-
-class NoVocab(BaseVocab):
-    tokenizer_model = "no_vocab"
-    name = "no_vocab"
-
-    def __repr__(self) -> str:
-        return "<NoVocab for a model without integrated vocabulary>"
-
-
-@runtime_checkable
-class Vocab(BaseVocab, Protocol):
-    vocab_size: int
-    added_tokens_dict: dict[str, int]
-    added_tokens_list: list[str]
-    fname_tokenizer: Path
-
-    def __init__(self, base_path: Path): ...
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
-
-
-class BpeVocab(Vocab):
-    tokenizer_model = "gpt2"
-    name = "bpe"
-
-    def __init__(self, base_path: Path):
-        added_tokens: dict[str, int] = {}
-
-        if (fname_tokenizer := base_path / 'vocab.json').exists():
-            # "slow" tokenizer
-            with open(fname_tokenizer, encoding="utf-8") as f:
-                self.vocab = json.load(f)
-
-            try:
-                # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
-                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
-                    added_tokens = json.load(f)
-            except FileNotFoundError:
-                pass
-        else:
-            # "fast" tokenizer
-            fname_tokenizer = base_path / FAST_TOKENIZER_FILE
-
-            # if this fails, FileNotFoundError propagates to caller
-            with open(fname_tokenizer, encoding="utf-8") as f:
-                tokenizer_json = json.load(f)
-
-            tokenizer_model: dict[str, Any] = tokenizer_json['model']
-            if (
-                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
-                or tokenizer_json['decoder']['type'] != 'ByteLevel'
-            ):
-                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
-
-            self.vocab = tokenizer_model["vocab"]
-
-            if (added := tokenizer_json.get('added_tokens')) is not None:
-                # Added tokens here can be duplicates of the main vocabulary.
-                added_tokens = {item['content']: item['id']
-                                for item in added
-                                if item['content'] not in self.vocab}
-
-        vocab_size = len(self.vocab)
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
-        if expected_ids != actual_ids:
-            expected_end_id = vocab_size + len(actual_ids) - 1
-            raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
-                             f"{vocab_size} - {expected_end_id}; got {actual_ids}")
-
-        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_dict = added_tokens
-        self.added_tokens_list = [text for (text, idx) in items]
-        self.vocab_size_base = vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-
-    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
-
-        for i, _ in enumerate(self.vocab):
-            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
-
-    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
-            score = -1000.0
-            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
-
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.bpe_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
-class SentencePieceVocab(Vocab):
-    tokenizer_model = "llama"
-    name = "spm"
-
-    def __init__(self, base_path: Path):
-        added_tokens: dict[str, int] = {}
-        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
-            # normal location
-            try:
-                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
-                    added_tokens = json.load(f)
-            except FileNotFoundError:
-                pass
-        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
-            # not found in alternate location either
-            raise FileNotFoundError('Cannot find tokenizer.model')
-
-        self.sentencepiece_tokenizer = SentencePieceProcessor()
-        self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
-        vocab_size = self.sentencepiece_tokenizer.vocab_size()
-
-        new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
-        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
-        actual_new_ids = sorted(new_tokens.keys())
-
-        if expected_new_ids != actual_new_ids:
-            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
-
-        # Token pieces that were added to the base vocabulary.
-        self.added_tokens_dict = added_tokens
-        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
-        self.vocab_size_base = vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-
-    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        tokenizer = self.sentencepiece_tokenizer
-        for i in range(tokenizer.vocab_size()):
-            piece = tokenizer.IdToPiece(i)
-            text = piece.encode("utf-8")
-            score: float = tokenizer.GetScore(i)
-
-            toktype = gguf.TokenType.NORMAL
-            if tokenizer.IsUnknown(i):
-                toktype = gguf.TokenType.UNKNOWN
-            if tokenizer.IsControl(i):
-                toktype = gguf.TokenType.CONTROL
-
-            # NOTE: I think added_tokens are user defined.
-            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
-
-            if tokenizer.IsUnused(i):
-                toktype = gguf.TokenType.UNUSED
-            if tokenizer.IsByte(i):
-                toktype = gguf.TokenType.BYTE
-
-            yield text, score, toktype
-
-    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
-            score = -1000.0
-            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
-
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.sentencepiece_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
-class LlamaHfVocab(Vocab):
-    tokenizer_model = "llama"
-    name = "hfft"
-
-    def __init__(self, base_path: Path):
-        fname_tokenizer = base_path / FAST_TOKENIZER_FILE
-        # if this fails, FileNotFoundError propagates to caller
-        with open(fname_tokenizer, encoding='utf-8') as f:
-            tokenizer_json = json.load(f)
-
-        # pre-check so we know if we need transformers
-        tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        is_llama3 = (
-            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
-            and not tokenizer_model.get('byte_fallback', True)
-        )
-        if is_llama3:
-            raise TypeError('Llama 3 must be converted with BpeVocab')
-
-        if not is_llama3 and (
-            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
-            or tokenizer_json['decoder']['type'] != 'Sequence'
-        ):
-            raise FileNotFoundError('Cannot find Llama BPE tokenizer')
-
-        try:
-            from transformers import AutoTokenizer
-        except ImportError as e:
-            raise ImportError(
-                "To use LlamaHfVocab, please install the `transformers` package. "
-                "You can install it with `pip install transformers`."
-            ) from e
-
-        # Allow the tokenizer to default to slow or fast versions.
-        # Explicitly set tokenizer to use local paths.
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            base_path,
-            cache_dir=base_path,
-            local_files_only=True,
-        )
-        assert self.tokenizer.is_fast  # assume tokenizer.json is used
-
-        # Initialize lists and dictionaries for added tokens
-        self.added_tokens_list = []
-        self.added_tokens_dict = dict()
-        self.added_tokens_ids = set()
-
-        # Process added tokens
-        for tok, tokidx in sorted(
-            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
-        ):
-            # Only consider added tokens that are not in the base vocabulary
-            if tokidx >= self.tokenizer.vocab_size:
-                self.added_tokens_list.append(tok)
-                self.added_tokens_dict[tok] = tokidx
-                self.added_tokens_ids.add(tokidx)
-
-        # Store special tokens and their IDs
-        self.specials = {
-            tok: self.tokenizer.get_vocab()[tok]
-            for tok in self.tokenizer.all_special_tokens
-        }
-        self.special_ids = set(self.tokenizer.all_special_ids)
-
-        # Set vocabulary sizes
-        self.vocab_size_base = self.tokenizer.vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
-
-        self.fname_tokenizer = fname_tokenizer
-
-    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        reverse_vocab = {
-            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
-        }
-
-        for token_id in range(self.vocab_size_base):
-            # Skip processing added tokens here
-            if token_id in self.added_tokens_ids:
-                continue
-
-            # Convert token text to bytes
-            token_text = reverse_vocab[token_id].encode("utf-8")
-
-            # Yield token text, score, and type
-            yield token_text, self.get_token_score(token_id), self.get_token_type(
-                token_id, token_text, self.special_ids  # Reuse already stored special IDs
-            )
-
-    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
-        # Special case for byte tokens
-        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
-            return gguf.TokenType.BYTE
-
-        # Determine token type based on whether it's a special token
-        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
-
-    def get_token_score(self, token_id: int) -> float:
-        # Placeholder for actual logic to determine the token's score
-        # This needs to be implemented based on specific requirements
-        return -1000.0  # Default score
-
-    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
-            if text in self.specials:
-                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
-                score = self.get_token_score(self.specials[text])
-            else:
-                toktype = gguf.TokenType.USER_DEFINED
-                score = -1000.0
-
-            yield text.encode("utf-8"), score, toktype
-
-    def has_newline_token(self):
-        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
-
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.hf_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
 #
 # data loading
 # TODO: reuse (probably move to gguf.py?)


@@ -178,6 +178,7 @@ struct cmd_params {
     std::vector<ggml_type> type_v;
     std::vector<int> n_threads;
     std::vector<int> n_gpu_layers;
+    std::vector<std::string> rpc_servers;
     std::vector<llama_split_mode> split_mode;
     std::vector<int> main_gpu;
     std::vector<bool> no_kv_offload;

@@ -202,6 +203,7 @@ static const cmd_params cmd_params_defaults = {
     /* type_v */ {GGML_TYPE_F16},
    /* n_threads */ {cpu_get_num_math()},
    /* n_gpu_layers */ {99},
+   /* rpc_servers */ {""},
    /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
    /* main_gpu */ {0},
    /* no_kv_offload */ {false},

@@ -230,6 +232,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
     printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
     printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
     printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());

@@ -384,6 +387,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
+        } else if (arg == "-rpc" || arg == "--rpc") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rpc_servers.push_back(argv[i]);
         } else if (arg == "-sm" || arg == "--split-mode") {
             if (++i >= argc) {
                 invalid_param = true;

@@ -519,6 +528,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
     if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
     if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
+    if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; }
     if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
     if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
     if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }

@@ -541,6 +551,7 @@ struct cmd_params_instance {
     ggml_type type_v;
     int n_threads;
     int n_gpu_layers;
+    std::string rpc_servers;
     llama_split_mode split_mode;
     int main_gpu;
     bool no_kv_offload;

@@ -553,6 +564,9 @@ struct cmd_params_instance {
         llama_model_params mparams = llama_model_default_params();

         mparams.n_gpu_layers = n_gpu_layers;
+        if (!rpc_servers.empty()) {
+            mparams.rpc_servers = rpc_servers.c_str();
+        }
         mparams.split_mode = split_mode;
         mparams.main_gpu = main_gpu;
         mparams.tensor_split = tensor_split.data();

@@ -564,6 +578,7 @@ struct cmd_params_instance {
     bool equal_mparams(const cmd_params_instance & other) const {
         return model == other.model &&
                n_gpu_layers == other.n_gpu_layers &&
+               rpc_servers == other.rpc_servers &&
                split_mode == other.split_mode &&
                main_gpu == other.main_gpu &&
                use_mmap == other.use_mmap &&

@@ -592,6 +607,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     // this ordering minimizes the number of times that each model needs to be reloaded
     for (const auto & m : params.model)
     for (const auto & nl : params.n_gpu_layers)
+    for (const auto & rpc : params.rpc_servers)
     for (const auto & sm : params.split_mode)
     for (const auto & mg : params.main_gpu)
     for (const auto & ts : params.tensor_split)

@@ -618,6 +634,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_v = */ tv,
                 /* .n_threads = */ nt,
                 /* .n_gpu_layers = */ nl,
+                /* .rpc_servers = */ rpc,
                 /* .split_mode = */ sm,
                 /* .main_gpu = */ mg,
                 /* .no_kv_offload= */ nkvo,

@@ -643,6 +660,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_v = */ tv,
                 /* .n_threads = */ nt,
                 /* .n_gpu_layers = */ nl,
+                /* .rpc_servers = */ rpc,
                 /* .split_mode = */ sm,
                 /* .main_gpu = */ mg,
                 /* .no_kv_offload= */ nkvo,

@@ -668,6 +686,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_v = */ tv,
                 /* .n_threads = */ nt,
                 /* .n_gpu_layers = */ nl,
+                /* .rpc_servers = */ rpc,
                 /* .split_mode = */ sm,
                 /* .main_gpu = */ mg,
                 /* .no_kv_offload= */ nkvo,

@@ -692,6 +711,7 @@ struct test {
     static const bool kompute;
     static const bool metal;
     static const bool sycl;
+    static const bool rpc;
     static const bool gpu_blas;
     static const bool blas;
     static const std::string cpu_info;

@@ -790,6 +810,9 @@ struct test {
         if (sycl) {
             return GGML_SYCL_NAME;
         }
+        if (rpc) {
+            return "RPC";
+        }
         if (gpu_blas) {
             return "GPU BLAS";
         }

@@ -803,7 +826,7 @@ struct test {
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
             "build_commit", "build_number",
-            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
+            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
             "cpu_info", "gpu_info",
             "model_filename", "model_type", "model_size", "model_n_params",
             "n_batch", "n_ubatch",

@@ -859,7 +882,7 @@ struct test {
         std::vector<std::string> values = {
             build_commit, std::to_string(build_number),
             std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_ubatch),

@@ -894,6 +917,7 @@ const bool test::metal = !!ggml_cpu_has_metal();
 const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
 const bool test::blas = !!ggml_cpu_has_blas();
 const bool test::sycl = !!ggml_cpu_has_sycl();
+const bool test::rpc = !!ggml_cpu_has_rpc();
 const std::string test::cpu_info = get_cpu_info();
 const std::string test::gpu_info = get_gpu_info();
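Going by the parsing code above, the new flag takes a single comma-separated list of servers; a hypothetical run could look like this (host addresses are placeholders):

```sh
# benchmark a model with two RPC backends in addition to local GPU offload
./llama-bench -m models/7B/ggml-model-q4_0.gguf -ngl 99 \
    -rpc "192.168.1.10:50052,192.168.1.11:50052"
```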


@@ -54,10 +54,10 @@ python ./examples/llava/convert-image-encoder-to-gguf \
     --projector-type ldpv2
 ```

-4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+4. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./convert.py path/to/MobileVLM-1.7B
+python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
 ```

 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`


@@ -50,10 +50,10 @@ python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
 python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```

-5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+5. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./convert.py ../llava-v1.5-7b --skip-unknown
+python ./examples/convert-legacy-llama.py ../llava-v1.5-7b --skip-unknown
 ```

 Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.

@@ -92,7 +92,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto
 6) Then convert the model to gguf format:

 ```console
-python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
+python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```

 7) And finally we can run the llava-cli using the 1.6 model version:


@@ -1,3 +1,3 @@
--r ../../requirements/requirements-convert.txt
+-r ../../requirements/requirements-convert-legacy-llama.txt
 pillow~=10.2.0
 torch~=2.1.1


@@ -1,98 +0,0 @@
#!/usr/bin/env python3
"""
This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them.

Usage:
python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]

Arguments:
- model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.
- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.

Old quant types (some base model types require these):
- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M

New quant types (recommended):
- Q2_K: smallest, extreme quality loss - not recommended
- Q3_K: alias for Q3_K_M
- Q3_K_S: very small, very high quality loss
- Q3_K_M: very small, very high quality loss
- Q3_K_L: small, substantial quality loss
- Q4_K: alias for Q4_K_M
- Q4_K_S: small, significant quality loss
- Q4_K_M: medium, balanced quality - recommended
- Q5_K: alias for Q5_K_M
- Q5_K_S: large, low quality loss - recommended
- Q5_K_M: large, very low quality loss - recommended
- Q6_K: very large, extremely low quality loss
- Q8_0: very large, extremely low quality loss - not recommended
- F16: extremely large, virtually no quality loss - not recommended
- F32: absolutely huge, lossless - not recommended
"""
import subprocess
subprocess.run(f"pip install huggingface-hub==0.16.4", shell=True, check=True)

import argparse
import os
from huggingface_hub import snapshot_download

def main(model, model_type, outname, outdir, quants, keep_fp16):
    if not os.path.isdir(model):
        print(f"Model not found at {model}. Downloading...")
        try:
            if outname is None:
                outname = model.split('/')[-1]
            model = snapshot_download(repo_id=model, cache_dir='../models/hf_cache')
        except Exception as e:
            raise Exception(f"Could not download the model: {e}")

    if outdir is None:
        outdir = f'../models/{outname}'

    if not os.path.isfile(f"{model}/config.json"):
        raise Exception(f"Could not find config.json in {model}")

    os.makedirs(outdir, exist_ok=True)

    print("Building llama.cpp")
    subprocess.run(f"cd .. && make quantize", shell=True, check=True)

    fp16 = f"{outdir}/{outname}.gguf.fp16.bin"

    print(f"Making unquantised GGUF at {fp16}")
    if not os.path.isfile(fp16):
        if model_type != "llama":
            subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True)
        else:
            subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
    else:
        print(f"Unquantised GGML already exists at: {fp16}")

    print("Making quants")
    for type in quants:
        outfile = f"{outdir}/{outname}.gguf.{type}.bin"
        print(f"Making {type} : {outfile}")
        subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)

    if not keep_fp16:
        os.remove(fp16)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
    parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name')
    parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.')
    parser.add_argument('--outname', default=None, help='Output model(s) name')
    parser.add_argument('--outdir', default=None, help='Output directory')
    parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
    parser.add_argument('--keep_fp16', action='store_true', help='Keep fp16 model', default=False)

    args = parser.parse_args()

    main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16)


@ -1870,7 +1870,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
} }
} }
#else #else
if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) { if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
// there is no broadcast and src0, src1 are contiguous across dims 2, 3 // there is no broadcast and src0, src1 are contiguous across dims 2, 3
// use cublasGemmStridedBatchedEx // use cublasGemmStridedBatchedEx
CUBLAS_CHECK( CUBLAS_CHECK(
@ -2886,7 +2886,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_CONT: case GGML_OP_CONT:
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
return true;
case GGML_OP_ROPE: case GGML_OP_ROPE:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_IM2COL: case GGML_OP_IM2COL:
case GGML_OP_POOL_2D: case GGML_OP_POOL_2D:
case GGML_OP_SUM_ROWS: case GGML_OP_SUM_ROWS:


@ -1,5 +1,6 @@
#include "concat.cuh" #include "concat.cuh"
// contiguous kernels
static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) { static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
int nidx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) { if (nidx >= ne0) {
@ -92,39 +93,104 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, int n
concat_f32_dim2<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02); concat_f32_dim2<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
} }
// non-contiguous kernel (slow)
static __global__ void concat_f32_non_cont(
const char * src0,
const char * src1,
char * dst,
int64_t ne00,
int64_t ne01,
int64_t ne02,
int64_t ne03,
uint64_t nb00,
uint64_t nb01,
uint64_t nb02,
uint64_t nb03,
int64_t /*ne10*/,
int64_t /*ne11*/,
int64_t /*ne12*/,
int64_t /*ne13*/,
uint64_t nb10,
uint64_t nb11,
uint64_t nb12,
uint64_t nb13,
int64_t ne0,
int64_t /*ne1*/,
int64_t /*ne2*/,
int64_t /*ne3*/,
uint64_t nb0,
uint64_t nb1,
uint64_t nb2,
uint64_t nb3,
int32_t dim) {
const int64_t i3 = blockIdx.z;
const int64_t i2 = blockIdx.y;
const int64_t i1 = blockIdx.x;
int64_t o[4] = {0, 0, 0, 0};
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
const float * x;
for (int i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
x = (const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
} else {
x = (const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
}
float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
*y = *x;
}
}
void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src1 = dst->src[1];
const float * src0_d = (const float *)src0->data;
const float * src1_d = (const float *)src1->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
const int32_t dim = ((int32_t *) dst->op_params)[0]; const int32_t dim = ((int32_t *) dst->op_params)[0];
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32);
if (dim != 3) { if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
for (int i3 = 0; i3 < dst->ne[3]; i3++) { const float * src0_d = (const float *)src0->data;
concat_f32_cuda( const float * src1_d = (const float *)src1->data;
src0_d + i3 * (src0->nb[3] / 4),
src1_d + i3 * (src1->nb[3] / 4), float * dst_d = (float *)dst->data;
dst_d + i3 * ( dst->nb[3] / 4),
src0->ne[0], src0->ne[1], src0->ne[2], if (dim != 3) {
dst->ne[0], dst->ne[1], dst->ne[2], dim, stream); for (int i3 = 0; i3 < dst->ne[3]; i3++) {
concat_f32_cuda(
src0_d + i3 * (src0->nb[3] / 4),
src1_d + i3 * (src1->nb[3] / 4),
dst_d + i3 * ( dst->nb[3] / 4),
src0->ne[0], src0->ne[1], src0->ne[2],
dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
}
} else {
const size_t size0 = ggml_nbytes(src0);
const size_t size1 = ggml_nbytes(src1);
CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
} }
} else { } else {
const size_t size0 = ggml_nbytes(src0); dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
const size_t size1 = ggml_nbytes(src1); concat_f32_non_cont<<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
(const char *)src0->data,
CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream)); (const char *)src1->data,
CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream)); ( char *)dst->data,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
} }
} }


@ -170,6 +170,8 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float * dst_d = (float *)dst->data; float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32);
@ -188,6 +190,8 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
float * dst_d = (float *)dst->data; float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32);
@ -202,6 +206,8 @@ void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float * dst_d = (float *)dst->data; float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32);


@ -61,7 +61,7 @@ static __global__ void rope(
template<typename T, bool has_pos, bool has_freq_facs> template<typename T, bool has_pos, bool has_freq_facs>
static __global__ void rope_neox( static __global__ void rope_neox(
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims, const float * freq_factors float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors
) { ) {
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
@ -85,15 +85,13 @@ static __global__ void rope_neox(
const int i = row*ncols + ib*n_dims + ic/2; const int i = row*ncols + ib*n_dims + ic/2;
const int i2 = row/p_delta_rows; const int i2 = row/p_delta_rows;
float cur_rot = inv_ndims * ic - ib;
const int p = has_pos ? pos[i2] : 0; const int p = has_pos ? pos[i2] : 0;
const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f; const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f)/freq_factor; const float theta_base = p*powf(theta_scale, col/2.0f)/freq_factor;
float cos_theta, sin_theta; float cos_theta, sin_theta;
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); rope_yarn(theta_base, freq_scale, corr_dims, ic, ext_factor, attn_factor, &cos_theta, &sin_theta);
const float x0 = x[i + 0]; const float x0 = x[i + 0];
const float x1 = x[i + n_dims/2]; const float x1 = x[i + n_dims/2];
@ -174,30 +172,29 @@ static void rope_neox_cuda(
const dim3 block_nums(nrows, num_blocks_x, 1); const dim3 block_nums(nrows, num_blocks_x, 1);
const float theta_scale = powf(freq_base, -2.0f/n_dims); const float theta_scale = powf(freq_base, -2.0f/n_dims);
const float inv_ndims = -1.0f / n_dims;
if (pos == nullptr) { if (pos == nullptr) {
if (freq_factors == nullptr) { if (freq_factors == nullptr) {
rope_neox<T, false, false><<<block_nums, block_dims, 0, stream>>>( rope_neox<T, false, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, inv_ndims, freq_factors theta_scale, freq_factors
); );
} else { } else {
rope_neox<T, false, true><<<block_nums, block_dims, 0, stream>>>( rope_neox<T, false, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, inv_ndims, freq_factors theta_scale, freq_factors
); );
} }
} else { } else {
if (freq_factors == nullptr) { if (freq_factors == nullptr) {
rope_neox<T, true, false><<<block_nums, block_dims, 0, stream>>>( rope_neox<T, true, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, inv_ndims, freq_factors theta_scale, freq_factors
); );
} else { } else {
rope_neox<T, true, true><<<block_nums, block_dims, 0, stream>>>( rope_neox<T, true, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, inv_ndims, freq_factors theta_scale, freq_factors
); );
} }
} }
@ -254,6 +251,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float * dst_d = (float *)dst->data; float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
GGML_ASSERT(src0->type == dst->type); GGML_ASSERT(src0->type == dst->type);


@ -1597,7 +1597,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
{ {
GGML_ASSERT(ne00 == ne10); GGML_ASSERT(ne00 == ne10);
// TODO: assert that dim2 and dim3 are contiguous
GGML_ASSERT(ne12 % ne02 == 0); GGML_ASSERT(ne12 % ne02 == 0);
GGML_ASSERT(ne13 % ne03 == 0); GGML_ASSERT(ne13 % ne03 == 0);


@ -1519,7 +1519,6 @@ static enum ggml_status ggml_metal_graph_compute(
{ {
GGML_ASSERT(ne00 == ne10); GGML_ASSERT(ne00 == ne10);
// TODO: assert that dim2 and dim3 are contiguous
GGML_ASSERT(ne12 % ne02 == 0); GGML_ASSERT(ne12 % ne02 == 0);
GGML_ASSERT(ne13 % ne03 == 0); GGML_ASSERT(ne13 % ne03 == 0);
@ -2187,6 +2186,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
{ {
GGML_ASSERT(ne00 % 4 == 0); GGML_ASSERT(ne00 % 4 == 0);
GGML_ASSERT(ggml_is_contiguous_1(src0));
float eps; float eps;
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));
@ -2214,6 +2214,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_OP_GROUP_NORM: case GGML_OP_GROUP_NORM:
{ {
GGML_ASSERT(ne00 % 4 == 0); GGML_ASSERT(ne00 % 4 == 0);
GGML_ASSERT(ggml_is_contiguous(src0));
//float eps; //float eps;
//memcpy(&eps, dst->op_params, sizeof(float)); //memcpy(&eps, dst->op_params, sizeof(float));
@ -2247,6 +2248,8 @@ static enum ggml_status ggml_metal_graph_compute(
} break; } break;
case GGML_OP_NORM: case GGML_OP_NORM:
{ {
GGML_ASSERT(ggml_is_contiguous_1(src0));
float eps; float eps;
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));


@ -1767,13 +1767,13 @@ kernel void kernel_rope(
const int64_t p = pos[i2]; const int64_t p = pos[i2];
const float theta_0 = (float)p; const float theta_base = (float)p;
const float inv_ndims = -1.f/n_dims; const float inv_ndims = -1.f/n_dims;
if (!is_neox) { if (!is_neox) {
for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) { for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
const float theta = theta_base * pow(freq_base, inv_ndims*i0);
const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
float cos_theta, sin_theta; float cos_theta, sin_theta;
rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
@ -1789,18 +1789,14 @@ kernel void kernel_rope(
} else { } else {
for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) { for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
if (ic < n_dims) { if (ic < n_dims) {
const int64_t ib = 0; const int64_t i0 = ic/2;
// simplified from `(ib * n_dims + ic) * inv_ndims` const float freq_factor = src2 != src0 ? src2[i0] : 1.0f;
const float cur_rot = inv_ndims*ic - ib;
const float freq_factor = src2 != src0 ? src2[ic/2] : 1.0f;
const float theta = theta_0 * pow(freq_base, cur_rot) / freq_factor; const float theta = theta_base * pow(freq_base, inv_ndims*ic);
float cos_theta, sin_theta; float cos_theta, sin_theta;
rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); rope_yarn(theta/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor, &cos_theta, &sin_theta);
const int64_t i0 = ib*n_dims + ic/2;
device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);


@ -6088,6 +6088,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
const uint8_t * restrict q2 = x[i].qs; const uint8_t * restrict q2 = x[i].qs;
const int8_t * restrict q8 = y[i].qs; const int8_t * restrict q8 = y[i].qs;
const __m128i mins_and_scales = __lsx_vld((const __m128i*)x[i].scales, 0); const __m128i mins_and_scales = __lsx_vld((const __m128i*)x[i].scales, 0);
const __m128i scales8 = __lsx_vand_v(mins_and_scales, m4); const __m128i scales8 = __lsx_vand_v(mins_and_scales, m4);
const __m128i mins8 = __lsx_vand_v(__lsx_vsrli_h(mins_and_scales, 4), m4); const __m128i mins8 = __lsx_vand_v(__lsx_vsrli_h(mins_and_scales, 4), m4);
@ -6807,6 +6808,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
for (int i = 0; i < nb; ++i) { for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const uint8_t * restrict q3 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
// Set up scales // Set up scales
memcpy(aux, x[i].scales, 12); memcpy(aux, x[i].scales, 12);
__m128i scales128 = lsx_set_w( __m128i scales128 = lsx_set_w(
@ -6828,29 +6831,32 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
int bit = 0; int bit = 0;
int is = 0; int is = 0;
__m256i xvbit;
const uint8_t * restrict q3 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
for (int j = 0; j < QK_K/128; ++j) { for (int j = 0; j < QK_K/128; ++j) {
// load low 2 bits // load low 2 bits
const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32; const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32;
xvbit = __lasx_xvreplgr2vr_h(bit);
// prepare low and high bits // prepare low and high bits
const __m256i q3l_0 = __lasx_xvand_v(q3bits, m3); const __m256i q3l_0 = __lasx_xvand_v(q3bits, m3);
const __m256i q3h_0 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2); const __m256i q3h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
++bit; ++bit;
xvbit = __lasx_xvreplgr2vr_h(bit);
const __m256i q3l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 2), m3); const __m256i q3l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 2), m3);
const __m256i q3h_1 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2); const __m256i q3h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
++bit; ++bit;
xvbit = __lasx_xvreplgr2vr_h(bit);
const __m256i q3l_2 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 4), m3); const __m256i q3l_2 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 4), m3);
const __m256i q3h_2 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2); const __m256i q3h_2 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
++bit; ++bit;
xvbit = __lasx_xvreplgr2vr_h(bit);
const __m256i q3l_3 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 6), m3); const __m256i q3l_3 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 6), m3);
const __m256i q3h_3 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2); const __m256i q3h_3 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
++bit; ++bit;
// load Q8 quants // load Q8 quants
@ -7399,6 +7405,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
*s = vec_extract(vsumf0, 0); *s = vec_extract(vsumf0, 0);
#elif defined __loongarch_asx #elif defined __loongarch_asx
GGML_UNUSED(kmask1);
GGML_UNUSED(kmask2);
GGML_UNUSED(kmask3);
const __m256i m4 = __lasx_xvreplgr2vr_b(0xF); const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
@ -7411,6 +7420,11 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
memcpy(utmp, x[i].scales, 12); memcpy(utmp, x[i].scales, 12);
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
const uint32_t uaux = utmp[1] & kmask1;
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
utmp[2] = uaux;
utmp[0] &= kmask1;
const uint8_t * restrict q4 = x[i].qs; const uint8_t * restrict q4 = x[i].qs;
const int8_t * restrict q8 = y[i].qs; const int8_t * restrict q8 = y[i].qs;
@ -7450,16 +7464,17 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
__m256 vd = __lasx_xvreplfr2vr_s(d); __m256 vd = __lasx_xvreplfr2vr_s(d);
acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc); acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
} }
acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee)); acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee));
__m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0); __m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0);
acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1); acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);
ft_union fi; ft_union fi;
fi.i = __lsx_vpickve2gr_w(acc_m, 0); fi.i = __lsx_vpickve2gr_w(acc_m, 0);
*s = hsum_float_8(acc) + fi.f ; *s = hsum_float_8(acc) + fi.f ;
#else #else
const uint8_t * scales = (const uint8_t*)&utmp[0]; const uint8_t * scales = (const uint8_t*)&utmp[0];
@ -7997,6 +8012,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
*s = vec_extract(vsumf0, 0); *s = vec_extract(vsumf0, 0);
#elif defined __loongarch_asx #elif defined __loongarch_asx
GGML_UNUSED(kmask1);
GGML_UNUSED(kmask2);
GGML_UNUSED(kmask3);
const __m256i m4 = __lasx_xvreplgr2vr_b(0xF); const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
const __m128i mzero = __lsx_vldi(0); const __m128i mzero = __lsx_vldi(0);
@ -8015,6 +8033,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
memcpy(utmp, x[i].scales, 12); memcpy(utmp, x[i].scales, 12);
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
const uint32_t uaux = utmp[1] & kmask1;
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
utmp[2] = uaux;
utmp[0] &= kmask1;
const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0])); const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]));
@ -8033,6 +8056,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
__m256i sumi = __lasx_xvldi(0); __m256i sumi = __lasx_xvldi(0);
int bit = 0; int bit = 0;
__m256i xvbit;
for (int j = 0; j < QK_K/64; ++j) { for (int j = 0; j < QK_K/64; ++j) {
@ -8041,13 +8065,15 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32; const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32;
xvbit = __lasx_xvreplgr2vr_h(bit++);
const __m256i q5l_0 = __lasx_xvand_v(q5bits, m4); const __m256i q5l_0 = __lasx_xvand_v(q5bits, m4);
const __m256i q5h_0 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvand_v(hbits, hmask), bit++), 4); const __m256i q5h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
const __m256i q5_0 = __lasx_xvadd_b(q5l_0, q5h_0); const __m256i q5_0 = __lasx_xvadd_b(q5l_0, q5h_0);
hmask = __lasx_xvslli_h(hmask, 1); hmask = __lasx_xvslli_h(hmask, 1);
xvbit = __lasx_xvreplgr2vr_h(bit++);
const __m256i q5l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q5bits, 4), m4); const __m256i q5l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q5bits, 4), m4);
const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvand_v(hbits, hmask), bit++), 4); const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
const __m256i q5_1 = __lasx_xvadd_b(q5l_1, q5h_1); const __m256i q5_1 = __lasx_xvadd_b(q5l_1, q5h_1);
hmask = __lasx_xvslli_h(hmask, 1); hmask = __lasx_xvslli_h(hmask, 1);
@ -8061,10 +8087,12 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
p16_1 = lasx_madd_h(scale_1, p16_1); p16_1 = lasx_madd_h(scale_1, p16_1);
sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1)); sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
} }
__m256 vd = __lasx_xvreplfr2vr_s(d); __m256 vd = __lasx_xvreplfr2vr_s(d);
acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc); acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
} }
*s = hsum_float_8(acc) + summs; *s = hsum_float_8(acc) + summs;


@ -3022,20 +3022,19 @@ static int g_work_group_size = 0;
// typedef sycl::half ggml_fp16_t; // typedef sycl::half ggml_fp16_t;
#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP #define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
#define VER_4VEC 610 //todo for hardward optimize. #define VER_4VEC 130 //todo for hardward optimize.
#define VER_GEN9 700 //todo for hardward optimize. #define VER_GEN9 700 //todo for hardward optimize.
#define VER_GEN12 1000000 //todo for hardward optimize. #define VER_GEN12 1000000 //todo for hardward optimize.
#define VER_GEN13 (VER_GEN12 + 1030) //todo for hardward optimize. #define VER_GEN13 (VER_GEN12 + 1030) //todo for hardward optimize.
#define GGML_SYCL_MAX_NODES 8192 //TODO: adapt to hardwares #define GGML_SYCL_MAX_NODES 8192 //TODO: adapt to hardwares
#if !defined(GGML_SYCL_FORCE_MMQ)
//define for XMX in Intel GPU #define SYCL_USE_XMX
//TODO: currently, it's not used for XMX really. #endif
#define SYCL_USE_XMX
// max batch size to use MMQ kernels when tensor cores are available // max batch size to use MMQ kernels when tensor cores are available
#define XMX_MAX_BATCH_SIZE 32 #define MMQ_MAX_BATCH_SIZE 32
#if defined(_MSC_VER) #if defined(_MSC_VER)
@ -13567,7 +13566,7 @@ inline void ggml_sycl_op_concat(const ggml_tensor *src0,
#pragma message("TODO: generalize concat kernel for dim != 2") #pragma message("TODO: generalize concat kernel for dim != 2")
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7563") #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7563")
int dim = dst->op_params[0]; int dim = dst->op_params[0];
GGML_ASSERT(dim != 2); GGML_ASSERT(dim == 2);
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
@ -15184,7 +15183,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
const int64_t r2 = ne12/ne02; const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03; const int64_t r3 = ne13/ne03;
if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) { if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
// there is no broadcast and src0, src1 are contiguous across dims 2, 3 // there is no broadcast and src0, src1 are contiguous across dims 2, 3
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch( SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
*g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans, *g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
@ -15249,6 +15248,29 @@ catch (sycl::exception const &exc) {
std::exit(1); std::exit(1);
} }
inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
// TODO: accuracy issues in MMQ
return false;
}
bool ggml_sycl_supports_dmmv(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_F16:
return true;
default:
return false;
}
}
static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool all_on_device = const bool all_on_device =
@ -15265,76 +15287,42 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
} }
} }
// check data types and tensor shapes for custom matrix multiplication kernels:
bool use_dequantize_mul_mat_vec = ggml_sycl_supports_dmmv(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
bool use_mul_mat_q = ggml_sycl_supports_mmq(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
// mmvq and mmq need the __dp4a instruction which is available for gen12+
// Workaround in https://github.com/ggerganov/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
#ifdef SYCL_USE_XMX #ifdef SYCL_USE_XMX
const bool use_xmx = true; use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
#else #endif // SYCL_USE_XMX
const bool use_xmx = false;
#endif
// debug helpers if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
//printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
//printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
if (!split && all_on_device && !use_xmx && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// KQ single-batch // KQ single-batch
// GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_p021\n");
ggml_sycl_mul_mat_vec_p021(src0, src1, dst); ggml_sycl_mul_mat_vec_p021(src0, src1, dst);
} else if (!split && all_on_device && !use_xmx && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch // KQV single-batch
// GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_nc\n");
ggml_sycl_mul_mat_vec_nc(src0, src1, dst); ggml_sycl_mul_mat_vec_nc(src0, src1, dst);
} else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// KQ + KQV multi-batch // KQ + KQV multi-batch
// GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n");
ggml_sycl_mul_mat_batched_sycl(src0, src1, dst); ggml_sycl_mul_mat_batched_sycl(src0, src1, dst);
} else if (src0->type == GGML_TYPE_F32) { } else if (use_dequantize_mul_mat_vec) {
// GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n"); ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false); } else if (use_mul_mat_vec_q) {
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
// GGML_SYCL_DEBUG("ggml_is_quantized or GGML_TYPE_F16\n"); } else if (use_mul_mat_q) {
if (src1->ne[1] == 1 && src0->ne[0] % GGML_SYCL_DMMV_X == 0) { ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
#ifdef GGML_SYCL_FORCE_DMMV
const bool use_mul_mat_vec_q = false;
#else
bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
use_mul_mat_vec_q = use_mul_mat_vec_q ||
(src0->type == GGML_TYPE_IQ2_XXS) || (src0->type == GGML_TYPE_IQ2_XS) || (src0->type == GGML_TYPE_IQ2_S) ||
(src0->type == GGML_TYPE_IQ3_XXS) || (src0->type == GGML_TYPE_IQ3_S) ||
(src0->type == GGML_TYPE_IQ4_NL) || (src0->type == GGML_TYPE_IQ4_XS) ||
(src0->type == GGML_TYPE_IQ1_S) || (src0->type == GGML_TYPE_IQ1_M);
#endif // GGML_SYCL_FORCE_DMMV
if (use_mul_mat_vec_q) {
// GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n");
ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
} else {
// GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_dequantize_mul_mat_vec path\n");
ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
}
} else {
bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
use_mul_mat_q = false;
}
if (use_mul_mat_q) {
// GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_q path\n");
ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
} else {
// GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_sycl path\n");
ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
}
}
} else { } else {
GGML_ASSERT(false); ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
} }
} }


@ -6012,6 +6012,8 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
}; };
GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) { GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
ggml_vk_instance_init();
#ifdef GGML_VULKAN_DEBUG #ifdef GGML_VULKAN_DEBUG
std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl; std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
#endif #endif

ggml.c (173 changed lines)

@ -60,6 +60,9 @@
typedef volatile LONG atomic_int; typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool; typedef atomic_int atomic_bool;
typedef atomic_int atomic_flag;
#define ATOMIC_FLAG_INIT 0
static void atomic_store(atomic_int * ptr, LONG val) { static void atomic_store(atomic_int * ptr, LONG val) {
InterlockedExchange(ptr, val); InterlockedExchange(ptr, val);
@ -73,6 +76,12 @@ static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
return atomic_fetch_add(ptr, -(dec)); return atomic_fetch_add(ptr, -(dec));
} }
static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
return InterlockedExchange(ptr, 1);
}
static void atomic_flag_clear(atomic_flag * ptr) {
InterlockedExchange(ptr, 0);
}
typedef HANDLE pthread_t; typedef HANDLE pthread_t;
@ -1567,11 +1576,11 @@ do { \
// F16 arithmetic is not supported by AVX, so we use F32 instead // F16 arithmetic is not supported by AVX, so we use F32 instead
#define GGML_F32Cx8 __m256 #define GGML_F32Cx8 __m256
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0) #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x)) #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) { static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
float tmp[8]; float tmp[8];
for (int i = 0; i < 8; i++) { for (int i = 0; i < 8; i++) {
@ -1580,13 +1589,14 @@ static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
return (__m256)__lasx_xvld(tmp, 0); return (__m256)__lasx_xvld(tmp, 0);
} }
static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) { static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
float arr[8]; float arr[8];
__lasx_xvst(y, arr, 0); __lasx_xvst(y, arr, 0);
for (int i = 0; i < 8; i++) for (int i = 0; i < 8; i++) {
x[i] = GGML_FP32_TO_FP16(arr[i]); x[i] = GGML_FP32_TO_FP16(arr[i]);
}
} }
#define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x) #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y) #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
@ -1662,7 +1672,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
#define GGML_F16_STEP 32 #define GGML_F16_STEP 32
#define GGML_F16_EPR 4 #define GGML_F16_EPR 4
static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) { static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
float tmp[4]; float tmp[4];
tmp[0] = GGML_FP16_TO_FP32(x[0]); tmp[0] = GGML_FP16_TO_FP32(x[0]);
@ -1673,7 +1683,7 @@ static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
return __lsx_vld(tmp, 0); return __lsx_vld(tmp, 0);
} }
static inline void __lsx_f16x4_store(ggml_fp16_t *x, __m128 y) { static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
float arr[4]; float arr[4];
__lsx_vst(y, arr, 0); __lsx_vst(y, arr, 0);
@ -2306,32 +2316,27 @@ inline static __m512 ggml_v_expf(__m512 x) {
const __m512 r = _mm512_set1_ps(0x1.8p23f); const __m512 r = _mm512_set1_ps(0x1.8p23f);
const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r); const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
const __m512 n = _mm512_sub_ps(z, r); const __m512 n = _mm512_sub_ps(z, r);
const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f), const __m512 b =
_mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x)); _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23); _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
const __m512 u = _mm512_mul_ps(b, b);
const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
_mm512_set1_ps(0x1.573e2ep-5f)), u,
_mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
_mm512_set1_ps(0x1.fffdb6p-2f))),
u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
if (_mm512_kortestz(c, c))
return _mm512_fmadd_ps(j, k, k);
const __m512i g = _mm512_and_si512(
_mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
_mm512_set1_epi32(0x82000000u));
const __m512 s1 =
_mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
const __mmask16 d = const __mmask16 d =
_mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ); _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
return _mm512_mask_blend_ps( const __m512 u = _mm512_mul_ps(b, b);
d, _mm512_mask_blend_ps( const __m512 j = _mm512_fmadd_ps(
c, _mm512_fmadd_ps(k, j, k), _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
_mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)), _mm512_set1_ps(0x1.573e2ep-5f)),
_mm512_mul_ps(s1, s1)); u,
_mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
_mm512_set1_ps(0x1.fffdb6p-2f))),
u,
_mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
const __m512 res = _mm512_scalef_ps(j, n);
if (_mm512_kortestz(d, d))
return res;
const __m512 zero = _mm512_setzero_ps();
const __m512 alt = _mm512_mask_blend_ps(
_mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
return _mm512_mask_blend_ps(d, res, alt);
} }
// computes silu x/(1+exp(-x)) in single precision vector // computes silu x/(1+exp(-x)) in single precision vector
@ -2883,24 +2888,20 @@ struct ggml_state {
// global state // global state
static struct ggml_state g_state; static struct ggml_state g_state;
static atomic_int g_state_barrier = 0; static atomic_flag g_state_critical = ATOMIC_FLAG_INIT;
// barrier via spin lock // barrier via spin lock
inline static void ggml_critical_section_start(void) { inline static void ggml_critical_section_start(void) {
int processing = atomic_fetch_add(&g_state_barrier, 1); while (atomic_flag_test_and_set(&g_state_critical)) {
// spin
while (processing > 0) { sched_yield();
// wait for other threads to finish
atomic_fetch_sub(&g_state_barrier, 1);
sched_yield(); // TODO: reconsider this
processing = atomic_fetch_add(&g_state_barrier, 1);
} }
} }
// TODO: make this somehow automatically executed // TODO: make this somehow automatically executed
// some sort of "sentry" mechanism // some sort of "sentry" mechanism
inline static void ggml_critical_section_end(void) { inline static void ggml_critical_section_end(void) {
atomic_fetch_sub(&g_state_barrier, 1); atomic_flag_clear(&g_state_critical);
} }
#if defined(__gnu_linux__) #if defined(__gnu_linux__)
@ -3216,7 +3217,11 @@ GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
} }
static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) { GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
return ggml_is_contiguous(tensor);
}
GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return return
@ -3225,6 +3230,14 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
} }
GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) { GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
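For reference, a minimal sketch (not part of the commit; the helper name below is illustrative and it relies only on the ggml_is_contiguous_2() definition shown above) of the condition that the CUDA and SYCL batched-GEMM hunks earlier in this diff now express through the helper instead of a hand-written stride check:

#include "ggml.h"

// What ggml_is_contiguous_2(t) checks, spelled out by hand: elements are
// tightly packed along dim 0 and the tensor is contiguous across dims 2 and 3,
// which is the precondition for the strided-batched GEMM paths.
static bool contiguous_2_by_hand(const struct ggml_tensor * t) {
    return t->nb[0] == ggml_type_size(t->type) &&
           t->nb[3] == t->nb[2]*t->ne[2];
}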
@ -6392,6 +6405,16 @@ struct ggml_tensor * ggml_rope_custom_inplace(
); );
} }
struct ggml_tensor * ggml_rope_xpos_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int n_dims,
float base,
bool down) {
return ggml_rope_impl(ctx, a, b, NULL, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
}
// ggml_rope_back // ggml_rope_back
struct ggml_tensor * ggml_rope_back( struct ggml_tensor * ggml_rope_back(
@ -11012,7 +11035,7 @@ static void ggml_compute_forward_concat_f32(
static void ggml_compute_forward_concat( static void ggml_compute_forward_concat(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
struct ggml_tensor* dst) { struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src0 = dst->src[0];
@ -11405,8 +11428,8 @@ static void ggml_compute_forward_gelu_f32(
const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); GGML_ASSERT(ggml_is_contiguous_1(src0));
GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_is_contiguous_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@ -11468,8 +11491,8 @@ static void ggml_compute_forward_gelu_quick_f32(
const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); GGML_ASSERT(ggml_is_contiguous_1(src0));
GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_is_contiguous_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@ -11531,8 +11554,8 @@ static void ggml_compute_forward_silu_f32(
const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); GGML_ASSERT(ggml_is_contiguous_1(src0));
GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_is_contiguous_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@ -11643,9 +11666,9 @@ static void ggml_compute_forward_silu_back_f32(
const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * grad = dst->src[1]; const struct ggml_tensor * grad = dst->src[1];
GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad)); GGML_ASSERT(ggml_is_contiguous_1(grad));
GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); GGML_ASSERT(ggml_is_contiguous_1(src0));
GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_is_contiguous_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_are_same_shape(src0, grad)); GGML_ASSERT(ggml_are_same_shape(src0, grad));
@ -14343,7 +14366,7 @@ static void ggml_compute_forward_rope_f32(
int ir = 0; int ir = 0;
const float theta_scale = powf(freq_base, -2.0f/n_dims); const float theta_scale = powf(freq_base, -2.0f/n_dims);
const float inv_ndims = -1.f/n_dims;
float corr_dims[2]; float corr_dims[2];
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
@ -14392,7 +14415,7 @@ static void ggml_compute_forward_rope_f32(
const float cos_block_theta = cosf(block_theta); const float cos_block_theta = cosf(block_theta);
const float sin_block_theta = sinf(block_theta) * sin_sign; const float sin_block_theta = sinf(block_theta) * sin_sign;
theta_base *= theta_scale; theta_base *= theta_scale;
block_theta *= theta_scale; block_theta *= theta_scale;
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
@ -14427,29 +14450,22 @@ static void ggml_compute_forward_rope_f32(
dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta; dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
} }
} else { } else {
// TODO: this might be wrong for ne0 != n_dims - need double check // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
// it seems we have to rope just the first n_dims elements and do nothing with the rest
// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
theta_base *= freq_scale;
for (int64_t ic = 0; ic < ne0; ic += 2) { for (int64_t ic = 0; ic < ne0; ic += 2) {
if (ic < n_dims) { if (ic < n_dims) {
const int64_t ib = 0; const int64_t i0 = ic/2;
// simplified from `(ib * n_dims + ic) * inv_ndims` const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
float cur_rot = inv_ndims * ic - ib;
float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
float cos_theta, sin_theta; float cos_theta, sin_theta;
rope_yarn( rope_yarn(
theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
&cos_theta, &sin_theta &cos_theta, &sin_theta
); );
sin_theta *= sin_sign;
sin_theta *= sin_sign;
theta_base *= theta_scale; theta_base *= theta_scale;
const int64_t i0 = ib*n_dims + ic/2;
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@ -14528,7 +14544,7 @@ static void ggml_compute_forward_rope_f16(
int ir = 0; int ir = 0;
const float theta_scale = powf(freq_base, -2.0f/n_dims); const float theta_scale = powf(freq_base, -2.0f/n_dims);
const float inv_ndims = -1.f/n_dims;
float corr_dims[2]; float corr_dims[2];
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
@ -14577,7 +14593,7 @@ static void ggml_compute_forward_rope_f16(
const float cos_block_theta = cosf(block_theta); const float cos_block_theta = cosf(block_theta);
const float sin_block_theta = sinf(block_theta) * sin_sign; const float sin_block_theta = sinf(block_theta) * sin_sign;
theta_base *= theta_scale; theta_base *= theta_scale;
block_theta *= theta_scale; block_theta *= theta_scale;
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
@ -14608,29 +14624,22 @@ static void ggml_compute_forward_rope_f16(
dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
} }
} else { } else {
// TODO: this might be wrong for ne0 != n_dims - need double check // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
// it seems we have to rope just the first n_dims elements and do nothing with the rest
// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
theta_base *= freq_scale;
for (int64_t ic = 0; ic < ne0; ic += 2) { for (int64_t ic = 0; ic < ne0; ic += 2) {
if (ic < n_dims) { if (ic < n_dims) {
const int64_t ib = 0; const int64_t i0 = ic/2;
// simplified from `(ib * n_dims + ic) * inv_ndims` const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
float cur_rot = inv_ndims * ic - ib;
float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
float cos_theta, sin_theta; float cos_theta, sin_theta;
rope_yarn( rope_yarn(
theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
&cos_theta, &sin_theta &cos_theta, &sin_theta
); );
sin_theta *= sin_sign;
sin_theta *= sin_sign;
theta_base *= theta_scale; theta_base *= theta_scale;
const int64_t i0 = ib*n_dims + ic/2;
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@ -22857,6 +22866,14 @@ int ggml_cpu_has_sycl(void) {
#endif #endif
} }
int ggml_cpu_has_rpc(void) {
#if defined(GGML_USE_RPC)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_gpublas(void) { int ggml_cpu_has_gpublas(void) {
return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
ggml_cpu_has_sycl(); ggml_cpu_has_sycl();

ggml.h (15 changed lines)

@ -756,7 +756,6 @@ extern "C" {
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor); GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor); GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor); GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
@ -765,6 +764,11 @@ extern "C" {
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
GGML_API GGML_CALL bool ggml_is_contiguous (const struct ggml_tensor * tensor);
GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
@ -1548,6 +1552,14 @@ extern "C" {
float beta_slow), float beta_slow),
"use ggml_rope_ext_inplace instead"); "use ggml_rope_ext_inplace instead");
struct ggml_tensor * ggml_rope_xpos_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int n_dims,
float base,
bool down);
// compute correction dims for YaRN RoPE scaling // compute correction dims for YaRN RoPE scaling
GGML_CALL void ggml_rope_yarn_corr_dims( GGML_CALL void ggml_rope_yarn_corr_dims(
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
@ -2420,6 +2432,7 @@ extern "C" {
GGML_API int ggml_cpu_has_sse3 (void); GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_ssse3 (void); GGML_API int ggml_cpu_has_ssse3 (void);
GGML_API int ggml_cpu_has_sycl (void); GGML_API int ggml_cpu_has_sycl (void);
GGML_API int ggml_cpu_has_rpc (void);
GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_vsx (void);
GGML_API int ggml_cpu_has_matmul_int8(void); GGML_API int ggml_cpu_has_matmul_int8(void);


@ -2670,14 +2670,12 @@ void main() {
const uint i = row*p.ncols + ib*p.ndims + ic/2; const uint i = row*p.ncols + ib*p.ndims + ic/2;
const uint i2 = row/p.p_delta_rows; const uint i2 = row/p.p_delta_rows;
const float cur_rot = p.inv_ndims * ic - ib;
const int pos = data_b[i2]; const int pos = data_b[i2];
const float freq_factor = p.has_freq_facs != 0 ? data_freq_factors[ic/2] : 1.0f; const float freq_factor = p.has_freq_facs != 0 ? data_freq_factors[ic/2] : 1.0f;
const float theta_base = pos*p.freq_scale*pow(p.theta_scale, col/2.0f) / freq_factor; const float theta_base = pos*p.freq_scale*pow(p.theta_scale, col/2.0f) / freq_factor;
float cos_theta, sin_theta; float cos_theta, sin_theta;
rope_yarn(theta_base, uint(cur_rot), cos_theta, sin_theta); rope_yarn(theta_base, ic, cos_theta, sin_theta);
const float x0 = float(data_a[i + 0]); const float x0 = float(data_a[i + 0]);
const float x1 = float(data_a[i + p.ndims/2]); const float x1 = float(data_a[i + p.ndims/2]);


@ -1,10 +1,15 @@
from __future__ import annotations from __future__ import annotations
import re
import logging import logging
import json import json
import os import os
from pathlib import Path from pathlib import Path
from typing import Any, Callable, Sequence, Mapping, Iterable from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVar, runtime_checkable
from sentencepiece import SentencePieceProcessor
import gguf
from .gguf_writer import GGUFWriter from .gguf_writer import GGUFWriter
@ -163,3 +168,298 @@ class SpecialVocab:
for typ in self.special_token_types: for typ in self.special_token_types:
self._set_special_token(typ, config.get(f'{typ}_token_id')) self._set_special_token(typ, config.get(f'{typ}_token_id'))
return True return True
@runtime_checkable
class BaseVocab(Protocol):
tokenizer_model: ClassVar[str]
name: ClassVar[str]
@runtime_checkable
class Vocab(BaseVocab, Protocol):
vocab_size: int
added_tokens_dict: dict[str, int]
added_tokens_list: list[str]
fname_tokenizer: Path
def __init__(self, base_path: Path): ...
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
class NoVocab(BaseVocab):
tokenizer_model = "no_vocab"
name = "no_vocab"
def __repr__(self) -> str:
return "<NoVocab for a model without integrated vocabulary>"
class BpeVocab(Vocab):
tokenizer_model = "gpt2"
name = "bpe"
def __init__(self, base_path: Path):
added_tokens: dict[str, int] = {}
if (fname_tokenizer := base_path / 'vocab.json').exists():
# "slow" tokenizer
with open(fname_tokenizer, encoding="utf-8") as f:
self.vocab = json.load(f)
try:
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
with open(base_path / 'added_tokens.json', encoding="utf-8") as f:
added_tokens = json.load(f)
except FileNotFoundError:
pass
else:
# "fast" tokenizer
fname_tokenizer = base_path / 'tokenizer.json'
# if this fails, FileNotFoundError propagates to caller
with open(fname_tokenizer, encoding="utf-8") as f:
tokenizer_json = json.load(f)
tokenizer_model: dict[str, Any] = tokenizer_json['model']
if (
tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'ByteLevel'
):
raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
self.vocab = tokenizer_model["vocab"]
if (added := tokenizer_json.get('added_tokens')) is not None:
# Added tokens here can be duplicates of the main vocabulary.
added_tokens = {item['content']: item['id']
for item in added
if item['content'] not in self.vocab}
vocab_size = len(self.vocab)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
expected_end_id = vocab_size + len(actual_ids) - 1
raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
f"{vocab_size} - {expected_end_id}; got {actual_ids}")
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_dict = added_tokens
self.added_tokens_list = [text for (text, idx) in items]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
for i, _ in enumerate(self.vocab):
yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
score = -1000.0
yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.bpe_tokens()
yield from self.added_tokens()
def __repr__(self) -> str:
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class SentencePieceVocab(Vocab):
tokenizer_model = "llama"
name = "spm"
def __init__(self, base_path: Path):
added_tokens: dict[str, int] = {}
if (fname_tokenizer := base_path / 'tokenizer.model').exists():
# normal location
try:
with open(base_path / 'added_tokens.json', encoding="utf-8") as f:
added_tokens = json.load(f)
except FileNotFoundError:
pass
elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
# not found in alternate location either
raise FileNotFoundError('Cannot find tokenizer.model')
self.sentencepiece_tokenizer = SentencePieceProcessor()
self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
vocab_size = self.sentencepiece_tokenizer.vocab_size()
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
actual_new_ids = sorted(new_tokens.keys())
if expected_new_ids != actual_new_ids:
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
# Token pieces that were added to the base vocabulary.
self.added_tokens_dict = added_tokens
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.sentencepiece_tokenizer
for i in range(tokenizer.vocab_size()):
piece = tokenizer.IdToPiece(i)
text = piece.encode("utf-8")
score: float = tokenizer.GetScore(i)
toktype = gguf.TokenType.NORMAL
if tokenizer.IsUnknown(i):
toktype = gguf.TokenType.UNKNOWN
if tokenizer.IsControl(i):
toktype = gguf.TokenType.CONTROL
# NOTE: I think added_tokens are user defined.
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
# if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
if tokenizer.IsUnused(i):
toktype = gguf.TokenType.UNUSED
if tokenizer.IsByte(i):
toktype = gguf.TokenType.BYTE
yield text, score, toktype
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
score = -1000.0
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.sentencepiece_tokens()
yield from self.added_tokens()
def __repr__(self) -> str:
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class LlamaHfVocab(Vocab):
tokenizer_model = "llama"
name = "hfft"
def __init__(self, base_path: Path):
fname_tokenizer = base_path / 'tokenizer.json'
# if this fails, FileNotFoundError propagates to caller
with open(fname_tokenizer, encoding='utf-8') as f:
tokenizer_json = json.load(f)
# pre-check so we know if we need transformers
tokenizer_model: dict[str, Any] = tokenizer_json['model']
is_llama3 = (
tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
and not tokenizer_model.get('byte_fallback', True)
)
if is_llama3:
raise TypeError('Llama 3 must be converted with BpeVocab')
if not is_llama3 and (
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'Sequence'
):
raise FileNotFoundError('Cannot find Llama BPE tokenizer')
try:
from transformers import AutoTokenizer
except ImportError as e:
raise ImportError(
"To use LlamaHfVocab, please install the `transformers` package. "
"You can install it with `pip install transformers`."
) from e
# Allow the tokenizer to default to slow or fast versions.
# Explicitly set tokenizer to use local paths.
self.tokenizer = AutoTokenizer.from_pretrained(
base_path,
cache_dir=base_path,
local_files_only=True,
)
assert self.tokenizer.is_fast # assume tokenizer.json is used
# Initialize lists and dictionaries for added tokens
self.added_tokens_list = []
self.added_tokens_dict = dict()
self.added_tokens_ids = set()
# Process added tokens
for tok, tokidx in sorted(
self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
):
# Only consider added tokens that are not in the base vocabulary
if tokidx >= self.tokenizer.vocab_size:
self.added_tokens_list.append(tok)
self.added_tokens_dict[tok] = tokidx
self.added_tokens_ids.add(tokidx)
# Store special tokens and their IDs
self.specials = {
tok: self.tokenizer.get_vocab()[tok]
for tok in self.tokenizer.all_special_tokens
}
self.special_ids = set(self.tokenizer.all_special_ids)
# Set vocabulary sizes
self.vocab_size_base = self.tokenizer.vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {
id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
}
for token_id in range(self.vocab_size_base):
# Skip processing added tokens here
if token_id in self.added_tokens_ids:
continue
# Convert token text to bytes
token_text = reverse_vocab[token_id].encode("utf-8")
# Yield token text, score, and type
yield token_text, self.get_token_score(token_id), self.get_token_type(
token_id, token_text, self.special_ids # Reuse already stored special IDs
)
def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
# Special case for byte tokens
if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
return gguf.TokenType.BYTE
# Determine token type based on whether it's a special token
return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
def get_token_score(self, token_id: int) -> float:
# Placeholder for actual logic to determine the token's score
# This needs to be implemented based on specific requirements
return -1000.0 # Default score
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
if text in self.specials:
toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
score = self.get_token_score(self.specials[text])
else:
toktype = gguf.TokenType.USER_DEFINED
score = -1000.0
yield text.encode("utf-8"), score, toktype
def has_newline_token(self):
return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.hf_tokens()
yield from self.added_tokens()
def __repr__(self) -> str:
return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
@ -144,6 +144,7 @@ def main() -> None:
parser.add_argument("--general-description", type=str, help="The models general.description", metavar='"Description ..."') parser.add_argument("--general-description", type=str, help="The models general.description", metavar='"Description ..."')
parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)", metavar='"{% ... %} ..."') parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)", metavar='"{% ... %} ..."')
parser.add_argument("--chat-template-config", type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json') parser.add_argument("--chat-template-config", type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json')
parser.add_argument("--pre-tokenizer", type=str, help="The models tokenizer.ggml.pre", metavar='"pre tokenizer"')
parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model", metavar='general.url') parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model", metavar='general.url')
parser.add_argument("--special-token", action="append", type=str, help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '"<token>"')) parser.add_argument("--special-token", action="append", type=str, help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '"<token>"'))
parser.add_argument("--special-token-by-id", action="append", type=str, help="Special token by id", nargs=2, metavar=(' | '.join(token_names.keys()), '0')) parser.add_argument("--special-token-by-id", action="append", type=str, help="Special token by id", nargs=2, metavar=(' | '.join(token_names.keys()), '0'))
@ -172,6 +173,9 @@ def main() -> None:
if template: if template:
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template) new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)
if args.pre_tokenizer:
new_metadata[gguf.Keys.Tokenizer.PRE] = MetadataDetails(gguf.GGUFValueType.STRING, args.pre_tokenizer)
if remove_metadata: if remove_metadata:
logger.warning('*** Warning *** Warning *** Warning **') logger.warning('*** Warning *** Warning *** Warning **')
logger.warning('* Most metadata is required for a fully functional GGUF file,') logger.warning('* Most metadata is required for a fully functional GGUF file,')
llama.cpp
@ -1703,12 +1703,13 @@ struct llama_mlock {
}; };
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>; using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { // NOTE: avoid ever using this except for building the token_to_piece caches
static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
std::vector<char> result(8, 0); std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special); const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
if (n_tokens < 0) { if (n_tokens < 0) {
result.resize(-n_tokens); result.resize(-n_tokens);
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special); int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
GGML_ASSERT(check == -n_tokens); GGML_ASSERT(check == -n_tokens);
} }
else { else {
@ -2029,8 +2030,9 @@ struct llama_layer {
struct ggml_tensor * ffn_up_shexp; struct ggml_tensor * ffn_up_shexp;
// ff bias // ff bias
struct ggml_tensor * ffn_down_b; // b2 struct ggml_tensor * ffn_gate_b = nullptr;
struct ggml_tensor * ffn_up_b; // b3 struct ggml_tensor * ffn_down_b = nullptr; // b2
struct ggml_tensor * ffn_up_b = nullptr; // b3
struct ggml_tensor * ffn_act; struct ggml_tensor * ffn_act;
// mamba proj // mamba proj
@ -2162,7 +2164,9 @@ struct llama_vocab {
std::unordered_map<token, id> token_to_id; std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token; std::vector<token_data> id_to_token;
std::unordered_map<token, id> special_tokens_cache; std::vector<id> cache_special_tokens;
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = false);
std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
std::map<std::pair<std::string, std::string>, int> bpe_ranks; std::map<std::pair<std::string, std::string>, int> bpe_ranks;
@ -4059,7 +4063,9 @@ static void llm_load_hparams(
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 22: model.type = e_model::MODEL_1B; break; case 22: model.type = e_model::MODEL_1B; break;
case 26: model.type = e_model::MODEL_3B; break; case 26: model.type = e_model::MODEL_3B; break;
case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break; // granite uses a vocab with len 49152
case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
case 36: model.type = e_model::MODEL_8B; break; // granite
case 40: model.type = e_model::MODEL_13B; break; case 40: model.type = e_model::MODEL_13B; break;
case 48: model.type = e_model::MODEL_34B; break; case 48: model.type = e_model::MODEL_34B; break;
case 60: model.type = e_model::MODEL_30B; break; case 60: model.type = e_model::MODEL_30B; break;
@ -4329,6 +4335,8 @@ static void llm_load_hparams(
case 30: model.type = e_model::MODEL_3B; break; case 30: model.type = e_model::MODEL_3B; break;
case 32: model.type = e_model::MODEL_7B; break; case 32: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_15B; break; case 40: model.type = e_model::MODEL_15B; break;
case 52: model.type = e_model::MODEL_20B; break; // granite
case 88: model.type = e_model::MODEL_34B; break; // granite
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
@ -4588,15 +4596,14 @@ static void llm_load_vocab(
vocab.special_cls_id = 101; vocab.special_cls_id = 101;
vocab.special_mask_id = 103; vocab.special_mask_id = 103;
vocab.add_space_prefix = false; vocab.add_space_prefix = false;
} else { } else if (tokenizer_model == "gpt2") {
if (tokenizer_model == "gpt2") { vocab.type = LLAMA_VOCAB_TYPE_BPE;
vocab.type = LLAMA_VOCAB_TYPE_BPE;
} else { const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str()); if (add_space_prefix_keyidx != -1) {
LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__); vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
vocab.type = LLAMA_VOCAB_TYPE_SPM;
return;
} }
// read bpe merges and populate bpe ranks // read bpe merges and populate bpe ranks
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str()); const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
if (merges_keyidx == -1) { if (merges_keyidx == -1) {
@ -4630,6 +4637,8 @@ static void llm_load_vocab(
vocab.special_pad_id = -1; vocab.special_pad_id = -1;
vocab.special_cls_id = -1; vocab.special_cls_id = -1;
vocab.special_mask_id = -1; vocab.special_mask_id = -1;
} else {
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
} }
// for now, only BPE models have pre-tokenizers // for now, only BPE models have pre-tokenizers
@ -4823,97 +4832,40 @@ static void llm_load_vocab(
// build special tokens cache // build special tokens cache
{ {
// TODO: It is unclear (to me) at this point, whether special tokes are guaranteed to be of a deterministic type, for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
// and will always be correctly labeled in 'added_tokens.json' etc.
// The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
// to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
// are special tokens.
// From testing, this appears to correlate 1:1 with special tokens.
//
// Counting special tokens and verifying in only one direction
// is sufficient to detect difference in those two sets.
//
uint32_t special_tokens_count_by_type = 0;
uint32_t special_tokens_count_from_verification = 0;
bool special_tokens_definition_mismatch = false;
for (const auto & t : vocab.token_to_id) {
const auto & token = t.first;
const auto & id = t.second;
// Count all non-normal tokens in the vocab while iterating
if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) { if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
special_tokens_count_by_type++; vocab.cache_special_tokens.push_back(id);
}
// Skip single character tokens
if (token.length() > 1) {
bool is_tokenizable = false;
// Split token string representation in two, in all possible ways
// and check if both halves can be matched to a valid token
for (unsigned i = 1; i < token.length();) {
const auto left = token.substr(0, i);
const auto right = token.substr(i);
// check if we didnt partition in the middle of a utf sequence
auto utf = utf8_len(left.at(left.length() - 1));
if (utf == 1) {
if (vocab.token_to_id.find(left) != vocab.token_to_id.end() &&
vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
is_tokenizable = true;
break;
}
i++;
} else {
// skip over the rest of multibyte utf sequence
i += utf - 1;
}
}
if (!is_tokenizable) {
// Some tokens are multibyte, but they are utf sequences with equivalent text length of 1
// it's faster to re-filter them here, since there are way less candidates now
// Calculate a total "utf" length of a token string representation
size_t utf8_str_len = 0;
for (unsigned i = 0; i < token.length();) {
utf8_str_len++;
i += utf8_len(token.at(i));
}
// And skip the ones which are one character
if (utf8_str_len > 1) {
// At this point what we have left are special tokens only
vocab.special_tokens_cache[token] = id;
// Count manually found special tokens
special_tokens_count_from_verification++;
// If this manually found special token is not marked as such, flag a mismatch
if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) {
special_tokens_definition_mismatch = true;
}
}
}
} }
} }
if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) { std::sort( vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n", [&] (const llama_vocab::id a, const llama_vocab::id b) {
__func__, return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
special_tokens_count_from_verification, vocab.id_to_token.size(), }
special_tokens_count_by_type, vocab.id_to_token.size() );
);
} else { LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n", }
__func__,
special_tokens_count_from_verification, vocab.id_to_token.size() // build token to piece caches
); {
size_t size_cache = 0;
std::vector<llama_vocab::token> cache_token_to_piece (n_vocab);
std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
for (uint32_t id = 0; id < n_vocab; ++id) {
cache_token_to_piece[id] = llama_token_to_piece(&model, id, false);
cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
size_cache += cache_token_to_piece[id].size();
size_cache += cache_token_to_piece_special[id].size();
} }
std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
} }
} }
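One note on the rebuilt special-token handling above: cache_special_tokens is sorted by piece length, longest first, presumably so that when tokenizer_st_partition() later walks the cache in order, a long special token is matched and split out before any shorter token that is merely a substring of it. A toy, standalone illustration of that ordering effect (hypothetical token strings, standard C++ only):

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        // toy special tokens; "<|im" is a deliberately pathological prefix of "<|im_start|>"
        std::vector<std::string> specials = { "<s>", "<|im", "<|im_start|>" };
        std::sort(specials.begin(), specials.end(),
                  [](const std::string & a, const std::string & b) { return a.size() > b.size(); });

        const std::string text = "<|im_start|>hello";
        for (const auto & st : specials) {                 // longest-first, like the cache
            const auto pos = text.find(st);
            if (pos != std::string::npos) {
                std::cout << "matched '" << st << "' at " << pos << "\n"; // '<|im_start|>', not '<|im'
                break;
            }
        }
        return 0;
    }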
@ -5213,6 +5165,11 @@ static bool llm_load_tensors(
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
// optional MLP bias
layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
} else { } else {
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
@ -5541,8 +5498,8 @@ static bool llm_load_tensors(
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
@ -7488,9 +7445,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, model.layers[il].ffn_up_b,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -11261,46 +11218,69 @@ struct llm_build_context {
} }
// split into {n_head * n_embd_head_qk_nope, n_tokens} // split into {n_head * n_embd_head_qk_nope, n_tokens}
struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, ggml_element_size(q) * hparams.n_embd_head_k, ggml_element_size(q) * hparams.n_embd_head_k * n_head, 0); struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k),
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
0);
cb(q_nope, "q_nope", il); cb(q_nope, "q_nope", il);
// and {n_head * n_embd_head_qk_rope, n_tokens} // and {n_head * n_embd_head_qk_rope, n_tokens}
struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_element_size(q) * hparams.n_embd_head_k, ggml_element_size(q) * hparams.n_embd_head_k * n_head, ggml_element_size(q) * n_embd_head_qk_nope); struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k),
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
ggml_row_size(q->type, n_embd_head_qk_nope));
cb(q_pe, "q_pe", il); cb(q_pe, "q_pe", il);
// {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
struct ggml_tensor * compressed_kv_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
cb(compressed_kv_pe, "compressed_kv_pe", il); cb(kv_pe_compresseed, "kv_pe_compresseed", il);
// split into {kv_lora_rank, n_tokens} // split into {kv_lora_rank, n_tokens}
struct ggml_tensor * compressed_kv = ggml_view_2d(ctx0, compressed_kv_pe, kv_lora_rank, n_tokens, compressed_kv_pe->nb[1], 0); struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
cb(compressed_kv, "compressed_kv", il); kv_pe_compresseed->nb[1],
0);
cb(kv_compressed, "kv_compressed", il);
// and {n_embd_head_qk_rope, n_tokens} // and {n_embd_head_qk_rope, n_tokens}
struct ggml_tensor * k_pe = ggml_view_2d(ctx0, compressed_kv_pe, n_embd_head_qk_rope, n_tokens, compressed_kv_pe->nb[1], ggml_element_size(compressed_kv_pe)*kv_lora_rank); struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
kv_pe_compresseed->nb[1],
kv_pe_compresseed->nb[1],
ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
cb(k_pe, "k_pe", il); cb(k_pe, "k_pe", il);
compressed_kv = llm_build_norm(ctx0, compressed_kv, hparams, kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
model.layers[il].attn_kv_a_norm, NULL, model.layers[il].attn_kv_a_norm, NULL,
LLM_NORM_RMS, cb, il); LLM_NORM_RMS, cb, il);
cb(compressed_kv, "compressed_kv", il); cb(kv_compressed, "kv_compressed", il);
// {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, compressed_kv); struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
cb(kv, "kv", il); cb(kv, "kv", il);
// split into {n_head * n_embd_head_qk_nope, n_tokens} // split into {n_head * n_embd_head_qk_nope, n_tokens}
struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, ggml_element_size(kv) * (n_embd_head_qk_nope + hparams.n_embd_head_v), ggml_element_size(kv) * n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v), 0); struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
0);
cb(k_nope, "k_nope", il); cb(k_nope, "k_nope", il);
// and {n_head * n_embd_head_v, n_tokens} // and {n_head * n_embd_head_v, n_tokens}
struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, ggml_element_size(kv) * (n_embd_head_qk_nope + hparams.n_embd_head_v), ggml_element_size(kv) * n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v), ggml_element_size(kv) * n_embd_head_qk_nope); struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
cb(v_states, "v_states", il); cb(v_states, "v_states", il);
v_states = ggml_cont(ctx0, v_states); v_states = ggml_cont(ctx0, v_states);
cb(v_states, "v_states", il); cb(v_states, "v_states", il);
v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, ggml_element_size(kv) * hparams.n_embd_head_v * n_head, 0); v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
0);
cb(v_states, "v_states", il); cb(v_states, "v_states", il);
q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
q_pe = ggml_rope_ext( q_pe = ggml_rope_ext(
ctx0, q_pe, inp_pos, nullptr, ctx0, q_pe, inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@ -11309,8 +11289,9 @@ struct llm_build_context {
cb(q_pe, "q_pe", il); cb(q_pe, "q_pe", il);
// shared RoPE key // shared RoPE key
k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
k_pe = ggml_rope_ext( k_pe = ggml_rope_ext(
ctx0, ggml_view_3d(ctx0, k_pe, n_embd_head_qk_rope, 1, n_tokens, k_pe->nb[0], k_pe->nb[1], 0), inp_pos, nullptr, ctx0, k_pe, inp_pos, nullptr,
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor_scaled, beta_fast, beta_slow ext_factor, attn_factor_scaled, beta_fast, beta_slow
); );
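The views above also switch their strides from ggml_element_size(t)*count to ggml_row_size(t->type, count). The difference only shows up for block-quantized types: ggml_row_size() accounts for block packing, whereas multiplying ggml_element_size() by an element count is only correct for non-quantized tensors. A minimal sketch (standard ggml C API, no model needed) of the two quantities:

    #include "ggml.h"
    #include <cstdio>

    int main() {
        const int64_t n = 128; // toy row length, a multiple of the Q4_0 block size

        // F32: simply n * 4 bytes
        std::printf("f32  row of %lld: %zu bytes\n", (long long) n, ggml_row_size(GGML_TYPE_F32, n));

        // Q4_0: elements are stored in blocks, so bytes per row = (n / block_size) * bytes_per_block
        std::printf("q4_0 row of %lld: %zu bytes\n", (long long) n, ggml_row_size(GGML_TYPE_Q4_0, n));
        std::printf("q4_0 block: %d elements in %zu bytes\n",
                    (int) ggml_blck_size(GGML_TYPE_Q4_0), ggml_type_size(GGML_TYPE_Q4_0));
        return 0;
    }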
@ -13142,7 +13123,7 @@ struct llm_tokenizer_wpm {
llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {} llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) { void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
auto * token_map = &vocab.token_to_id; const auto & token_map = vocab.token_to_id;
// normalize and split by whitespace // normalize and split by whitespace
std::vector<std::string> words = preprocess(text); std::vector<std::string> words = preprocess(text);
@ -13157,108 +13138,89 @@ struct llm_tokenizer_wpm {
} }
// prepend phantom space // prepend phantom space
std::string word1 = "\xe2\x96\x81" + word; const std::string word1 = "\xe2\x96\x81" + word;
int n = word1.size(); const int n = word1.size();
const size_t current_tokens = output.size();
// we're at the start of a new word // we're at the start of a new word
int i = 0;
bool match_any = false;
// move through character position in word // move through character position in word
while (i < n) { for (int i = 0; i < n; ++i) {
// loop through possible match length // loop through possible match length
bool match = false; bool match = false;
for (int j = n; j > i; j--) { for (int j = n; j > i; j--) {
auto it = token_map->find(word1.substr(i, j - i)); auto it = token_map.find(word1.substr(i, j - i));
if (it != token_map->end()) { if (it != token_map.end()) {
output.push_back(it->second); output.push_back(it->second);
match = true; match = true;
match_any = true; i = j - 1;
i = j;
break; break;
} }
} }
// must be an unknown character if (!match) { // discard all
if (!match) { output.resize(current_tokens);
i++; break; // and discard next tokens
} }
} }
// we didn't find any matches for this word // we didn't find any matches for this word
if (!match_any) { if (current_tokens == output.size()) {
output.push_back(vocab.special_unk_id); output.push_back(vocab.special_unk_id);
} }
} }
} }
std::vector<std::string> preprocess(const std::string & text) { std::vector<std::string> preprocess(const std::string & text) {
std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text)); const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
std::vector<std::string> words(1, "");
// strip accents, strip control, uniformize whitespace, for (const char32_t cpt : cpts_nfd) {
// to lowercase, pad chinese characters, pad punctuation const auto flags = unicode_cpt_flags(cpt);
std::string new_str = "";
for (uint32_t code : cpts_nfd) { if (flags.is_whitespace) {
const codepoint_flags flags = unicode_cpt_flags(code); if (words.back().size()) { // finish previous word if any
if (flags.is_accent_mark || flags.is_control) { words.emplace_back();
}
continue; continue;
} }
code = unicode_tolower(code);
if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ? assert (!flags.is_separator);
code = ' '; if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
continue;
} }
std::string s = unicode_cpt_to_utf8(code);
if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) { const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
new_str += " "; if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
new_str += s; if (words.back().size()) { // finish previous word if any
new_str += " "; words.emplace_back();
}
words.back() = s; // single char word
words.emplace_back(); // start a new word
} else { } else {
new_str += s; words.back() += s; // append char to word
} }
} }
// split by whitespace if (!words.back().size()) {
uint64_t l = 0; words.pop_back();
uint64_t r = 0;
std::vector<std::string> words;
while (r < new_str.size()) {
// if is whitespace
if (isspace(new_str[r], std::locale::classic())) {
if (r > l) words.push_back(new_str.substr(l, (r - l)));
l = r + 1;
r = l;
} else {
r += 1;
}
}
if (r > l) {
words.push_back(new_str.substr(l, (r - l)));
} }
return words; return words;
} }
bool is_ascii_punct(uint32_t code) { static bool is_chinese_char(uint32_t cpt) {
if (code > 0xFF) { return
return false; (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
} (cpt >= 0x03400 && cpt <= 0x04DBF) ||
auto c = char(static_cast<unsigned char>(code));
return ispunct(c, std::locale::classic());
}
bool is_chinese_char(uint32_t cpt) {
if ((cpt >= 0x4E00 && cpt <= 0x9FFF) ||
(cpt >= 0x3400 && cpt <= 0x4DBF) ||
(cpt >= 0x20000 && cpt <= 0x2A6DF) || (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
(cpt >= 0x2A700 && cpt <= 0x2B73F) || (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
(cpt >= 0x2B740 && cpt <= 0x2B81F) || (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
(cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920 (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
(cpt >= 0xF900 && cpt <= 0xFAFF) || (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
(cpt >= 0x2F800 && cpt <= 0x2FA1F) || (cpt >= 0x2F800 && cpt <= 0x2FA1F);
(cpt >= 0x3000 && cpt <= 0x303F) || //(cpt >= 0x3000 && cpt <= 0x303F) ||
(cpt >= 0xFF00 && cpt <= 0xFFEF)) { //(cpt >= 0xFF00 && cpt <= 0xFFEF);
return true; // NOLINT
}
return false;
} }
const llama_vocab & vocab; const llama_vocab & vocab;
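For readers skimming the rewritten WPM tokenizer: the loop above is a greedy longest-match over the prefixed word, and a word that cannot be matched completely is now discarded as a whole and replaced by a single UNK token. A toy sketch of that behaviour (hypothetical mini-vocabulary, standard C++ only, not the real llama_vocab):

    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Greedy longest-match WordPiece over one word; returns empty on failure (caller emits UNK).
    static std::vector<int> wpm_match(const std::string & word,
                                      const std::unordered_map<std::string, int> & vocab) {
        const std::string w = "\xe2\x96\x81" + word; // phantom-space prefix, as in the code above
        std::vector<int> out;
        const int n = (int) w.size();
        for (int i = 0; i < n; ++i) {
            bool match = false;
            for (int j = n; j > i; --j) {
                auto it = vocab.find(w.substr(i, j - i));
                if (it != vocab.end()) { out.push_back(it->second); match = true; i = j - 1; break; }
            }
            if (!match) { return {}; } // whole word is discarded -> caller pushes a single UNK
        }
        return out;
    }

    int main() {
        const std::unordered_map<std::string, int> vocab = { {"\xe2\x96\x81un", 1}, {"happy", 2} }; // toy
        std::cout << wpm_match("unhappy", vocab).size() << "\n"; // 2 pieces
        std::cout << wpm_match("unknown", vocab).size() << "\n"; // 0 -> UNK
        return 0;
    }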
@ -13302,9 +13264,8 @@ struct fragment_buffer_variant {
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) { static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
// for each special token // for each special token
for (const auto & st: vocab.special_tokens_cache) { for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
const auto & special_token = st.first; const auto & special_token = vocab.id_to_token[special_id].text;
const auto & special_id = st.second;
// for each text fragment // for each text fragment
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin(); std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@ -13313,7 +13274,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
// if a fragment is text ( not yet processed ) // if a fragment is text ( not yet processed )
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto * raw_text = &(fragment.raw_text); auto & raw_text = fragment.raw_text;
auto raw_text_base_offset = fragment.offset; auto raw_text_base_offset = fragment.offset;
auto raw_text_base_length = fragment.length; auto raw_text_base_length = fragment.length;
@ -13323,7 +13284,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
// find the first occurrence of a given special token in this fragment // find the first occurrence of a given special token in this fragment
// passing offset argument only limit the "search area" but match coordinates // passing offset argument only limit the "search area" but match coordinates
// are still relative to the source full raw_text // are still relative to the source full raw_text
auto match = raw_text->find(special_token, raw_text_base_offset); auto match = raw_text.find(special_token, raw_text_base_offset);
// no occurrences found, stop processing this fragment for a given special token // no occurrences found, stop processing this fragment for a given special token
if (match == std::string::npos) break; if (match == std::string::npos) break;
@ -13342,7 +13303,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
// left // left
const int64_t left_reminder_offset = raw_text_base_offset + 0; const int64_t left_reminder_offset = raw_text_base_offset + 0;
const int64_t left_reminder_length = match - raw_text_base_offset; const int64_t left_reminder_length = match - raw_text_base_offset;
buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length); buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
#ifdef PRETOKENIZERDEBUG #ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str()); LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
@ -13358,7 +13319,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) { if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
const int64_t right_reminder_offset = match + special_token.length(); const int64_t right_reminder_offset = match + special_token.length();
const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length()); const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length); buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
#ifdef PRETOKENIZERDEBUG #ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str()); LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
@ -14462,7 +14423,7 @@ void llama_sample_repetition_penalties(
void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) { void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
GGML_ASSERT(ctx); GGML_ASSERT(ctx);
const int64_t t_start_sample_us = ggml_time_us(); int64_t t_start_sample_us = ggml_time_us();
bool allow_eog = false; bool allow_eog = false;
for (const auto & stack : grammar->stacks) { for (const auto & stack : grammar->stacks) {
@ -14474,12 +14435,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded; std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
candidates_decoded.reserve(candidates->size); candidates_decoded.reserve(candidates->size);
std::vector<llama_grammar_candidate> candidates_grammar;
std::vector<llama_grammar_candidate> candidates_grammar;
candidates_grammar.reserve(candidates->size); candidates_grammar.reserve(candidates->size);
for (size_t i = 0; i < candidates->size; ++i) { for (size_t i = 0; i < candidates->size; ++i) {
const llama_token id = candidates->data[i].id; const llama_token id = candidates->data[i].id;
const std::string piece = llama_token_to_piece(ctx, id, false); const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(id);
if (llama_token_is_eog(&ctx->model, id)) { if (llama_token_is_eog(&ctx->model, id)) {
if (!allow_eog) { if (!allow_eog) {
@ -14679,7 +14641,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
GGML_ASSERT(false); GGML_ASSERT(false);
} }
const std::string piece = llama_token_to_piece(ctx, token, false); const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(token);
// Note terminating 0 in decoded string // Note terminating 0 in decoded string
const auto decoded = decode_utf8(piece, grammar->partial_utf8); const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@ -18362,69 +18324,83 @@ static std::string llama_decode_text(const std::string & text) {
// does not write null-terminator to buf // does not write null-terminator to buf
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) { int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
// if we have a cache - use it
{
const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
if (!cache.empty()) {
const auto & res = cache.at(token);
if (length < (int) res.size()) {
return -(int) res.size();
}
memcpy(buf, res.c_str(), res.size());
return res.size();
}
}
if (0 <= token && token < llama_n_vocab(model)) { if (0 <= token && token < llama_n_vocab(model)) {
switch (llama_vocab_get_type(model->vocab)) { switch (llama_vocab_get_type(model->vocab)) {
case LLAMA_VOCAB_TYPE_WPM: case LLAMA_VOCAB_TYPE_WPM:
case LLAMA_VOCAB_TYPE_SPM: { case LLAMA_VOCAB_TYPE_SPM: {
// NOTE: we accept all unsupported token types, // NOTE: we accept all unsupported token types,
// suppressing them like CONTROL tokens. // suppressing them like CONTROL tokens.
if (llama_is_normal_token(model->vocab, token)) { if (llama_is_normal_token(model->vocab, token)) {
std::string result = model->vocab.id_to_token[token].text; std::string result = model->vocab.id_to_token[token].text;
llama_unescape_whitespace(result); llama_unescape_whitespace(result);
if (length < (int) result.length()) { if (length < (int) result.length()) {
return -(int) result.length(); return -(int) result.length();
}
memcpy(buf, result.c_str(), result.length());
return result.length();
} else if (
(llama_is_user_defined_token(model->vocab, token)) ||
(llama_is_control_token (model->vocab, token) && special)) {
std::string result = model->vocab.id_to_token[token].text;
if (length < (int) result.length()) {
return -(int) result.length();
}
memcpy(buf, result.c_str(), result.length());
return result.length();
} else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
if (length < 3) {
return -3;
}
memcpy(buf, "\xe2\x96\x85", 3);
return 3;
} else if (llama_is_byte_token(model->vocab, token)) {
if (length < 1) {
return -1;
}
buf[0] = llama_token_to_byte(model->vocab, token);
return 1;
} }
memcpy(buf, result.c_str(), result.length()); break;
return result.length();
} else if (
(llama_is_user_defined_token(model->vocab, token)) ||
(llama_is_control_token (model->vocab, token) && special)) {
std::string result = model->vocab.id_to_token[token].text;
if (length < (int) result.length()) {
return -(int) result.length();
}
memcpy(buf, result.c_str(), result.length());
return result.length();
} else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
if (length < 3) {
return -3;
}
memcpy(buf, "\xe2\x96\x85", 3);
return 3;
} else if (llama_is_byte_token(model->vocab, token)) {
if (length < 1) {
return -1;
}
buf[0] = llama_token_to_byte(model->vocab, token);
return 1;
} }
break; case LLAMA_VOCAB_TYPE_BPE: {
} // NOTE: we accept all unsupported token types,
case LLAMA_VOCAB_TYPE_BPE: { // suppressing them like CONTROL tokens.
// NOTE: we accept all unsupported token types, if (llama_is_normal_token(model->vocab, token)) {
// suppressing them like CONTROL tokens. std::string result = model->vocab.id_to_token[token].text;
if (llama_is_normal_token(model->vocab, token)) { result = llama_decode_text(result);
std::string result = model->vocab.id_to_token[token].text; if (length < (int) result.length()) {
result = llama_decode_text(result); return -(int) result.length();
if (length < (int) result.length()) { }
return -(int) result.length(); memcpy(buf, result.c_str(), result.length());
return result.length();
} else if (
(llama_is_user_defined_token(model->vocab, token)) ||
(llama_is_control_token (model->vocab, token) && special)) {
std::string result = model->vocab.id_to_token[token].text;
if (length < (int) result.length()) {
return -(int) result.length();
}
memcpy(buf, result.c_str(), result.length());
return result.length();
} }
memcpy(buf, result.c_str(), result.length()); break;
return result.length();
} else if (
(llama_is_user_defined_token(model->vocab, token)) ||
(llama_is_control_token (model->vocab, token) && special)) {
std::string result = model->vocab.id_to_token[token].text;
if (length < (int) result.length()) {
return -(int) result.length();
}
memcpy(buf, result.c_str(), result.length());
return result.length();
} }
break; default:
} GGML_ASSERT(false);
default:
GGML_ASSERT(false);
} }
} }
return 0; return 0;
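The token-to-piece cache added at the top of llama_token_to_piece() above keeps the existing public contract: when the caller's buffer is too small, the function still returns the negative of the required size. A minimal calling sketch against the public API (assumes an already loaded llama_model pointer; error handling elided):

    #include "llama.h"
    #include <algorithm>
    #include <string>
    #include <vector>

    // Grow-and-retry wrapper around llama_token_to_piece(), mirroring the internal helper above.
    static std::string token_to_piece(const llama_model * model, llama_token token, bool special) {
        std::vector<char> buf(8, '\0');
        int32_t n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), special);
        if (n < 0) {
            buf.resize((size_t) -n); // first call reported the required size
            n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), special);
        }
        return std::string(buf.data(), (size_t) std::max<int32_t>(n, 0));
    }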
@ -424,8 +424,8 @@ extern "C" {
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model); LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
@ -4,7 +4,7 @@
# Package versions must stay compatible across all top-level python scripts. # Package versions must stay compatible across all top-level python scripts.
# #
-r ./requirements/requirements-convert.txt -r ./requirements/requirements-convert-legacy-llama.txt
-r ./requirements/requirements-convert-hf-to-gguf.txt -r ./requirements/requirements-convert-hf-to-gguf.txt
-r ./requirements/requirements-convert-hf-to-gguf-update.txt -r ./requirements/requirements-convert-hf-to-gguf-update.txt
@ -1,2 +1,2 @@
-r ./requirements-convert.txt -r ./requirements-convert-legacy-llama.txt
torch~=2.1.1 torch~=2.1.1
@ -1,2 +1,2 @@
-r ./requirements-convert.txt -r ./requirements-convert-legacy-llama.txt
torch~=2.1.1 torch~=2.1.1
@ -1 +1 @@
-r ./requirements-convert.txt -r ./requirements-convert-legacy-llama.txt
@ -166,7 +166,7 @@ if (( do_cleanup )); then
rm -rf -- "$all_venv" rm -rf -- "$all_venv"
fi fi
check_convert_script convert.py check_convert_script examples/convert-legacy-llama.py
for py in convert-*.py; do for py in convert-*.py; do
# skip convert-hf-to-gguf-update.py # skip convert-hf-to-gguf-update.py
# TODO: the check is failing for some reason: # TODO: the check is failing for some reason:
@ -3,20 +3,20 @@
set -e set -e
# LLaMA v1 # LLaMA v1
python3 convert.py ../llama1/7B --outfile models/llama-7b/ggml-model-f16.gguf --outtype f16 python3 examples/convert-legacy-llama.py ../llama1/7B --outfile models/llama-7b/ggml-model-f16.gguf --outtype f16
python3 convert.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16 python3 examples/convert-legacy-llama.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16
python3 convert.py ../llama1/30B --outfile models/llama-30b/ggml-model-f16.gguf --outtype f16 python3 examples/convert-legacy-llama.py ../llama1/30B --outfile models/llama-30b/ggml-model-f16.gguf --outtype f16
python3 convert.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16 python3 examples/convert-legacy-llama.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16
# LLaMA v2 # LLaMA v2
python3 convert.py ../llama2/llama-2-7b --outfile models/llama-7b-v2/ggml-model-f16.gguf --outtype f16 python3 examples/convert-legacy-llama.py ../llama2/llama-2-7b --outfile models/llama-7b-v2/ggml-model-f16.gguf --outtype f16
python3 convert.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16 python3 examples/convert-legacy-llama.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16
python3 convert.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16 python3 examples/convert-legacy-llama.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16
# Code Llama # Code Llama
python3 convert.py ../codellama/CodeLlama-7b/ --outfile models/codellama-7b/ggml-model-f16.gguf --outtype f16 python3 examples/convert-legacy-llama.py ../codellama/CodeLlama-7b/ --outfile models/codellama-7b/ggml-model-f16.gguf --outtype f16
python3 convert.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16 python3 examples/convert-legacy-llama.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16
python3 convert.py ../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16 python3 examples/convert-legacy-llama.py ../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16
# Falcon # Falcon
python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-7b 1 python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-7b 1
@ -75,7 +75,7 @@ if [ "$1" -eq "1" ]; then
cd /workspace/llama.cpp cd /workspace/llama.cpp
python3 convert.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16 python3 examples/convert-legacy-llama.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16
./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0 ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0
./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k
@ -90,7 +90,7 @@ if [ "$1" -eq "2" ]; then
cd /workspace/llama.cpp cd /workspace/llama.cpp
-python3 convert.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-legacy-llama.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16
 ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0
 ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k
@@ -105,7 +105,7 @@ if [ "$1" -eq "3" ]; then
 cd /workspace/llama.cpp
-python3 convert.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-legacy-llama.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16
 ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0
 ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k
@@ -120,7 +120,7 @@ if [ "$1" -eq "4" ]; then
 cd /workspace/llama.cpp
-python3 convert.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-legacy-llama.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16
 ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0
 ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k
@@ -135,7 +135,7 @@ if [ "$1" -eq "5" ]; then
 cd /workspace/llama.cpp
-python3 convert.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-legacy-llama.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16
 ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0
 ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k
@@ -150,7 +150,7 @@ if [ "$1" -eq "6" ]; then
 cd /workspace/llama.cpp
-python3 convert.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-legacy-llama.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16
 ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0
 ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k
@@ -165,7 +165,7 @@ if [ "$1" -eq "7" ]; then
 cd /workspace/llama.cpp
-python3 convert.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-legacy-llama.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16
 ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0
 ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k
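These hunks only swap the old convert.py entry point for the relocated examples/convert-legacy-llama.py; the surrounding convert-then-quantize flow is unchanged. For reference, the same two-step flow can be driven from Python as below — a minimal sketch, assuming a llama.cpp checkout with the quantize binary already built, and using an illustrative model directory rather than any path mandated by the scripts above.

import subprocess

model_dir = "./models/codellama-7b"            # illustrative path
f16_path = f"{model_dir}/ggml-model-f16.gguf"

# step 1: convert the checkpoint to an f16 GGUF with the relocated legacy converter
subprocess.run(
    ["python3", "examples/convert-legacy-llama.py", model_dir,
     "--outfile", f16_path, "--outtype", "f16"],
    check=True,
)

# step 2: quantize the f16 GGUF into the desired formats
for qtype in ("q4_0", "q4_k"):
    subprocess.run(
        ["./quantize", f16_path, f"{model_dir}/ggml-model-{qtype}.gguf", qtype],
        check=True,
    )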
View file
@@ -106,8 +106,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
 # src/ggml-kompute.h -> ggml-kompute.h
 # src/ggml-metal.h -> ggml-metal.h
 # src/ggml-metal.m -> ggml-metal.m
-# src/ggml-mpi.h -> ggml-mpi.h
-# src/ggml-mpi.c -> ggml-mpi.c
 # src/ggml-opencl.cpp -> ggml-opencl.cpp
 # src/ggml-opencl.h -> ggml-opencl.h
 # src/ggml-quants.c -> ggml-quants.c
@@ -145,8 +143,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
 -e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \
 -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
 -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
--e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
--e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \
 -e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \
 -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
 -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
View file
@@ -1 +1 @@
-126d34985705a5a2222723c145cb4e125ac689f3
+2aae01fd9b8f9399f343cf18f46f38996ef52e2c
View file
@@ -14,8 +14,6 @@ cp -rpv ../ggml/src/ggml-kompute.h ./ggml-kompute.h
 cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h
 cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m
 cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
-cp -rpv ../ggml/src/ggml-mpi.h ./ggml-mpi.h
-cp -rpv ../ggml/src/ggml-mpi.c ./ggml-mpi.c
 cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp
 cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
 cp -rpv ../ggml/src/ggml-quants.c ./ggml-quants.c
View file
@@ -129,8 +129,11 @@ llama_target_and_test(test-rope.cpp)
 llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_target_and_test(test-autorelease.cpp LABEL "model")

-llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
-target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
+# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
+if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+    llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
+    target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
+endif()

 # dummy executable - not installed
 get_filename_component(TEST_TARGET test-c.c NAME_WE)
View file
@@ -1138,26 +1138,37 @@ struct test_soft_max : public test_case {
 // GGML_OP_ROPE
 struct test_rope : public test_case {
     const ggml_type type;
-    const std::array<int64_t, 4> ne;
+    const std::array<int64_t, 4> ne_a;
     int n_dims;
     int mode;
     int n_ctx;
+    float fs; // freq_scale
+    float ef; // ext_factor
+    float af; // attn_factor
     bool ff;
+    int v; // view (1 : non-contiguous a)

     std::string vars() override {
-        return VARS_TO_STR6(type, ne, n_dims, mode, n_ctx, ff);
+        return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, v);
     }

     test_rope(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1},
-            int n_dims = 10, int mode = 0, int n_ctx = 512, bool ff = false)
-        : type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx), ff(ff) {}
+            std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
+            int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f, float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0)
+        : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v) {}

     ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
+        ggml_tensor * a;
+        if (v & 1) {
+            auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
+            a = ggml_new_tensor(ctx, type, 4, ne.data());
+            a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
+        } else {
+            a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+        }
+        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
         ggml_tensor * freq = ff ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2) : nullptr;
-        ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
+        ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, n_ctx, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
         return out;
     }
@@ -1165,11 +1176,11 @@ struct test_rope : public test_case {
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
             if (t->type == GGML_TYPE_I32) {
                 // pos
-                std::vector<int> data(ne[2]);
-                for (int i = 0; i < ne[2]; i++) {
+                std::vector<int> data(ne_a[2]);
+                for (int i = 0; i < ne_a[2]; i++) {
                     data[i] = rand() % n_ctx;
                 }
-                ggml_backend_tensor_set(t, data.data(), 0, ne[2] * sizeof(int));
+                ggml_backend_tensor_set(t, data.data(), 0, ne_a[2] * sizeof(int));
             } else {
                 if (t->ne[0] == n_dims/2) {
                     // frequency factors in the range [0.9f, 1.1f]
@@ -1262,22 +1273,37 @@ struct test_concat : public test_case {
     const std::array<int64_t, 4> ne_a;
     const int64_t ne_b_d;
     const int dim;
+    const int v; // view (1 << 0: non-cont a, 1 << 1: non-cont b)

     std::string vars() override {
-        return VARS_TO_STR4(type, ne_a, ne_b_d, dim);
+        return VARS_TO_STR5(type, ne_a, ne_b_d, dim, v);
     }

     test_concat(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
             int64_t ne_b_d = 10,
-            int dim = 2)
-        : type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim) {}
+            int dim = 2, int v = 0)
+        : type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim), v(v) {}

     ggml_tensor * build_graph(ggml_context * ctx) override {
         auto ne_b = ne_a;
         ne_b[dim] = ne_b_d;
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
-        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
+        ggml_tensor * a;
+        if (v & 1) {
+            auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
+            a = ggml_new_tensor(ctx, type, 4, ne.data());
+            a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
+        } else {
+            a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+        }
+        ggml_tensor * b;
+        if (v & 2) {
+            auto ne = ne_b; ne[0] *= 3; ne[1] *= 2; ne[2] *= 4;
+            b = ggml_new_tensor(ctx, type, 4, ne.data());
+            b = ggml_view_4d(ctx, b, ne_b[0], ne_b[1], ne_b[2], ne_b[3], b->nb[1], b->nb[2], b->nb[3], 0);
+        } else {
+            b = ggml_new_tensor(ctx, type, 4, ne_b.data());
+        }
         ggml_tensor * out = ggml_concat(ctx, a, b, dim);
         return out;
     }
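The new v flag in test_rope and test_concat exercises non-contiguous operands: the tensor is allocated with inflated dimensions and a view with the original shape is then cut out of it, so its elements are no longer densely packed. A rough NumPy analogue of that setup, for illustration only (axis ordering differs between ggml and NumPy; only the contiguity idea carries over):

import numpy as np

ne_a = (10, 10, 10, 1)  # logical shape used by the tests

# allocate a larger buffer, mirroring ne[0] *= 2; ne[1] *= 4; ne[2] *= 3 in the hunk
big = np.zeros((ne_a[0] * 2, ne_a[1] * 4, ne_a[2] * 3, ne_a[3]), dtype=np.float32)

# take a view with the original shape; its strides still follow the big buffer,
# so the data it covers is not contiguous in memory
view = big[:ne_a[0], :ne_a[1], :ne_a[2], :ne_a[3]]

print(view.shape)                  # (10, 10, 10, 1)
print(view.flags["C_CONTIGUOUS"])  # False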
@@ -2198,26 +2224,46 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));

-    for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
-        // TODO: ff not supported yet for !neox
-        test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, false)); // llama 7B
-        test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, false)); // llama 13B
-        test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, false)); // llama 30B
-        test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, false)); // llama 65B
-        for (bool ff : {false, true}) { // freq_factors
-            test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512, ff)); // neox (falcon 7B)
-            test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512, ff)); // neox (falcon 7B)
-            test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512, ff)); // neox (falcon 40B)
-            test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512, ff)); // neox (falcon 40B)
-            test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512, ff)); // neox (stablelm)
-            test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512, ff)); // neox (phi-2)
+    {
+        bool all = true;
+
+        for (float v : { 0, 1 }) {
+            for (float fs : { 1.0f, 1.4245f }) {
+                for (float ef : { 0.0f, 0.7465f }) {
+                    for (float af : { 1.0f, 1.4245f }) {
+                        for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+                            // TODO: ff not supported yet for !neox
+                            test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 7B
+                            if (all) {
+                                test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 13B
+                                test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 30B
+                                test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 65B
+                            }
+
+                            for (bool ff : {false, true}) { // freq_factors
+                                if (all) {
+                                    test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
+                                    test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
+                                    test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
+                                    test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm)
+                                    test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
+                                }
+
+                                test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
+                            }
+                        }
+
+                        all = false;
+                    }
+                }
+            }
         }
     }

-    for (int dim : { 0, 1, 2, 3, }) {
-        test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim));
-        test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim));
+    for (int v : { 0, 1, 2, 3 }) {
+        for (int dim : { 0, 1, 2, 3, }) {
+            test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v));
+            test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim, v));
+        }
     }

     for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
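The rewritten block widens RoPE coverage from a single type loop to a sweep over view layout (v), freq_scale (fs), ext_factor (ef) and attn_factor (af), while the all flag restricts the full set of model shapes to the first parameter combination so the case count stays manageable. A small Python sketch of that enumeration pattern — shape lists abbreviated, parameter values taken from the hunk above:

from itertools import product

cases = []
all_shapes = True  # mirrors the `all` flag: full shape coverage only for the first combination

for v, fs, ef, af in product((0, 1), (1.0, 1.4245), (0.0, 0.7465), (1.0, 1.4245)):
    for dtype in ("f32", "f16"):
        # the llama 7B shape runs for every (v, fs, ef, af) combination ...
        cases.append(("rope", dtype, (128, 32, 10, 1), fs, ef, af, v))
        # ... the remaining model shapes only for the first one
        if all_shapes:
            for shape in ((128, 40, 10, 1), (128, 52, 10, 1), (128, 64, 10, 1)):
                cases.append(("rope", dtype, shape, fs, ef, af, v))
    all_shapes = False

print(len(cases))  # far fewer cases than a full cross-product over every shape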
View file
@@ -167,8 +167,10 @@ def generator_random_special_tokens(tokenizer, iterations=100) -> Iterator[str]:
     for m in range(iterations):
         rand.seed(m)
         words = rand.choices(special_tokens, k=500)
-        if tokenizer.add_bos_token:  # skip spam warning of double BOS
-            while words and words[0] == tokenizer.bos_token:
+        if words[0] == tokenizer.bos_token:  # skip spam warning of double BOS
+            while len(words) > 1 and words[1] == tokenizer.bos_token:  # leave one starting BOS
                 words.pop(0)
+            if tokenizer.add_bos_token:  # drop all starting BOS
+                words.pop(0)
         yield "".join(words)
@@ -293,15 +295,17 @@ def main(argv: list[str] = None):
     model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
     tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)

-    tokenizer.add_bos_token = getattr(tokenizer, "add_bos_token", True)
-    tokenizer.add_eos_token = getattr(tokenizer, "add_eos_token", False)

     def func_tokenize1(text: str):
         return model.tokenize(text, add_special=True, parse_special=True)

     def func_tokenize2(text: str):
         return tokenizer.encode(text, add_special_tokens=True)

+    ids = func_tokenize2("a")
+    assert 1 <= len(ids) <= 3
+    add_bos_token = len(ids) > 1 and tokenizer.bos_token_id == ids[0]
+    tokenizer.add_bos_token = getattr(tokenizer, "add_bos_token", add_bos_token)

     vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))

     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
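Instead of hard-coding add_bos_token = True, the hunk probes the tokenizer: encode a single ordinary character and check whether the first id is BOS. In isolation the check looks like this (a standalone sketch with hypothetical names, mirroring the probe above):

def detect_add_bos(encode, bos_token_id) -> bool:
    """Heuristically detect whether a tokenizer prepends BOS when encoding."""
    ids = encode("a")              # a single ordinary character
    assert 1 <= len(ids) <= 3      # "a" alone, or with BOS and/or EOS attached
    return len(ids) > 1 and ids[0] == bos_token_id

# example with a toy encoder that prepends BOS id 1
print(detect_add_bos(lambda s: [1, 42], bos_token_id=1))  # True
print(detect_add_bos(lambda s: [42], bos_token_id=1))     # False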
@@ -324,8 +328,10 @@ if __name__ == "__main__":
     # import os
     # tokenizers = os.listdir(path_tokenizers)
     tokenizers = [
-        "llama-spm",  # SPM
-        "phi-3",  # SPM
+        # "llama-spm",  # SPM
+        # "phi-3",  # SPM
+        "jina-v2-en",  # WPM
+        "bert-bge",  # WPM
     ]

     for tokenizer in tokenizers: