Merge remote-tracking branch 'refs/remotes/origin/master' into hp/server/avoid-infinite-loop

Pierrick HYMBERT 2024-04-19 13:19:56 +02:00
commit 558f69083a
65 changed files with 9425 additions and 6837 deletions

View file

@@ -91,6 +91,12 @@ jobs:
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Downcase github.repository_owner
run: |
echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
env:
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
- name: Build and push Docker image (versioned)
if: github.event_name == 'push'
uses: docker/build-push-action@v4
@@ -98,7 +104,7 @@ jobs:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
- name: Build and push Docker image (tagged)
@@ -107,5 +113,5 @@ jobs:
context: .
push: ${{ github.event_name == 'push' }}
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
file: ${{ matrix.config.dockerfile }}

View file

@@ -43,6 +43,18 @@ else()
set(LLAMA_METAL_DEFAULT OFF)
endif()
# TODO: fix this for Android CI
# https://github.com/ggerganov/llama.cpp/pull/6716#issuecomment-2061509191
#if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
# set(LLAMA_LLAMAFILE_DEFAULT OFF)
#else()
# set(LLAMA_LLAMAFILE_DEFAULT ON)
#endif()
# TODO: temporary disable until MoE is fixed
# https://github.com/ggerganov/llama.cpp/pull/6716
set(LLAMA_LLAMAFILE_DEFAULT OFF)
# general
option(BUILD_SHARED_LIBS "build shared libraries" OFF)
option(LLAMA_STATIC "llama: static link libraries" OFF)
@@ -88,6 +100,7 @@ endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUDA "llama: use CUDA" OFF)
option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
@@ -286,6 +299,7 @@ if (LLAMA_METAL)
${METALKIT_FRAMEWORK}
)
endif()
if (LLAMA_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
@@ -368,6 +382,13 @@ if (LLAMA_BLAS)
endif()
endif()
if (LLAMA_LLAMAFILE)
add_compile_definitions(GGML_USE_LLAMAFILE)
set(GGML_HEADERS_LLAMAFILE sgemm.h)
set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
endif()
if (LLAMA_QKK_64)
add_compile_definitions(GGML_QKK_64)
endif()
@@ -1160,6 +1181,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
)
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})

View file

@@ -384,6 +384,15 @@ ifdef LLAMA_OPENBLAS
MK_LDFLAGS += $(shell pkg-config --libs openblas)
endif # LLAMA_OPENBLAS
# TODO: temporary disable until MoE is fixed
# https://github.com/ggerganov/llama.cpp/pull/6716
LLAMA_NO_LLAMAFILE := 1
ifndef LLAMA_NO_LLAMAFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
OBJS += sgemm.o
endif
ifdef LLAMA_BLIS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
MK_LDFLAGS += -lblis -L/usr/local/lib
@@ -480,11 +489,9 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(NVCC_COMPILE)
endif # LLAMA_CUDA
ifdef LLAMA_CLBLAST
MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
@@ -603,6 +610,11 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI
ifndef LLAMA_NO_LLAMAFILE
sgemm.o: sgemm.cpp sgemm.h ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif
GF_CC := $(CC)
include scripts/get-flags.mk
@@ -688,7 +700,7 @@ llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml
$(CXX) $(CXXFLAGS) -c $< -o $@
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
common.o: common/common.cpp $(COMMON_H_DEPS)
$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -756,7 +768,7 @@ batched: examples/batched/batched.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -788,7 +800,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

View file

@@ -2,6 +2,45 @@
import PackageDescription
var sources = [
"ggml.c",
"sgemm.cpp",
"llama.cpp",
"unicode.cpp",
"unicode-data.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
]
var resources: [Resource] = []
var linkerSettings: [LinkerSetting] = []
var cSettings: [CSetting] = [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.unsafeFlags(["-fno-objc-arc"]),
// NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64")
]
#if canImport(Darwin)
sources.append("ggml-metal.m")
resources.append(.process("ggml-metal.metal"))
linkerSettings.append(.linkedFramework("Accelerate"))
cSettings.append(
contentsOf: [
.define("GGML_USE_ACCELERATE"),
.define("GGML_USE_METAL")
]
)
#endif
#if os(Linux)
cSettings.append(.define("_GNU_SOURCE"))
#endif
let package = Package(
name: "llama",
platforms: [
@@ -28,34 +67,11 @@ let package = Package(
"ggml-cuda.h",
"Makefile"
],
sources: [
"ggml.c",
"llama.cpp",
"unicode.cpp",
"unicode-data.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
"ggml-metal.m",
],
resources: [
.process("ggml-metal.metal")
],
sources: sources,
resources: resources,
publicHeadersPath: "spm-headers",
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.define("GGML_USE_ACCELERATE"),
.unsafeFlags(["-fno-objc-arc"]),
.define("GGML_USE_METAL"),
// NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64")
],
linkerSettings: [
.linkedFramework("Accelerate")
]
cSettings: cSettings,
linkerSettings: linkerSettings
)
],
cxxLanguageStandard: .cxx11

View file

@@ -68,7 +68,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
| Intel GPU | Status | Verified Model |
|-------------------------------|---------|---------------------------------------|
| Intel Data Center Max Series | Support | Max 1550 |
| Intel Data Center Max Series | Support | Max 1550, 1100 |
| Intel Data Center Flex Series | Support | Flex 170 |
| Intel Arc Series | Support | Arc 770, 730M |
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
@@ -84,8 +84,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
- **Execution Unit (EU)**
- If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use.
### Nvidia GPU
### Other Vendor GPU
The BLAS acceleration on Nvidia GPU through oneAPI can be obtained using the Nvidia plugins for oneAPI and the cuBLAS backend of the upstream oneMKL library. Details and instructions on how to setup the runtime and library can be found in [this section](#i-setup-environment)
**Verified devices**
@@ -94,14 +93,9 @@ The BLAS acceleration on Nvi
| Ampere Series | Support | A100, A4000 |
| Ampere Series *(Mobile)* | Support | RTX 40 Series |
*Notes:*
- Support for Nvidia targets through oneAPI is currently limited to Linux platforms.
- Please make sure the native oneAPI MKL *(dedicated to intel CPUs and GPUs)* is not "visible" at this stage to properly setup and use the built-from-source oneMKL with cuBLAS backend in llama.cpp for Nvidia GPUs.
## Docker
The docker build option is currently limited to *intel GPU* targets.
### Build image
```sh
# Using FP16
@@ -168,29 +162,10 @@ Platform #0: Intel(R) OpenCL HD Graphics
- **Nvidia GPU**
In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
Installation can be verified by running the following:
```sh
nvidia-smi
```
Please make sure at least one CUDA device is available, which can be displayed like this *(here an A100-40GB Nvidia GPU)*:
```
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03 Driver Version: 535.54.03 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA A100-PCIE-40GB On | 00000000:8D:00.0 Off | 0 |
| N/A 36C P0 57W / 250W | 4MiB / 40960MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
```
2. **Install Intel® oneAPI Base toolkit**
- **Base installation**
- **For Intel GPU**
The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
@@ -202,10 +177,10 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
- **Adding support to Nvidia GPUs**
**oneAPI**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
**oneMKL**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
```sh
git clone https://github.com/oneapi-src/oneMKL
@@ -237,7 +212,7 @@ When targeting an intel GPU, the user should expect one or more level-zero devic
- **Nvidia GPU**
Similarly, user targetting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
@@ -255,11 +230,13 @@ source /opt/intel/oneapi/setvars.sh
mkdir -p build && cd build
# Option 1: Use FP16 for better performance in long-prompt inference
cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Or without "--build", run "make" next
# Option 2: Use FP32 by default
cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build all binary
cmake --build . --config Release -j -v
```
#### Nvidia GPU
@@ -274,10 +251,14 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
mkdir -p build && cd build
# Option 1: Use FP16 for better performance in long-prompt inference
cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Option 2: Use FP32 by default
cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build all binary
cmake --build . --config Release -j -v
```
### III. Run the inference
@@ -357,7 +338,6 @@ Otherwise, you can run the script:
*Notes:*
- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `/bin/main` if faced with the issue.
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
```sh
@@ -438,7 +418,7 @@ cd build
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
make
make -j
```
Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
@@ -525,7 +505,6 @@ Otherwise, run the following wrapper script:
Note:
- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `main.exe` if faced with the issue.
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
```sh
@@ -557,12 +536,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512
## Known Issues
- Hanging during startup
llama.cpp uses *mmap* as the default mode for reading the model file and copying it to the GPU. In some systems, `memcpy` might behave abnormally and therefore hang.
- **Solution**: add `--no-mmap` or `--mmap 0` flag to the `main` executable.
- `Split-mode:[row]` is not supported.
## Q&A
@@ -574,7 +547,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
- General compiler error:
- Remove build folder or try a clean-build.
- Remove **build** folder or try a clean-build.
- I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux.

View file

@@ -94,6 +94,7 @@ Typically finetunes of the base models below are supported as well.
- [x] LLaMA 2 🦙🦙
- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
- [X] Falcon
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
@@ -121,6 +122,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo)
(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
@@ -188,6 +190,8 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [MindMac](https://mindmac.app) (proprietary)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
---

View file

@@ -112,6 +112,7 @@ pub fn build(b: *std.build.Builder) !void {
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
const ggml = make.obj("ggml", "ggml.c");
const sgemm = make.obj("sgemm", "sgemm.cpp");
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
@@ -128,14 +129,14 @@ pub fn build(b: *std.build.Builder) !void {
const clip = make.obj("clip", "examples/llava/clip.cpp");
const llava = make.obj("llava", "examples/llava/llava.cpp");
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}

View file

@@ -153,6 +153,52 @@ function gg_sum_ctest_release {
gg_printf '```\n'
}
# test_scripts_debug
function gg_run_test_scripts_debug {
cd ${SRC}
set -e
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e
}
function gg_sum_test_scripts_debug {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs test scripts in debug mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
gg_printf '```\n'
gg_printf '\n'
}
# test_scripts_release
function gg_run_test_scripts_release {
cd ${SRC}
set -e
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e
}
function gg_sum_test_scripts_release {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs test scripts in release mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
gg_printf '```\n'
gg_printf '\n'
}
function gg_get_model {
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
@@ -642,6 +688,9 @@ test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run embd_bge_small
test $ret -eq 0 && gg_run test_scripts_debug
test $ret -eq 0 && gg_run test_scripts_release
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ]; then
test $ret -eq 0 && gg_run open_llama_3b_v2

View file

@@ -47,9 +47,6 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
set(TARGET json-schema-to-grammar)
add_library(${TARGET} OBJECT json-schema-to-grammar.cpp json-schema-to-grammar.h)
set(TARGET common)
add_library(${TARGET} STATIC
@@ -63,6 +60,7 @@ add_library(${TARGET} STATIC
grammar-parser.h
grammar-parser.cpp
json.hpp
json-schema-to-grammar.cpp
train.h
train.cpp
ngram-cache.h

View file

@@ -1,4 +1,6 @@
#include "common.h"
#include "json.hpp"
#include "json-schema-to-grammar.h"
#include "llama.h"
#include <algorithm>
@@ -68,6 +70,8 @@
#define LLAMA_CURL_MAX_HEADER_LENGTH 256
#endif // LLAMA_USE_CURL
using json = nlohmann::ordered_json;
int32_t get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
@@ -104,6 +108,79 @@ int32_t get_num_physical_cores() {
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
#if defined(__x86_64__) && defined(__linux__)
#include <pthread.h>
static void cpuid(unsigned leaf, unsigned subleaf,
unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
__asm__("movq\t%%rbx,%%rsi\n\t"
"cpuid\n\t"
"xchgq\t%%rbx,%%rsi"
: "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
: "0"(leaf), "2"(subleaf));
}
static int pin_cpu(int cpu) {
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(cpu, &mask);
return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
}
static bool is_hybrid_cpu(void) {
unsigned eax, ebx, ecx, edx;
cpuid(7, 0, &eax, &ebx, &ecx, &edx);
return !!(edx & (1u << 15));
}
static bool is_running_on_efficiency_core(void) {
unsigned eax, ebx, ecx, edx;
cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
int intel_atom = 0x20;
int core_type = (eax & 0xff000000u) >> 24;
return core_type == intel_atom;
}
static int count_math_cpus(int cpu_count) {
int result = 0;
for (int cpu = 0; cpu < cpu_count; ++cpu) {
if (pin_cpu(cpu)) {
return -1;
}
if (is_running_on_efficiency_core()) {
continue; // efficiency cores harm lockstep threading
}
++cpu; // hyperthreading isn't useful for linear algebra
++result;
}
return result;
}
#endif // __x86_64__ && __linux__
/**
* Returns number of CPUs on system that are useful for math.
*/
int get_math_cpu_count() {
#if defined(__x86_64__) && defined(__linux__)
int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
if (cpu_count < 1) {
return get_num_physical_cores();
}
if (is_hybrid_cpu()) {
cpu_set_t affinity;
if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
int result = count_math_cpus(cpu_count);
pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
if (result > 0) {
return result;
}
}
}
#endif
return get_num_physical_cores();
}
void process_escapes(std::string & input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;
@@ -1148,6 +1225,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
);
return true;
}
if (arg == "-j" || arg == "--json-schema") {
if (++i >= argc) {
invalid_param = true;
return true;
}
sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
return true;
}
if (arg == "--override-kv") { if (arg == "--override-kv") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -1353,6 +1438,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n"); printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
printf(" --grammar-file FNAME file to read grammar from\n"); printf(" --grammar-file FNAME file to read grammar from\n");
printf(" -j SCHEMA, --json-schema SCHEMA\n");
printf(" JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n");
printf(" For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n");
printf(" --cfg-negative-prompt PROMPT\n"); printf(" --cfg-negative-prompt PROMPT\n");
printf(" negative prompt to use for guidance. (default: empty)\n"); printf(" negative prompt to use for guidance. (default: empty)\n");
printf(" --cfg-negative-prompt-file FNAME\n"); printf(" --cfg-negative-prompt-file FNAME\n");

View file

@@ -39,6 +39,7 @@ extern char const *LLAMA_BUILD_TARGET;
struct llama_control_vector_load_info;
int get_math_cpu_count();
int32_t get_num_physical_cores();
//
@@ -48,7 +49,7 @@ int32_t get_num_physical_cores();
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_threads = get_math_cpu_count();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;

View file

@@ -11,35 +11,101 @@
using json = nlohmann::ordered_json;
template <typename Iterator>
static std::string join(Iterator begin, Iterator end, const std::string & separator);
static std::string repeat(const std::string & str, size_t n);
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
if (separator_rule.empty()) {
if (min_items == 0 && max_items == 1) {
return item_rule + "?";
} else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
return item_rule + "+";
}
}
std::string result;
if (min_items > 0) {
if (item_rule_is_literal && separator_rule.empty()) {
result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
} else {
std::vector<std::string> items(min_items, item_rule);
result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
}
}
std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
if (up_to_n == 0) {
return "";
} else if (up_to_n == 1) {
return "(" + content + ")?";
} else if (!separator_rule.empty() && !prefix_with_sep) {
return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
} else {
std::string res = repeat("(" + content + " ", up_to_n);
// strip trailing space
res = res.substr(0, res.length() - 1);
res += repeat(")?", up_to_n);
return res;
}
};
if (min_items > 0 && max_items != min_items) {
result += " ";
}
if (max_items != std::numeric_limits<int>::max()) {
result += opt_repetitions(max_items - min_items, min_items > 0);
} else {
std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
if (min_items == 0 && !separator_rule.empty()) {
result = "(" + item_rule + " " + item_operator + "*)?";
} else {
result += item_operator + "*";
}
}
return result;
}
const std::string SPACE_RULE = "\" \"?";
std::unordered_map<std::string, std::string> PRIMITIVE_RULES = {
{"boolean", "(\"true\" | \"false\") space"},
{"number", "(\"-\"? ([0-9] | [1-9] [0-9]*)) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space"},
{"integer", "(\"-\"? ([0-9] | [1-9] [0-9]*)) space"},
{"value", "object | array | string | number | boolean"},
{"object", "\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space"},
{"array", "\"[\" space ( value (\",\" space value)* )? \"]\" space"},
{"uuid", "\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space"},
{"string", " \"\\\"\" (\n"
" [^\"\\\\] |\n"
" \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])\n"
" )* \"\\\"\" space"},
{"null", "\"null\" space"}
};
std::vector<std::string> OBJECT_RULE_NAMES = {"object", "array", "string", "number", "boolean", "null", "value"};
std::unordered_map<std::string, std::string> DATE_RULES = {
{"date", "[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )"},
{"time", "([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )"},
{"date-time", "date \"T\" time"},
{"date-string", "\"\\\"\" date \"\\\"\" space"},
{"time-string", "\"\\\"\" time \"\\\"\" space"},
{"date-time-string", "\"\\\"\" date-time \"\\\"\" space"}
};
struct BuiltinRule {
std::string content;
std::vector<std::string> deps;
};
const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
{"boolean", {"(\"true\" | \"false\") space", {}}},
{"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
{"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
{"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
{"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
{"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
{"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
{"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
{"null", {"\"null\" space", {}}},
};
std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
{"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
{"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
{"date-time", {"date \"T\" time", {"date", "time"}}},
{"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
{"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
{"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}
};
static bool is_reserved_name(const std::string & name) {
@@ -47,7 +113,7 @@ static bool is_reserved_name(const std::string & name) {
if (RESERVED_NAMES.empty()) {
RESERVED_NAMES.insert("root");
for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
for (const auto &p : DATE_RULES) RESERVED_NAMES.insert(p.first);
for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
}
return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
}
@@ -192,7 +258,7 @@ private:
if (_dotall) {
rule = "[\\U00000000-\\U0010FFFF]";
} else {
rule = "[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]";
rule = "[^\\x0A\\x0D]";
}
return _add_rule("dot", rule);
};
@@ -308,13 +374,6 @@ private:
auto &sub = last.first;
auto sub_is_literal = last.second;
if (min_times == 0 && max_times == std::numeric_limits<int>::max()) {
sub += "*";
} else if (min_times == 0 && max_times == 1) {
sub += "?";
} else if (min_times == 1 && max_times == std::numeric_limits<int>::max()) {
sub += "+";
} else {
if (!sub_is_literal) {
std::string & sub_id = sub_rule_ids[sub];
if (sub_id.empty()) {
@@ -322,33 +381,14 @@ private:
}
sub = sub_id;
}
std::string result;
if (sub_is_literal && min_times > 0) {
result = "\"" + repeat(sub.substr(1, sub.length() - 2), min_times) + "\"";
} else {
for (int j = 0; j < min_times; j++) {
if (j > 0) {
result += " ";
}
result += sub;
}
}
if (min_times > 0 && min_times < max_times) {
result += " ";
}
if (max_times == std::numeric_limits<int>::max()) {
result += sub + "*";
} else {
for (int j = min_times; j < max_times; j++) {
if (j > min_times) {
result += " ";
}
result += sub + "?";
}
}
seq.back().first = result;
seq.back().first = build_repetition(
sub_is_literal ? "\"" + sub + "\"" : sub,
min_times,
max_times,
"",
sub_is_literal
);
seq.back().second = false;
}
} else {
std::string literal;
auto is_non_literal = [&](char c) {
@@ -424,7 +464,7 @@ private:
if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
std::string kv_rule = _add_rule(sub_name + "-kv", _add_rule("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
prop_kv_rule_names["*"] = kv_rule;
optional_props.push_back("*");
}
@@ -486,6 +526,25 @@ private:
return rule;
}
std::string _add_primitive(const std::string & name, const BuiltinRule & rule) {
auto n = _add_rule(name, rule.content);
for (const auto & dep : rule.deps) {
BuiltinRule dep_rule;
auto it = PRIMITIVE_RULES.find(dep);
if (it == PRIMITIVE_RULES.end()) {
it = STRING_FORMAT_RULES.find(dep);
if (it == STRING_FORMAT_RULES.end()) {
_errors.push_back("Rule " + dep + " not known");
continue;
}
}
if (_rules.find(dep) == _rules.end()) {
_add_primitive(dep, it->second);
}
}
return n;
}
public:
SchemaConverter(
const std::function<json(const std::string &)> & fetch_json,
@@ -647,49 +706,33 @@ public:
return _add_rule(rule_name, rule);
} else {
std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
std::string list_item_operator = "( \",\" space " + item_rule_name + " )";
std::string successive_items;
int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : -1;
if (min_items > 0) {
successive_items += repeat(list_item_operator, min_items - 1);
min_items--;
}
if (max_items >= 0 && max_items > min_items) {
successive_items += repeat(list_item_operator + "?", max_items - min_items - 1);
} else {
successive_items += list_item_operator + "*";
}
std::string rule;
if (min_items == 0) {
rule = "\"[\" space ( " + item_rule_name + " " + successive_items + " )? \"]\" space";
} else {
rule = "\"[\" space " + item_rule_name + " " + successive_items + " \"]\" space";
}
return _add_rule(rule_name, rule);
int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();
return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
}
} else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
return _visit_pattern(schema["pattern"], rule_name);
} else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
return _add_rule(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
return _add_primitive(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
} else if ((schema_type.is_null() || schema_type == "string") && DATE_RULES.find(schema_format) != DATE_RULES.end()) {
for (const auto & kv : DATE_RULES) {
_add_rule(kv.first, kv.second);
}
return schema_format + "-string";
} else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) {
auto prim_name = schema_format + "-string";
return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name)));
} else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) {
std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
} else if (schema.empty() || schema_type == "object") {
for (const auto & n : OBJECT_RULE_NAMES) {
_add_rule(n, PRIMITIVE_RULES.at(n));
}
return _add_rule(rule_name, "object");
return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
} else {
if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
_errors.push_back("Unrecognized schema: " + schema.dump());
return "";
}
// TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
return _add_rule(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
return _add_primitive(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
}
}
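To illustrate the new `build_repetition` helper and the `minItems`/`maxItems` handling in the array branch above, here is a small sketch that converts a bounded-array schema with the same `json_schema_to_grammar` entry point; the schema and the expected rule shown in the comment are illustrative (the full output also contains supporting rules such as `integer`, `integral-part` and `space`).

```cpp
#include "json-schema-to-grammar.h"
#include "json.hpp"

#include <iostream>

using json = nlohmann::ordered_json;

int main() {
    // An array of 1 to 3 integers. With the new code, the bounded repetition is
    // expressed with nested optional groups instead of one rule per count; the
    // root rule comes out roughly as:
    //   root ::= "[" space integer ("," space integer ("," space integer)?)? "]" space
    json schema = json::parse(R"({
        "type": "array",
        "items": {"type": "integer"},
        "minItems": 1,
        "maxItems": 3
    })");
    std::cout << json_schema_to_grammar(schema) << std::endl;
    return 0;
}
```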

View file

@@ -43,17 +43,18 @@ AnyModel = TypeVar("AnyModel", bound="type[Model]")
class Model(ABC):
_model_classes: dict[str, type[Model]] = {}
def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool):
self.dir_model = dir_model
self.ftype = ftype
self.fname_out = fname_out
self.is_big_endian = is_big_endian
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.use_temp_file = use_temp_file
self.is_safetensors = self._is_model_safetensors()
self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
self.part_names = self._get_part_names()
self.hparams = Model.load_hparams(self.dir_model)
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
@property
@@ -1206,9 +1207,91 @@ class StableLMModel(Model):
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
n_head = self.hparams.get("num_attention_heads")
n_kv_head = self.hparams.get("num_key_value_heads")
q_norms = dict()
k_norms = dict()
for name, data_torch in self.get_tensors():
# we don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
continue
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.squeeze().numpy()
n_dims = len(data.shape)
if name.find("q_layernorm.norms") != -1:
q_norms[name] = data
if len(q_norms) >= (block_count * n_head):
self._stack_qk_norm(block_count, name, tensor_map, n_head, q_norms, n_dims, layer_name="q_layernorm")
continue
if name.find("k_layernorm.norms") != -1:
k_norms[name] = data
if len(k_norms) >= (block_count * n_kv_head):
self._stack_qk_norm(block_count, name, tensor_map, n_kv_head, k_norms, n_dims, layer_name="k_layernorm")
continue
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, layer_name="q_layernorm"):
for bid in range(block_count):
datas = []
for xid in range(n_head):
ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
datas.append(norms[ename])
del norms[ename]
data = np.stack(datas, axis=0)
data_dtype = data.dtype
merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") @Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
class LlamaModel(Model): class LlamaModel(Model):
@ -1220,6 +1303,14 @@ class LlamaModel(Model):
except FileNotFoundError: except FileNotFoundError:
self._set_vocab_llama_hf() self._set_vocab_llama_hf()
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
special_vocab._set_special_token("prefix", 32007)
special_vocab._set_special_token("suffix", 32008)
special_vocab._set_special_token("middle", 32009)
special_vocab._set_special_token("eot", 32010)
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self): def set_gguf_parameters(self):
super().set_gguf_parameters() super().set_gguf_parameters()
hparams = self.hparams hparams = self.hparams
@ -1427,6 +1518,102 @@ class GrokModel(Model):
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
@Model.register("DbrxForCausalLM")
class DbrxModel(Model):
model_arch = gguf.MODEL_ARCH.DBRX
def set_gguf_parameters(self):
ffn_config = self.hparams["ffn_config"]
attn_config = self.hparams["attn_config"]
self.gguf_writer.add_name(self.hparams["model_type"])
self.gguf_writer.add_block_count(self.hparams["n_layers"])
self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
self.gguf_writer.add_head_count(self.hparams["n_heads"])
self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
self.gguf_writer.add_layer_norm_eps(1e-5)
self.gguf_writer.add_file_type(self.ftype)
print(f"gguf: file type = {self.ftype}")
def write_tensors(self):
block_count = self.hparams.get("n_layers")
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
for name, data_torch in self.get_tensors():
n_expert = self.hparams["ffn_config"]["moe_num_experts"]
n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
n_embd = self.hparams["d_model"]
# Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
# original implementation expects (n_expert, n_ff, n_embd) for all experts weights
# But llama.cpp moe graph works differently
# AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
# so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
"ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert}
"ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
experts = False
for exp_tensor_name in exp_tensor_names.keys():
if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
experts = True
data_torch = data_torch.view(n_expert, n_ff, n_embd)
if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
data_torch = data_torch.permute(*permute_tensor)
break
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.squeeze().numpy()
# map tensor names
# In MoE models the ffn tensors make up most of the model weights and need to be
# quantizable. Quantization expects tensor names to be suffixed with .weight.
# Every other model follows that naming convention, but dbrx does not:
# https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# Most of the codebase that takes in 1D tensors only handles F32 tensors
# and most of the output tensors are F32.
if data_dtype != np.float32 and n_dims == 1:
print(f"Can not map tensor {name!r}: all 1D tensors must be F32")
sys.exit()
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
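To make the expert-layout comment in `DbrxModel.write_tensors` concrete, here is a hedged NumPy sketch (not part of this diff; shapes are invented) of the view/permute step that turns a flat expert tensor into the order ggml expects:

```python
# a minimal sketch with invented sizes; mirrors the view()/permute() calls above
import numpy as np

n_expert, n_ff, n_embd = 4, 6, 3
flat = np.arange(n_expert * n_ff * n_embd, dtype=np.float32)

# w1 / v1: keep (n_expert, n_ff, n_embd), which ggml reads as ne = {n_embd, n_ff, n_expert}
w1 = flat.reshape(n_expert, n_ff, n_embd)

# w2: permute to (n_expert, n_embd, n_ff), which ggml reads as ne = {n_ff, n_embd, n_expert}
w2 = flat.reshape(n_expert, n_ff, n_embd).transpose(0, 2, 1)

print(w1.shape, w2.shape)  # (4, 6, 3) (4, 3, 6)
```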
@Model.register("MiniCPMForCausalLM") @Model.register("MiniCPMForCausalLM")
class MiniCPMModel(Model): class MiniCPMModel(Model):
model_arch = gguf.MODEL_ARCH.MINICPM model_arch = gguf.MODEL_ARCH.MINICPM
@ -1595,6 +1782,105 @@ class Qwen2Model(Model):
model_arch = gguf.MODEL_ARCH.QWEN2 model_arch = gguf.MODEL_ARCH.QWEN2
@Model.register("Qwen2MoeForCausalLM")
class Qwen2MoeModel(Model):
model_arch = gguf.MODEL_ARCH.QWEN2MOE
def set_gguf_parameters(self):
super().set_gguf_parameters()
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
n_experts = self.hparams.get("num_experts")
experts = dict()
for name, data_torch in self.get_tensors():
# we don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
continue
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.squeeze().numpy()
# process the experts separately
if name.find("experts") != -1:
experts[name] = data
if len(experts) >= n_experts * 3:
# merge the experts into a single 3d tensor
for bid in range(block_count):
for w_name in ["down_proj", "gate_proj", "up_proj"]:
full = True
for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
if ename not in experts:
full = False
break
if not full:
continue
datas = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(experts[ename])
del experts[ename]
data = np.stack(datas, axis=0)
data_dtype = data.dtype
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
if self.ftype == 1 and data_dtype == np.float32:
data = data.astype(np.float16)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
continue
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 tensors as-is? There should be no reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts.keys()}")
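As an aside on the expert-merging step in `Qwen2MoeModel.write_tensors` above, a hedged sketch (invented names and tiny shapes, not part of this diff) of how the per-expert 2D weights are stacked into the single 3D tensor that gets written:

```python
# a minimal sketch: three fake experts with 2x2 weights, stacked along a new leading axis
import numpy as np

experts = {
    f"model.layers.0.mlp.experts.{xid}.up_proj.weight": np.full((2, 2), xid, dtype=np.float32)
    for xid in range(3)
}

datas = [experts.pop(f"model.layers.0.mlp.experts.{xid}.up_proj.weight") for xid in range(3)]
merged = np.stack(datas, axis=0)                       # shape (n_expert, rows, cols)
merged_name = "model.layers.0.mlp.experts.up_proj.weight"
print(merged_name, merged.shape)                       # (3, 2, 2)
```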
@Model.register("GPT2LMHeadModel") @Model.register("GPT2LMHeadModel")
class GPT2Model(Model): class GPT2Model(Model):
model_arch = gguf.MODEL_ARCH.GPT2 model_arch = gguf.MODEL_ARCH.GPT2
@ -2143,6 +2429,13 @@ class GemmaModel(Model):
def set_vocab(self): def set_vocab(self):
self._set_vocab_sentencepiece() self._set_vocab_sentencepiece()
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
special_vocab._set_special_token("prefix", 67)
special_vocab._set_special_token("suffix", 69)
special_vocab._set_special_token("middle", 68)
special_vocab._set_special_token("eot", 70)
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self): def set_gguf_parameters(self):
hparams = self.hparams hparams = self.hparams
@ -2165,6 +2458,12 @@ class GemmaModel(Model):
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
for name, data_torch in self.get_tensors(): for name, data_torch in self.get_tensors():
# lm_head is not used in llama.cpp, but autoawq will include this tensor in the model.
# To prevent errors, skip loading lm_head.weight.
if name == "lm_head.weight":
print(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
continue
old_dtype = data_torch.dtype old_dtype = data_torch.dtype
# convert any unsupported data types to float32 # convert any unsupported data types to float32
@ -2337,6 +2636,66 @@ class CommandR2Model(Model):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
@Model.register("OlmoForCausalLM")
@Model.register("OLMoForCausalLM")
class OlmoModel(Model):
model_arch = gguf.MODEL_ARCH.OLMO
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_layer_norm_eps(1e-5)
if "clip_qkv" in self.hparams is not None:
self.gguf_writer.add_clamp_kqv(self.hparams["clip_qkv"])
# Same as super class, but permuting q_proj, k_proj
# Copied from: LlamaModel
def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
n_head = self.hparams.get("num_attention_heads")
n_kv_head = self.hparams.get("num_key_value_heads")
for name, data_torch in self.get_tensors():
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.numpy()
if name.endswith("q_proj.weight"):
data = permute(data, n_head, n_head)
if name.endswith("k_proj.weight"):
data = permute(data, n_head, n_kv_head)
data = data.squeeze()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# 1d tensors need to be converted to float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and n_dims == 2:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
###### CONVERSION LOGIC ###### ###### CONVERSION LOGIC ######
@ -2363,6 +2722,7 @@ def parse_args() -> argparse.Namespace:
"model", type=Path, "model", type=Path,
help="directory containing model file", help="directory containing model file",
) )
parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)")
return parser.parse_args() return parser.parse_args()
@ -2406,7 +2766,7 @@ def main() -> None:
with torch.inference_mode(): with torch.inference_mode():
model_class = Model.from_model_architecture(hparams["architectures"][0]) model_class = Model.from_model_architecture(hparams["architectures"][0])
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian) model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file)
print("Set model parameters") print("Set model parameters")
model_instance.set_gguf_parameters() model_instance.set_gguf_parameters()

View file

@ -28,14 +28,27 @@ static std::string ggml_ne_string(const ggml_tensor * t) {
} }
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
GGML_ASSERT(n > 0);
float sum = 0; float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) { for (int64_t i3 = 0; i3 < ne[3]; i3++) {
printf(" [\n"); printf(" [\n");
for (int64_t i2 = 0; i2 < ne[2] && i2 < n; i2++) { for (int64_t i2 = 0; i2 < ne[2]; i2++) {
if (i2 == n && ne[2] > 2*n) {
printf(" ..., \n");
i2 = ne[2] - n;
}
printf(" [\n"); printf(" [\n");
for (int64_t i1 = 0; i1 < ne[1] && i1 < n; i1++) { for (int64_t i1 = 0; i1 < ne[1]; i1++) {
if (i1 == n && ne[1] > 2*n) {
printf(" ..., \n");
i1 = ne[1] - n;
}
printf(" ["); printf(" [");
for (int64_t i0 = 0; i0 < ne[0] && i0 < n; i0++) { for (int64_t i0 = 0; i0 < ne[0]; i0++) {
if (i0 == n && ne[0] > 2*n) {
printf("..., ");
i0 = ne[0] - n;
}
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v; float v;
if (type == GGML_TYPE_F16) { if (type == GGML_TYPE_F16) {
@ -51,17 +64,14 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
} else { } else {
GGML_ASSERT(false); GGML_ASSERT(false);
} }
printf("%8.4f", v); printf("%12.4f", v);
sum += v; sum += v;
if (i0 < ne[0] - 1 && i0 < n - 1) printf(", "); if (i0 < ne[0] - 1) printf(", ");
} }
if (ne[0] > n) printf(", ...");
printf("],\n"); printf("],\n");
} }
if (ne[1] > n) printf(" ...\n");
printf(" ],\n"); printf(" ],\n");
} }
if (ne[2] > n) printf(" ...\n");
printf(" ]\n"); printf(" ]\n");
printf(" sum = %f\n", sum); printf(" sum = %f\n", sum);
} }
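The change to `ggml_print_tensor` above switches from printing only the first `n` entries per dimension to printing the first `n` and the last `n` with an ellipsis in between. A hedged Python analog of that truncation (not part of this diff) for a single row:

```python
# a minimal sketch of the "first n ... last n" printing used above
def print_row(row, n=3):
    parts = []
    i = 0
    while i < len(row):
        if i == n and len(row) > 2 * n:
            parts.append("...")
            i = len(row) - n          # jump to the tail, mirroring `i0 = ne[0] - n`
        parts.append(f"{row[i]:.4f}")
        i += 1
    print("[" + ", ".join(parts) + "]")

print_row([float(x) for x in range(10)])  # [0.0000, 1.0000, 2.0000, ..., 7.0000, 8.0000, 9.0000]
```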

View file

@ -5,5 +5,6 @@ CLI to split / merge GGUF files.
**Command line options:** **Command line options:**
- `--split`: split GGUF to multiple GGUF, default operation. - `--split`: split GGUF to multiple GGUF, default operation.
- `--split-max-size`: max size per split in `M` or `G`, f.ex. `500M` or `2G`.
- `--split-max-tensors`: maximum tensors in each split: default(128) - `--split-max-tensors`: maximum tensors in each split: default(128)
- `--merge`: merge multiple GGUF to a single GGUF. - `--merge`: merge multiple GGUF to a single GGUF.

View file

@ -59,10 +59,10 @@ static size_t split_str_to_n_bytes(std::string str) {
int n; int n;
if (str.back() == 'M') { if (str.back() == 'M') {
sscanf(str.c_str(), "%d", &n); sscanf(str.c_str(), "%d", &n);
n_bytes = n * 1024 * 1024; // megabytes n_bytes = (size_t)n * 1024 * 1024; // megabytes
} else if (str.back() == 'G') { } else if (str.back() == 'G') {
sscanf(str.c_str(), "%d", &n); sscanf(str.c_str(), "%d", &n);
n_bytes = n * 1024 * 1024 * 1024; // gigabytes n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
} else { } else {
throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back())); throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
} }
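The `(size_t)` casts above matter because `n * 1024 * 1024 * 1024` would otherwise be evaluated in 32-bit `int` and overflow for anything of 2G and up. A hedged Python analog of the parsing (Python integers do not overflow, so the sketch only illustrates the intended result):

```python
# a minimal Python analog of split_str_to_n_bytes; the C++ fix casts to size_t
# so the multiplication happens in 64 bits instead of overflowing a 32-bit int
def split_str_to_n_bytes(s: str) -> int:
    if s.endswith("M"):
        return int(s[:-1]) * 1024 * 1024
    if s.endswith("G"):
        return int(s[:-1]) * 1024 * 1024 * 1024
    raise ValueError(f"supported units are M (megabytes) or G (gigabytes), but got: {s[-1]}")

print(split_str_to_n_bytes("2G"))  # 2147483648, one more than INT32_MAX
```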

View file

@ -0,0 +1,89 @@
#!/bin/bash
set -eu
if [ $# -lt 1 ]
then
echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
echo "example: $0 ../../build/bin ../../tmp"
exit 1
fi
if [ $# -gt 1 ]
then
TMP_DIR=$2
else
TMP_DIR=/tmp
fi
set -x
SPLIT=$1/gguf-split
MAIN=$1/main
WORK_PATH=$TMP_DIR/gguf-split
CUR_DIR=$(pwd)
mkdir -p "$WORK_PATH"
# Clean up in case of previously failed test
rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
# 1. Get a model
(
cd $WORK_PATH
"$CUR_DIR"/../../scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
)
echo PASS
# 2. Split with max tensors strategy
$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
echo PASS
echo
# 2b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32
echo PASS
echo
# 3. Merge
$SPLIT --merge $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-merge.gguf
echo PASS
echo
# 3b. Test the merged model is loading properly
$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
echo PASS
echo
# 4. Split with no tensor in metadata
#$SPLIT --split-max-tensors 32 --no-tensor-in-metadata $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
#echo PASS
#echo
# 4b. Test the sharded model is loading properly
#$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf --random-prompt --n-predict 32
#echo PASS
#echo
# 5. Merge
#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf
#echo PASS
#echo
# 5b. Test the merged model is loading properly
#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32
#echo PASS
#echo
# 6. Split with size strategy
$SPLIT --split-max-size 2G $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-2G
echo PASS
echo
# 6b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32
echo PASS
echo
# Clean up
rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf

View file

@ -21,12 +21,12 @@ not have to be performed at all.
### Running the example ### Running the example
Download a Grit model: Download a Grit model:
```console ```console
$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf $ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --outdir models
``` ```
Run the example using the downloaded model: Run the example using the downloaded model:
```console ```console
$ ./gritlm -m gritlm-7b_q4_1.gguf $ ./gritlm -m models/gritlm-7b_q4_1.gguf
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103

View file

@ -44,7 +44,7 @@ private:
std::mutex m_mutex; std::mutex m_mutex;
int m_last_call = 0; int m_last_call = 0;
std::vector<float> m_src1_data; std::vector<float> m_src1_data;
std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
// //
void save_imatrix(const char * file_name) const; void save_imatrix(const char * file_name) const;
void keep_imatrix(int ncall) const; void keep_imatrix(int ncall) const;
@ -81,6 +81,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (ask) { if (ask) {
if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
if (t->op != GGML_OP_MUL_MAT) return false; if (t->op != GGML_OP_MUL_MAT) return false;
// why are small batches ignored (<16 tokens)?
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false; if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
return true; return true;
@ -101,14 +102,19 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
// this has been adapted to the new format of storing merged experts in a single 3d tensor // this has been adapted to the new format of storing merged experts in a single 3d tensor
// ref: https://github.com/ggerganov/llama.cpp/pull/6387 // ref: https://github.com/ggerganov/llama.cpp/pull/6387
if (t->op == GGML_OP_MUL_MAT_ID) { if (t->op == GGML_OP_MUL_MAT_ID) {
const int idx = ((int32_t *) t->op_params)[0]; // ids -> [n_experts_used, n_tokens]
// src1 -> [cols, n_expert_used, n_tokens]
const ggml_tensor * ids = t->src[2]; const ggml_tensor * ids = t->src[2];
const int n_as = src0->ne[2]; const int n_as = src0->ne[2];
const int n_ids = ids->ne[0];
// the top-k selected expert ids are stored in the ids tensor // the top-k selected expert ids are stored in the ids tensor
// for simplicity, always copy ids to host, because it is small // for simplicity, always copy ids to host, because it is small
GGML_ASSERT(ids->ne[1] == src1->ne[1]); // take into account that ids is not contiguous!
m_ids.resize(ggml_nbytes(ids)/sizeof(int));
GGML_ASSERT(ids->ne[1] == src1->ne[2]);
m_ids.resize(ggml_nbytes(ids));
ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids)); ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
auto & e = m_stats[wname]; auto & e = m_stats[wname];
@ -118,9 +124,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
// using the following line, we can correct for that if needed by replacing the line above with: // using the following line, we can correct for that if needed by replacing the line above with:
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall; //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
// loop over all possible experts, regardless if they are used or not in the batch
for (int ex = 0; ex < n_as; ++ex) {
size_t e_start = ex*src1->ne[0];
if (e.values.empty()) { if (e.values.empty()) {
e.values.resize(src1->ne[0]*n_as, 0); e.values.resize(src1->ne[0]*n_as, 0);
} }
@ -129,17 +132,29 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
exit(1); //GGML_ASSERT(false); exit(1); //GGML_ASSERT(false);
} }
if (m_params.verbosity > 1) { if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
} }
for (int row = 0; row < (int)src1->ne[1]; ++row) { // loop over all possible experts, regardless if they are used or not in the batch
const int excur = m_ids[row*n_as + idx]; for (int ex = 0; ex < n_as; ++ex) {
size_t e_start = ex*src1->ne[0];
for (int idx = 0; idx < n_ids; ++idx) {
for (int row = 0; row < (int)src1->ne[2]; ++row) {
const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]);
GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
if (excur != ex) continue; if (excur != ex) continue;
const float * x = data + row * src1->ne[0];
const int64_t i11 = idx % src1->ne[1];
const int64_t i12 = row;
const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
for (int j = 0; j < (int)src1->ne[0]; ++j) { for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[e_start + j] += x[j]*x[j]; e.values[e_start + j] += x[j]*x[j];
} }
} }
}
if (e.ncall > m_last_call) { if (e.ncall > m_last_call) {
m_last_call = e.ncall; m_last_call = e.ncall;
if (m_last_call % m_params.n_output_frequency == 0) { if (m_last_call % m_params.n_output_frequency == 0) {
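For orientation, a hedged NumPy sketch (invented shapes, not the real API) of the per-expert accumulation the updated imatrix collector performs, following the `ids`/`src1` layout described in the comments earlier in this hunk:

```python
# names echo the C++ (src1, ids, values) but this is only an illustration
import numpy as np

n_embd, n_expert_used, n_tokens, n_as = 8, 2, 4, 6
src1 = np.random.rand(n_tokens, n_expert_used, n_embd).astype(np.float32)  # ggml ne = {n_embd, n_expert_used, n_tokens}
ids  = np.random.randint(0, n_as, size=(n_tokens, n_expert_used))          # selected expert per (token, top-k slot)

values = np.zeros((n_as, n_embd), dtype=np.float32)   # running sum of squared activations per expert
for ex in range(n_as):                                # loop over all experts, used or not
    for idx in range(n_expert_used):
        for row in range(n_tokens):
            if ids[row, idx] != ex:
                continue
            x = src1[row, idx]                        # the activations routed to this expert
            values[ex] += x * x
```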

View file

@ -36,6 +36,11 @@ The `infill` program offers a seamless way to interact with LLaMA models, allowi
### Example ### Example
Download a model that supports infill, for example CodeLlama:
```console
scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
```
```bash ```bash
./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n " ./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
``` ```

View file

@ -6,37 +6,94 @@ import re
import sys import sys
from typing import Any, Dict, List, Set, Tuple, Union from typing import Any, Dict, List, Set, Tuple, Union
def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
if not separator_rule:
if min_items == 0 and max_items == 1:
return f'{item_rule}?'
elif min_items == 1 and max_items is None:
return f'{item_rule}+'
result = ''
if min_items > 0:
if item_rule_is_literal and separator_rule is None:
result = '"' + (item_rule[1:-1] * min_items) + '"'
else:
result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items)
def opt_repetitions(up_to_n, prefix_with_sep=False):
'''
- n=4, no sep: '(a (a (a (a)?)?)?)?'
- n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?'
- n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
'''
content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
if up_to_n == 0:
return ''
elif up_to_n == 1:
return f'({content})?'
elif separator_rule and not prefix_with_sep:
return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
else:
return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)
if min_items > 0 and max_items != min_items:
result += ' '
if max_items is not None:
result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
else:
item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
if min_items == 0 and separator_rule:
result = f'({item_rule} {item_operator}*)?'
else:
result += f'{item_operator}*'
return result
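For orientation, a couple of hedged examples of what `_build_repetition` produces (outputs derived by hand from the code above, shown as comments):

```python
# between 1 and 3 digits, no separator:
print(_build_repetition('[0-9]', 1, 3))      # [0-9] ([0-9] ([0-9])?)?

# one or more digits collapses to the '+' shorthand:
print(_build_repetition('[0-9]', 1, None))   # [0-9]+
```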
class BuiltinRule:
def __init__(self, content: str, deps: list = None):
self.content = content
self.deps = deps or []
_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
# whitespace is constrained to a single space char to prevent model "running away" in # whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality? # whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?' SPACE_RULE = '" "?'
PRIMITIVE_RULES = { PRIMITIVE_RULES = {
'boolean': '("true" | "false") space', 'boolean' : BuiltinRule('("true" | "false") space', []),
'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space', 'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []),
'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space', 'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []),
'value' : 'object | array | string | number | boolean', 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
'object' : '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', 'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']),
'array' : '"[" space ( value ("," space value)* )? "]" space', 'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
'uuid' : '"\\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + ' "\\"" space', 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
'string': r''' "\"" ( 'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
[^"\\] | 'uuid' : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []),
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []),
)* "\"" space''', 'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
'null': '"null" space', 'null' : BuiltinRule('"null" space', []),
} }
OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value']
# TODO: support "uri", "email" string formats # TODO: support "uri", "email" string formats
DATE_RULES = { STRING_FORMAT_RULES = {
'date' : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', 'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time': 'date "T" time', 'date-time' : BuiltinRule('date "T" time', ['date', 'time']),
'date-string': '"\\"" date "\\"" space', 'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string': '"\\"" time "\\"" space', 'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']),
'date-time-string': '"\\"" date-time "\\"" space', 'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
} }
RESERVED_NAMES = set(["root", *PRIMITIVE_RULES.keys(), *DATE_RULES.keys()]) DOTALL = '[\\U00000000-\\U0010FFFF]'
DOT = '[^\\x0A\\x0D]'
RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()])
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+') INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]') GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
@ -46,8 +103,6 @@ GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']'
NON_LITERAL_SET = set('|.()[]{}*+?') NON_LITERAL_SET = set('|.()[]{}*+?')
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?') ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])'
TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits
class SchemaConverter: class SchemaConverter:
def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern): def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
@ -55,7 +110,9 @@ class SchemaConverter:
self._allow_fetch = allow_fetch self._allow_fetch = allow_fetch
self._dotall = dotall self._dotall = dotall
self._raw_pattern = raw_pattern self._raw_pattern = raw_pattern
self._rules = {'space': SPACE_RULE} self._rules = {
'space': SPACE_RULE,
}
self._refs = {} self._refs = {}
self._refs_being_resolved = set() self._refs_being_resolved = set()
@ -65,6 +122,29 @@ class SchemaConverter:
) )
return f'"{escaped}"' return f'"{escaped}"'
def not_literal(self, literal: str, dotall: bool = True, maybe_escaped_underscores = False) -> str:
'''
not_literal('a') -> '[^a]'
not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?'
'''
assert len(literal) > 0, 'Empty literal not supported'
def recurse(i: int):
c = literal[i]
if maybe_escaped_underscores and c == '_':
yield f'[^{c}\\\\]'
yield ' | '
yield f'"\\\\"? "{c}"'
else:
yield f'[^{c}]'
if i < len(literal) - 1:
yield ' | '
yield self._format_literal(c)
yield ' ('
yield from recurse(i + 1)
yield ')?'
return ''.join(('(', *recurse(0), ')'))
def _add_rule(self, name, rule): def _add_rule(self, name, rule):
esc_name = INVALID_RULE_CHARS_RE.sub('-', name) esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
if esc_name not in self._rules or self._rules[esc_name] == rule: if esc_name not in self._rules or self._rules[esc_name] == rule:
@ -169,10 +249,10 @@ class SchemaConverter:
def get_dot(): def get_dot():
if self._dotall: if self._dotall:
rule = '[\\U00000000-\\U0010FFFF]' rule = DOTALL
else: else:
# Accept any character... except \n and \r line break chars (\x0A and \x0D) # Accept any character... except \n and \r line break chars (\x0A and \x0D)
rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]' rule = DOT
return self._add_rule(f'dot', rule) return self._add_rule(f'dot', rule)
def join_seq(): def join_seq():
@ -246,13 +326,6 @@ class SchemaConverter:
(sub, sub_is_literal) = seq[-1] (sub, sub_is_literal) = seq[-1]
if min_times == 0 and max_times is None:
seq[-1] = (f'{sub}*', False)
elif min_times == 0 and max_times == 1:
seq[-1] = (f'{sub}?', False)
elif min_times == 1 and max_times is None:
seq[-1] = (f'{sub}+', False)
else:
if not sub_is_literal: if not sub_is_literal:
id = sub_rule_ids.get(sub) id = sub_rule_ids.get(sub)
if id is None: if id is None:
@ -260,12 +333,7 @@ class SchemaConverter:
sub_rule_ids[sub] = id sub_rule_ids[sub] = id
sub = id sub = id
seq[-1] = ( seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False)
' '.join(
([f'"{sub[1:-1] * min_times}"'] if sub_is_literal else [sub] * min_times) +
([f'{sub}?'] * (max_times - min_times) if max_times is not None else [f'{sub}*'])),
False
)
else: else:
literal = '' literal = ''
while i < length: while i < length:
@ -373,49 +441,47 @@ class SchemaConverter:
' "]" space') ' "]" space')
else: else:
item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item') item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
list_item_operator = f'( "," space {item_rule_name} )'
successive_items = ""
min_items = schema.get("minItems", 0) min_items = schema.get("minItems", 0)
max_items = schema.get("maxItems") max_items = schema.get("maxItems")
if min_items > 0: return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space')
successive_items = list_item_operator * (min_items - 1)
min_items -= 1
if max_items is not None and max_items > min_items:
successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
else:
successive_items += list_item_operator + "*"
if min_items == 0:
rule = f'"[" space ( {item_rule_name} {successive_items} )? "]" space'
else:
rule = f'"[" space {item_rule_name} {successive_items} "]" space'
return self._add_rule(rule_name, rule)
elif schema_type in (None, 'string') and 'pattern' in schema: elif schema_type in (None, 'string') and 'pattern' in schema:
return self._visit_pattern(schema['pattern'], rule_name) return self._visit_pattern(schema['pattern'], rule_name)
elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''): elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''):
return self._add_rule( return self._add_primitive(
'root' if rule_name == 'root' else schema_format, 'root' if rule_name == 'root' else schema_format,
PRIMITIVE_RULES['uuid'] PRIMITIVE_RULES['uuid']
) )
elif schema_type in (None, 'string') and schema_format in DATE_RULES: elif schema_type in (None, 'string') and f'{schema_format}-string' in STRING_FORMAT_RULES:
for t, r in DATE_RULES.items(): prim_name = f'{schema_format}-string'
self._add_rule(t, r) return self._add_rule(rule_name, self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name]))
return schema_format + '-string'
elif schema_type == 'string' and ('minLength' in schema or 'maxLength' in schema):
char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
min_len = schema.get('minLength', 0)
max_len = schema.get('maxLength')
return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
elif (schema_type == 'object') or (len(schema) == 0): elif (schema_type == 'object') or (len(schema) == 0):
for n in OBJECT_RULE_NAMES: return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
self._add_rule(n, PRIMITIVE_RULES[n])
return self._add_rule(rule_name, 'object')
else: else:
assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}' assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
# TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero # TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
return self._add_rule( return self._add_primitive('root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type])
'root' if rule_name == 'root' else schema_type,
PRIMITIVE_RULES[schema_type] def _add_primitive(self, name: str, rule: BuiltinRule):
) n = self._add_rule(name, rule.content)
for dep in rule.deps:
dep_rule = PRIMITIVE_RULES.get(dep) or STRING_FORMAT_RULES.get(dep)
assert dep_rule, f'Rule {dep} not known'
if dep not in self._rules:
self._add_primitive(dep, dep_rule)
return n
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]): def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
prop_order = self._prop_order prop_order = self._prop_order
@ -437,7 +503,7 @@ class SchemaConverter:
value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value') value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
prop_kv_rule_names["*"] = self._add_rule( prop_kv_rule_names["*"] = self._add_rule(
f'{sub_name}-kv', f'{sub_name}-kv',
self._add_rule('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}' self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
) )
optional_props.append("*") optional_props.append("*")

View file

@ -190,7 +190,7 @@ static const cmd_params cmd_params_defaults = {
/* n_ubatch */ {512}, /* n_ubatch */ {512},
/* type_k */ {GGML_TYPE_F16}, /* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16}, /* type_v */ {GGML_TYPE_F16},
/* n_threads */ {get_num_physical_cores()}, /* n_threads */ {get_math_cpu_count()},
/* n_gpu_layers */ {99}, /* n_gpu_layers */ {99},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0}, /* main_gpu */ {0},

View file

@ -304,10 +304,12 @@ These options help improve the performance and memory usage of the LLaMA models.
- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation. - `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
### Grammars ### Grammars & JSON schemas
- `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax. - `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax.
- `--json-schema SCHEMA`: Specify a [JSON schema](https://json-schema.org/) to constrain model output to (e.g. `{}` for any JSON object, or `{"items": {"type": "string", "minLength": 10, "maxLength": 100}, "minItems": 10}` for a JSON array of strings with size constraints). If a schema uses external `$ref`s, you should use `--grammar "$( python examples/json_schema_to_grammar.py myschema.json )"` instead.
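For illustration, a hedged sketch of the workflow that flag describes, using the converter script mentioned above (model path, schema file name, and prompt are placeholders, not part of this diff):

```python
# write a schema, convert it to a GBNF grammar, then constrain ./main with it
import json, subprocess

with open("myschema.json", "w") as f:
    json.dump({"items": {"type": "string", "minLength": 10, "maxLength": 100}, "minItems": 10}, f)

grammar = subprocess.check_output(
    ["python", "examples/json_schema_to_grammar.py", "myschema.json"], text=True)

subprocess.run(["./main", "-m", "models/model.gguf", "-p", "Items:", "--grammar", grammar])
```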
### Quantization ### Quantization
For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize). For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize).

View file

@ -1852,12 +1852,20 @@ int main(int argc, char ** argv) {
const int32_t n_ctx = params.n_ctx; const int32_t n_ctx = params.n_ctx;
if (n_ctx <= 0) {
fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
return 1;
}
const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence; const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence;
if (ppl) { if (ppl) {
int n_seq = std::max(1, params.n_batch / n_ctx); const int32_t n_seq = std::max(1, params.n_batch / n_ctx);
int32_t n_kv = n_seq * n_ctx; const int32_t n_kv = n_seq * n_ctx;
params.n_parallel = n_seq; params.n_parallel = n_seq;
params.n_ctx = n_kv; params.n_ctx = n_kv;
params.n_batch = std::min(params.n_batch, n_kv); params.n_batch = std::min(params.n_batch, n_kv);
} else { } else {
params.n_batch = std::min(params.n_batch, params.n_ctx); params.n_batch = std::min(params.n_batch, params.n_ctx);

View file

@ -8,7 +8,7 @@ print(subprocess.check_output(
"python", "python",
os.path.join( os.path.join(
os.path.dirname(os.path.realpath(__file__)), os.path.dirname(os.path.realpath(__file__)),
"json-schema-to-grammar.py"), "json_schema_to_grammar.py"),
*rest, *rest,
"-", "-",
"--raw-pattern", "--raw-pattern",

View file

@ -11,7 +11,7 @@ install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}> SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
) )
target_link_libraries(${TARGET} PRIVATE common json-schema-to-grammar ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
if (LLAMA_SERVER_SSL) if (LLAMA_SERVER_SSL)
find_package(OpenSSL REQUIRED) find_package(OpenSSL REQUIRED)
target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto) target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)

View file

@ -11,6 +11,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
* Continuous batching * Continuous batching
* Multimodal (wip) * Multimodal (wip)
* Monitoring endpoints * Monitoring endpoints
* Schema-constrained JSON response format
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216). The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
@ -250,6 +251,8 @@ node index.js
`grammar`: Set grammar for grammar-based sampling. Default: no grammar `grammar`: Set grammar for grammar-based sampling. Default: no grammar
`json_schema`: Set a JSON schema for grammar-based sampling (e.g. `{"items": {"type": "string"}, "minItems": 10, "maxItems": 100}` for a list of strings, or `{}` for any JSON). See [tests](../../tests/test-json-schema-to-grammar.cpp) for supported features. Default: no JSON schema.
`seed`: Set the random number generator (RNG) seed. Default: `-1`, which is a random seed. `seed`: Set the random number generator (RNG) seed. Default: `-1`, which is a random seed.
`ignore_eos`: Ignore end of stream token and continue generating. Default: `false` `ignore_eos`: Ignore end of stream token and continue generating. Default: `false`
@ -365,6 +368,8 @@ Notice that each `probs` is an array of length `n_probs`.
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported. See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}`), similar to other OpenAI-inspired API providers.
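As a quick illustration of the schema-constrained `response_format` (a minimal sketch; the local URL, port, and use of `requests` are assumptions, not part of this diff):

```python
# ask a locally running llama.cpp server for a JSON array of exactly three strings
import requests

payload = {
    "model": "placeholder",  # any string works here
    "messages": [{"role": "user", "content": "List three fruits."}],
    "response_format": {
        "type": "json_object",
        "schema": {"items": {"type": "string"}, "minItems": 3, "maxItems": 3},
    },
}
r = requests.post("http://localhost:8080/v1/chat/completions", json=payload)
print(r.json()["choices"][0]["message"]["content"])
```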
*Examples:* *Examples:*
You can use either Python `openai` library with appropriate checkpoints: You can use either Python `openai` library with appropriate checkpoints:

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -51,26 +51,6 @@
margin-bottom: 0.5em; margin-bottom: 0.5em;
} }
button, input, textarea, .button, a.button, select {
color: #666;
border: 1px solid #ddd;
border-radius: 4px;
line-height: 1.5em;
padding: 0.25em 0.25em;
text-decoration: none;
font-size: 1.1rem;
}
button {
border: 1px solid #2a8aad;
background: #3584e4;
font-weight: normal;
color: #fff;
}
button:disabled {
background: #9cbce5;
}
#write form { #write form {
margin: 1em 0 0 0; margin: 1em 0 0 0;
display: flex; display: flex;
@ -587,7 +567,7 @@
runCompletion(); runCompletion();
} }
return html` return html`
<div class="right"> <div>
<button onclick=${submit} type="button" disabled=${generating.value}>Start</button> <button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
<button onclick=${stop} disabled=${!generating.value}>Stop</button> <button onclick=${stop} disabled=${!generating.value}>Stop</button>
<button onclick=${reset}>Reset</button> <button onclick=${reset}>Reset</button>

View file

@ -1,33 +1,95 @@
// WARNING: This file was ported from json-schema-to-grammar.py, please fix bugs / add features there first. // WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first.
const SPACE_RULE = '" "?'; const SPACE_RULE = '" "?';
const PRIMITIVE_RULES = { function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
boolean: '("true" | "false") space', const separatorRule = opts.separatorRule ?? '';
number: '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space', const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false
integer: '("-"? ([0-9] | [1-9] [0-9]*)) space',
value: 'object | array | string | number | boolean', if (separatorRule === '') {
object: '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', if (minItems === 0 && maxItems === 1) {
array: '"[" space ( value ("," space value)* )? "]" space', return `${itemRule}?`;
uuid: '"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space', } else if (minItems === 1 && maxItems === undefined) {
string: ` "\\"" ( return `${itemRule}+`;
[^"\\\\] | }
"\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) }
)* "\\"" space`,
null: '"null" space', let result = '';
if (minItems > 0) {
if (itemRuleIsLiteral && separatorRule === '') {
result = `"${itemRule.slice(1, -1).repeat(minItems)}"`;
} else {
result = Array.from({ length: minItems }, () => itemRule)
.join(separatorRule !== '' ? ` ${separatorRule} ` : ' ');
}
}
const optRepetitions = (upToN, prefixWithSep=false) => {
const content = separatorRule !== '' && prefixWithSep ? `${separatorRule} ${itemRule}` : itemRule;
if (upToN === 0) {
return '';
} else if (upToN === 1) {
return `(${content})?`;
} else if (separatorRule !== '' && !prefixWithSep) {
return `(${content} ${optRepetitions(upToN - 1, true)})?`;
} else {
return Array.from({ length: upToN }, () => `(${content}`).join(' ').trim() + Array.from({ length: upToN }, () => ')?').join('');
}
};
if (minItems > 0 && maxItems !== minItems) {
result += ' ';
}
if (maxItems !== undefined) {
result += optRepetitions(maxItems - minItems, minItems > 0);
} else {
const itemOperator = `(${separatorRule !== '' ? separatorRule + ' ' : ''}${itemRule})`;
if (minItems === 0 && separatorRule !== '') {
result = `(${itemRule} ${itemOperator}*)?`;
} else {
result += `${itemOperator}*`;
}
}
return result;
}
class BuiltinRule {
constructor(content, deps) {
this.content = content;
this.deps = deps || [];
}
}
const UP_TO_15_DIGITS = _buildRepetition('[0-9]', 0, 15);
const PRIMITIVE_RULES = {
boolean : new BuiltinRule('("true" | "false") space', []),
'decimal-part' : new BuiltinRule('[0-9] ' + UP_TO_15_DIGITS, []),
'integral-part': new BuiltinRule('[0-9] | [1-9] ' + UP_TO_15_DIGITS, []),
number : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
integer : new BuiltinRule('("-"? integral-part) space', ['integral-part']),
value : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
uuid : new BuiltinRule('"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space', []),
char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])`, []),
string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
null : new BuiltinRule('"null" space', []),
}; };
const OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value'];
// TODO: support "uri", "email" string formats // TODO: support "uri", "email" string formats
const DATE_RULES = { const STRING_FORMAT_RULES = {
'date' : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', 'date' : new BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', 'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time': 'date "T" time', 'date-time' : new BuiltinRule('date "T" time', ['date', 'time']),
'date-string': '"\\"" date "\\"" space', 'date-string' : new BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string': '"\\"" time "\\"" space', 'time-string' : new BuiltinRule('"\\"" time "\\"" space', ['time']),
'date-time-string': '"\\"" date-time "\\"" space', 'date-time-string': new BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
}; }
const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...DATE_RULES}; const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...STRING_FORMAT_RULES};
const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g; const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g;
const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g; const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g;
@ -158,7 +220,7 @@ export class SchemaConverter {
rule = '[\\U00000000-\\U0010FFFF]'; rule = '[\\U00000000-\\U0010FFFF]';
} else { } else {
// Accept any character... except \n and \r line break chars (\x0A and \x0D) // Accept any character... except \n and \r line break chars (\x0A and \x0D)
rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]'; rule = '[^\\x0A\\x0D]';
} }
return this._addRule('dot', rule); return this._addRule('dot', rule);
}; };
@ -259,13 +321,6 @@ export class SchemaConverter {
let [sub, subIsLiteral] = seq[seq.length - 1]; let [sub, subIsLiteral] = seq[seq.length - 1];
if (minTimes === 0 && maxTimes === Infinity) {
seq[seq.length - 1] = [`${sub}*`, false];
} else if (minTimes === 0 && maxTimes === 1) {
seq[seq.length - 1] = [`${sub}?`, false];
} else if (minTimes === 1 && maxTimes === Infinity) {
seq[seq.length - 1] = [`${sub}+`, false];
} else {
if (!subIsLiteral) { if (!subIsLiteral) {
let id = subRuleIds[sub]; let id = subRuleIds[sub];
if (id === undefined) { if (id === undefined) {
@ -275,10 +330,10 @@ export class SchemaConverter {
sub = id; sub = id;
} }
const repeatedSub = Array.from({ length: minTimes }, () => subIsLiteral ? `"${sub.slice(1, -1).repeat(minTimes)}"` : sub); seq[seq.length - 1] = [
const optionalSub = maxTimes !== undefined ? Array.from({ length: maxTimes - minTimes }, () => `${sub}?`) : [`${sub}*`]; _buildRepetition(subIsLiteral ? `"${sub}"` : sub, minTimes, maxTimes, {itemRuleIsLiteral: subIsLiteral}),
seq[seq.length - 1] = [repeatedSub.concat(optionalSub).join(' '), false]; false
} ];
} else { } else {
let literal = ''; let literal = '';
while (i < length) { while (i < length) {
@ -394,49 +449,50 @@ export class SchemaConverter {
); );
} else { } else {
const itemRuleName = this.visit(items, `${name ?? ''}${name ? '-' : ''}item`); const itemRuleName = this.visit(items, `${name ?? ''}${name ? '-' : ''}item`);
const listItemOperator = `( "," space ${itemRuleName} )`; const minItems = schema.minItems || 0;
let successiveItems = '';
let minItems = schema.minItems || 0;
const maxItems = schema.maxItems; const maxItems = schema.maxItems;
if (minItems > 0) { return this._addRule(ruleName, '"[" space ' + _buildRepetition(itemRuleName, minItems, maxItems, {separatorRule: '"," space'}) + ' "]" space');
successiveItems = listItemOperator.repeat(minItems - 1);
minItems--;
}
if (maxItems !== undefined && maxItems > minItems) {
successiveItems += `${listItemOperator}?`.repeat(maxItems - minItems - 1);
} else {
successiveItems += `${listItemOperator}*`;
}
const rule = minItems === 0
? `"[" space ( ${itemRuleName} ${successiveItems} )? "]" space`
: `"[" space ${itemRuleName} ${successiveItems} "]" space`;
return this._addRule(ruleName, rule);
} }
} else if ((schemaType === undefined || schemaType === 'string') && 'pattern' in schema) { } else if ((schemaType === undefined || schemaType === 'string') && 'pattern' in schema) {
return this._visitPattern(schema.pattern, ruleName); return this._visitPattern(schema.pattern, ruleName);
} else if ((schemaType === undefined || schemaType === 'string') && /^uuid[1-5]?$/.test(schema.format || '')) { } else if ((schemaType === undefined || schemaType === 'string') && /^uuid[1-5]?$/.test(schema.format || '')) {
return this._addRule( return this._addPrimitive(
ruleName === 'root' ? 'root' : schemaFormat, ruleName === 'root' ? 'root' : schemaFormat,
PRIMITIVE_RULES['uuid']) PRIMITIVE_RULES['uuid']
} else if ((schemaType === undefined || schemaType === 'string') && schema.format in DATE_RULES) { );
for (const [t, r] of Object.entries(DATE_RULES)) { } else if ((schemaType === undefined || schemaType === 'string') && `${schema.format}-string` in STRING_FORMAT_RULES) {
this._addRule(t, r); const primName = `${schema.format}-string`
} return this._addRule(ruleName, this._addPrimitive(primName, STRING_FORMAT_RULES[primName]));
return schemaFormat + '-string'; } else if (schemaType === 'string' && ('minLength' in schema || 'maxLength' in schema)) {
const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']);
const minLen = schema.minLength || 0;
const maxLen = schema.maxLength;
return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space');
} else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) { } else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
for (const n of OBJECT_RULE_NAMES) { return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object']));
this._addRule(n, PRIMITIVE_RULES[n]);
}
return this._addRule(ruleName, 'object');
} else { } else {
if (!(schemaType in PRIMITIVE_RULES)) { if (!(schemaType in PRIMITIVE_RULES)) {
throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`); throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`);
} }
// TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
return this._addRule(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]); return this._addPrimitive(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]);
} }
} }
_addPrimitive(name, rule) {
let n = this._addRule(name, rule.content);
for (const dep of rule.deps) {
const depRule = PRIMITIVE_RULES[dep] || STRING_FORMAT_RULES[dep];
if (!depRule) {
throw new Error(`Rule ${dep} not known`);
}
if (!(dep in this._rules)) {
this._addPrimitive(dep, depRule);
}
}
return n;
}
_buildObjectRule(properties, required, name, additionalProperties) { _buildObjectRule(properties, required, name, additionalProperties) {
const propOrder = this._propOrder; const propOrder = this._propOrder;
// sort by position in prop_order (if specified) then by original order // sort by position in prop_order (if specified) then by original order
@ -462,7 +518,7 @@ export class SchemaConverter {
const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`); const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`);
propKvRuleNames['*'] = this._addRule( propKvRuleNames['*'] = this._addRule(
`${subName}-kv`, `${subName}-kv`,
`${this._addRule('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`); `${this._addPrimitive('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
optionalProps.push('*'); optionalProps.push('*');
} }
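
Editor's note: the grammar-converter hunks above fold the hand-rolled min/max repetition handling into a shared `_buildRepetition` helper and route primitive rules through `_addPrimitive`, which registers each primitive's dependencies before use. As a point of reference, the sequential expansion that the removed inline code produced (the mandatory copies followed by optional ones, or a trailing `*` when unbounded) can be sketched in a few lines of self-contained C++; the function name and the exact output shape follow the old code shown above, not the new helper, which may arrange the optional tail differently:

#include <cstdio>
#include <optional>
#include <sstream>
#include <string>

// Expand "repeat item between min_times and max_times" into a GBNF-style
// sequence: min_times mandatory copies, then optional copies (or a trailing *
// when there is no upper bound).
static std::string build_repetition(const std::string & item, int min_times,
                                    std::optional<int> max_times) {
    std::ostringstream out;
    bool first = true;
    for (int i = 0; i < min_times; i++) {
        out << (first ? "" : " ") << item;
        first = false;
    }
    if (!max_times) {
        out << (first ? "" : " ") << item << "*";
    } else {
        for (int i = min_times; i < *max_times; i++) {
            out << (first ? "" : " ") << item << "?";
            first = false;
        }
    }
    return out.str();
}

int main() {
    std::printf("%s\n", build_repetition("item", 2, 4).c_str());            // item item item? item?
    std::printf("%s\n", build_repetition("item", 0, std::nullopt).c_str()); // item*
}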


@ -859,7 +859,7 @@ struct server_context {
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
// process "json_schema" and "grammar" // process "json_schema" and "grammar"
if (data.contains("json_schema") && data.contains("grammar")) { if (data.contains("json_schema") && !data["json_schema"].is_null() && data.contains("grammar") && !data["grammar"].is_null()) {
send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
return false; return false;
} else if (data.contains("json_schema") && !data.contains("grammar")) { } else if (data.contains("json_schema") && !data.contains("grammar")) {
@ -1095,7 +1095,7 @@ struct server_context {
}; };
if (llama_decode(ctx, batch_view) != 0) { if (llama_decode(ctx, batch_view) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__); LOG_ERROR("llama_decode() failed", {});
return; return;
} }
} }
@ -1293,7 +1293,11 @@ struct server_context {
} }
void send_error(const int id_task, const int id_multi, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { void send_error(const int id_task, const int id_multi, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
LOG_TEE("task %i - error: %s\n", id_task, error.c_str()); LOG_ERROR("task error", {
{"id_multi", id_multi},
{"id_task", id_task},
{"error", error},
});
server_task_result res; server_task_result res;
res.id = id_task; res.id = id_task;
@ -2198,7 +2202,11 @@ struct server_context {
if (ret != 0) { if (ret != 0) {
if (n_batch == 1 || ret < 0) { if (n_batch == 1 || ret < 0) {
// if you get here, it means the KV cache is full - try increasing it via the context size // if you get here, it means the KV cache is full - try increasing it via the context size
LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", {
{"i", i},
{"n_batch", ret},
{"ret", ret},
});
for (auto & slot : slots) { for (auto & slot : slots) {
slot.state = SLOT_STATE_PROCESSING; slot.state = SLOT_STATE_PROCESSING;
slot.command = SLOT_COMMAND_NONE; slot.command = SLOT_COMMAND_NONE;
@ -2208,12 +2216,16 @@ struct server_context {
break; // break loop of n_batch break; // break loop of n_batch
} }
LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
// retry with half the batch size to try to find a free slot in the KV cache // retry with half the batch size to try to find a free slot in the KV cache
n_batch /= 2; n_batch /= 2;
i -= n_batch; i -= n_batch;
LOG_WARNING("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation", {
{"i", i},
{"n_batch", n_batch},
{"ret", ret},
});
continue; // continue loop of n_batch continue; // continue loop of n_batch
} }
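
Editor's note: the batch-decoding hunk above keeps the existing back-off strategy and only changes the logging: when llama_decode cannot find a free KV-cache slot, the batch size is halved and the offset rewound so the same span is retried, until either it succeeds or a single-token batch still fails. A minimal, self-contained C++ sketch of that back-off loop, with a `try_decode` callback standing in for llama_decode (names and signature are illustrative):

#include <algorithm>
#include <functional>

// Returns true once `n_tokens` tokens have been decoded, halving the batch
// size whenever the decode callback reports a full KV cache (ret > 0).
static bool decode_with_backoff(int n_tokens, int n_batch,
                                const std::function<int(int i, int n)> & try_decode) {
    for (int i = 0; i < n_tokens; i += n_batch) {
        const int n   = std::min(n_batch, n_tokens - i);
        const int ret = try_decode(i, n);
        if (ret == 0) {
            continue; // this batch decoded fine
        }
        if (n_batch == 1 || ret < 0) {
            return false; // hard failure: cache full even for a single token
        }
        // no free KV-cache slot: retry the same span with a smaller batch
        n_batch /= 2;
        i -= n_batch; // the loop increment re-adds n_batch, so i is unchanged
    }
    return true;
}

The warning added in the diff records i, n_batch and ret on every retry, so the halving is visible in the server log.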

View file


@ -20,4 +20,4 @@ cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#cmake --build . --config Release --target llama-bench #cmake --build . --config Release --target llama-bench
#build all binary #build all binary
cmake --build . --config Release -v cmake --build . --config Release -j -v


@ -12,6 +12,7 @@ if [ $# -gt 0 ]; then
GGML_SYCL_SINGLE_GPU=1 GGML_SYCL_SINGLE_GPU=1
else else
GGML_SYCL_DEVICE=0 GGML_SYCL_DEVICE=0
GGML_SYCL_SINGLE_GPU=0
fi fi
#export GGML_SYCL_DEBUG=1 #export GGML_SYCL_DEBUG=1


@ -73,6 +73,7 @@ struct my_llama_model {
static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model";
static const char * LLM_KV_TRAINING_TYPE = "training.type"; static const char * LLM_KV_TRAINING_TYPE = "training.type";
static const char * LLM_KV_GENERAL_NAME = "general.name";
static const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; static const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture";
static const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; static const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type";
@ -529,6 +530,7 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex
static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) { static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) {
const char * arch = "llama"; const char * arch = "llama";
enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32; enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
std::vector<char> keybuf; std::vector<char> keybuf;
@ -540,6 +542,7 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo
// set arch // set arch
gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch); gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch);
gguf_set_val_str(fctx, LLM_KV_GENERAL_NAME, arch);
gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype); gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype);
// set hparams // set hparams


@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
# #
# ./examples/ts-type-to-grammar.sh "{a:string,b:string,c?:string}" # ./examples/ts-type-to-grammar.sh "{a:string,b:string,c?:string}"
# python examples/json-schema-to-grammar.py https://json.schemastore.org/tsconfig.json # python examples/json_schema_to_grammar.py https://json.schemastore.org/tsconfig.json
# #
set -euo pipefail set -euo pipefail
@ -25,4 +25,4 @@ npx ts-json-schema-generator --unstable --no-top-ref --path "$DTS_FILE" --type M
# https://github.com/YousefED/typescript-json-schema # https://github.com/YousefED/typescript-json-schema
# npx typescript-json-schema --defaultProps --required "$DTS_FILE" MyType | tee "$SCHEMA_FILE" >&2 # npx typescript-json-schema --defaultProps --required "$DTS_FILE" MyType | tee "$SCHEMA_FILE" >&2
./examples/json-schema-to-grammar.py "$SCHEMA_FILE" ./examples/json_schema_to_grammar.py "$SCHEMA_FILE"

flake.lock generated

@ -20,11 +20,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1712163089, "lastModified": 1712791164,
"narHash": "sha256-Um+8kTIrC19vD4/lUCN9/cU9kcOsD1O1m+axJqQPyMM=", "narHash": "sha256-3sbWO1mbpWsLepZGbWaMovSO7ndZeFqDSdX0hZ9nVyw=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "fd281bd6b7d3e32ddfa399853946f782553163b5", "rev": "1042fd8b148a9105f3c0aca3a6177fd1d9360ba5",
"type": "github" "type": "github"
}, },
"original": { "original": {


@ -1231,7 +1231,7 @@ static void ggml_cuda_op_mul_mat_cublas(
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool()); ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
if (src0->type != GGML_TYPE_F16) { if (src0->type != GGML_TYPE_F16) {
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type); const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
GGML_ASSERT(to_fp16_cuda != nullptr); GGML_ASSERT(to_fp16_cuda != nullptr);
@ -1241,7 +1241,7 @@ static void ggml_cuda_op_mul_mat_cublas(
} }
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get(); const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool()); ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool(id));
if (src1->type != GGML_TYPE_F16) { if (src1->type != GGML_TYPE_F16) {
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
GGML_ASSERT(to_fp16_cuda != nullptr); GGML_ASSERT(to_fp16_cuda != nullptr);
@ -1250,7 +1250,7 @@ static void ggml_cuda_op_mul_mat_cublas(
to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream); to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
} }
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get(); const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(), row_diff*src1_ncols); ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
const half alpha_f16 = 1.0f; const half alpha_f16 = 1.0f;
const half beta_f16 = 0.0f; const half beta_f16 = 0.0f;
@ -1946,7 +1946,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
} else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { } else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch // KQV single-batch
ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst); ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
} else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// KQ + KQV multi-batch // KQ + KQV multi-batch
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
} else if (use_dequantize_mul_mat_vec) { } else if (use_dequantize_mul_mat_vec) {
@ -1960,20 +1960,73 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
} }
} }
struct mmid_row_mapping {
int32_t i1;
int32_t i2;
};
static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous,
int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping,
const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
int64_t ne11, int64_t ne10,
size_t nb11, size_t nb12) {
int32_t iid1 = blockIdx.x;
int32_t id = blockIdx.y;
const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
if (row_id_i != i02) {
return;
}
const int64_t i11 = id % ne11;
const int64_t i12 = iid1;
__shared__ int src1_row;
if (threadIdx.x == 0) {
src1_row = atomicAdd(cur_src1_row, 1);
row_mapping[src1_row] = {id, iid1};
}
__syncthreads();
const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
for (int i = threadIdx.x; i < ne10; i += blockDim.x) {
src1_row_contiguous[i] = src1_row_original[i];
}
}
static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous,
const mmid_row_mapping * __restrict__ row_mapping,
int64_t ne0,
size_t nb1, size_t nb2) {
int32_t i = blockIdx.x;
const int32_t i1 = row_mapping[i].i1;
const int32_t i2 = row_mapping[i].i2;
const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
for (int j = threadIdx.x; j < ne0; j += blockDim.x) {
dst_row_original[j] = dst_row_contiguous[j];
}
}
static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src1 = dst->src[1];
const ggml_tensor * ids = dst->src[2]; const ggml_tensor * ids = dst->src[2];
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers"); GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
const size_t nb11 = src1->nb[1]; const int64_t n_as = ne02;
const size_t nb1 = dst->nb[1]; const int64_t n_ids = ids->ne[0];
const int32_t id = ((int32_t *) dst->op_params)[0];
const int32_t n_as = src0->ne[2];
std::vector<char> ids_host(ggml_nbytes(ids)); std::vector<char> ids_host(ggml_nbytes(ids));
const char * ids_dev = (const char *) ids->data; const char * ids_dev = (const char *) ids->data;
@ -1990,20 +2043,40 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
src0_row.ne[2] = 1; src0_row.ne[2] = 1;
src0_row.ne[3] = 1; src0_row.ne[3] = 1;
src0_row.nb[3] = src0->nb[2]; src0_row.nb[3] = nb02;
if (src1->ne[1] == 1) { src1_row.ne[1] = 1;
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { src1_row.ne[2] = 1;
const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); src1_row.ne[3] = 1;
src1_row.nb[2] = nb11;
src1_row.nb[3] = nb11;
GGML_ASSERT(row_id >= 0 && row_id < n_as); dst_row.ne[1] = 1;
dst_row.ne[2] = 1;
dst_row.ne[3] = 1;
dst_row.nb[2] = nb1;
dst_row.nb[3] = nb1;
src0_row.data = src0_original + row_id*src0->nb[2]; if (ne12 == 1) {
src1_row.data = src1_original + i01*src1->nb[1]; for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
dst_row.data = dst_original + i01*dst->nb[1]; for (int64_t id = 0; id < n_ids; id++) {
const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(i02 >= 0 && i02 < n_as);
const int64_t i11 = id % ne11;
const int64_t i12 = iid1;
const int64_t i1 = id;
const int64_t i2 = i12;
src0_row.data = src0_original + i02*nb02;
src1_row.data = src1_original + i11*nb11 + i12*nb12;
dst_row.data = dst_original + i1*nb1 + i2*nb2;
ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
} }
}
} else { } else {
ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1)); ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst)); ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
@ -2011,54 +2084,69 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
src1_row.data = src1_contiguous.get(); src1_row.data = src1_contiguous.get();
dst_row.data = dst_contiguous.get(); dst_row.data = dst_contiguous.get();
for (int32_t row_id = 0; row_id < n_as; ++row_id) { for (int64_t i02 = 0; i02 < n_as; i02++) {
int64_t num_src1_rows = 0; int64_t num_src1_rows = 0;
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
if (row_id_i != row_id) { for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
for (int64_t id = 0; id < n_ids; id++) {
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
if (row_id_i != i02) {
continue; continue;
} }
GGML_ASSERT(row_id >= 0 && row_id < n_as);
CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
nb11, cudaMemcpyDeviceToDevice, stream));
num_src1_rows++; num_src1_rows++;
} }
}
if (num_src1_rows == 0) { if (num_src1_rows == 0) {
continue; continue;
} }
src0_row.data = src0_original + row_id*src0->nb[2]; ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
{
dim3 block_dims(std::min((unsigned int)ne10, 768u));
dim3 grid_dims(ids->ne[1], n_ids);
k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
src1_original, src1_contiguous.get(),
dev_cur_src1_row.get(), dev_row_mapping.get(),
ids_dev, i02, ids->nb[1], ids->nb[0],
ne11, ne10,
nb11, nb12);
CUDA_CHECK(cudaGetLastError());
}
src0_row.data = src0_original + i02*nb02;
GGML_ASSERT(nb11 == sizeof(float)*ne10);
GGML_ASSERT(nb1 == sizeof(float)*ne0);
src1_row.ne[1] = num_src1_rows; src1_row.ne[1] = num_src1_rows;
dst_row.ne[1] = num_src1_rows;
src1_row.nb[1] = nb11; src1_row.nb[1] = nb11;
src1_row.nb[2] = num_src1_rows*nb11; src1_row.nb[2] = num_src1_rows*nb11;
src1_row.nb[3] = num_src1_rows*nb11; src1_row.nb[3] = num_src1_rows*nb11;
dst_row.ne[1] = num_src1_rows;
dst_row.nb[1] = nb1; dst_row.nb[1] = nb1;
dst_row.nb[2] = num_src1_rows*nb1; dst_row.nb[2] = num_src1_rows*nb1;
dst_row.nb[3] = num_src1_rows*nb1; dst_row.nb[3] = num_src1_rows*nb1;
ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
num_src1_rows = 0; {
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { dim3 block_dims(std::min((unsigned int)ne0, 768u));
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); dim3 grid_dims(num_src1_rows);
k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
if (row_id_i != row_id) { dst_original, dst_contiguous.get(),
continue; dev_row_mapping.get(),
} ne0,
nb1, nb2);
GGML_ASSERT(row_id >= 0 && row_id < n_as); CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
nb1, cudaMemcpyDeviceToDevice, stream));
num_src1_rows++;
} }
} }
} }
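
Editor's note: for batched inputs, the rewritten ggml_cuda_mul_mat_id gathers all src1 rows routed to a given expert into one contiguous buffer (recording a row mapping as it goes), runs a single matmul per expert, and scatters the results back with the same mapping. A CPU-side sketch of that gather/compute/scatter flow, with a trivial per-row scaling standing in for the expert matmul; container choices and names are illustrative, the real path uses the two CUDA kernels above plus ggml_cuda_mul_mat per expert:

#include <cstdint>
#include <vector>

struct row_mapping { int32_t i1; int32_t i2; }; // original position of a gathered row

// rows:   input rows
// expert: expert id selected for each row (assumed in [0, scale.size()))
// For each expert: gather its rows, apply `scale` (stand-in for the matmul),
// then scatter the results back to their original positions.
static std::vector<std::vector<float>> mul_mat_id_cpu(
        const std::vector<std::vector<float>> & rows,
        const std::vector<int32_t> & expert,
        const std::vector<float> & scale /* one factor per expert */) {
    std::vector<std::vector<float>> out(rows.size());
    for (int32_t e = 0; e < (int32_t) scale.size(); e++) {
        // gather: contiguous copy of all rows routed to expert e + their mapping
        std::vector<std::vector<float>> contig;
        std::vector<row_mapping> mapping;
        for (int32_t r = 0; r < (int32_t) rows.size(); r++) {
            if (expert[r] == e) {
                contig.push_back(rows[r]);
                mapping.push_back({r, 0});
            }
        }
        // compute: one batched op per expert (here: scale every gathered row)
        for (auto & row : contig) {
            for (float & x : row) {
                x *= scale[e];
            }
        }
        // scatter: copy results back using the recorded mapping
        for (size_t k = 0; k < contig.size(); k++) {
            out[mapping[k].i1] = contig[k];
        }
    }
    return out;
}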
@ -2487,7 +2575,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) { GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
const int min_batch_size = 32; const int min_batch_size = 32;
return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS; return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
GGML_UNUSED(backend); GGML_UNUSED(backend);
} }


@ -22,6 +22,7 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
int ne0, int ne1, int ne2, int ne3, int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13, int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3, /*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13) { /*int s10,*/ int s11, int s12, int s13) {
const int i0s = blockDim.x*blockIdx.x + threadIdx.x; const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
const int i1 = (blockDim.y*blockIdx.y + threadIdx.y); const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
@ -36,9 +37,9 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
const int i12 = i2 % ne12; const int i12 = i2 % ne12;
const int i13 = i3 % ne13; const int i13 = i3 % ne13;
const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
const size_t i_dst = i_src0; const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
const src0_t * src0_row = src0 + i_src0; const src0_t * src0_row = src0 + i_src0;
const src1_t * src1_row = src1 + i_src1; const src1_t * src1_row = src1 + i_src1;
@ -55,6 +56,7 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
int ne0, int ne1, int ne2, int ne3, int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13, int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3, /*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13) { /*int s10,*/ int s11, int s12, int s13) {
const int i = blockDim.x*blockIdx.x + threadIdx.x; const int i = blockDim.x*blockIdx.x + threadIdx.x;
@ -72,9 +74,9 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
const int i12 = i2 % ne12; const int i12 = i2 % ne12;
const int i13 = i3 % ne13; const int i13 = i3 % ne13;
const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
const size_t i_dst = i_src0; const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
const src0_t * src0_row = src0 + i_src0; const src0_t * src0_row = src0 + i_src0;
const src1_t * src1_row = src1 + i_src1; const src1_t * src1_row = src1 + i_src1;
@ -101,10 +103,14 @@ struct bin_bcast_cuda {
int nr[4] = { nr0, nr1, nr2, nr3 }; int nr[4] = { nr0, nr1, nr2, nr3 };
// collapse dimensions until first broadcast dimension // collapse dimensions until first broadcast dimension
int64_t cne0[] = {ne0, ne1, ne2, ne3}; int64_t cne[] = {ne0, ne1, ne2, ne3};
int64_t cne0[] = {ne00, ne01, ne02, ne03};
int64_t cne1[] = {ne10, ne11, ne12, ne13}; int64_t cne1[] = {ne10, ne11, ne12, ne13};
size_t cnb0[] = {nb0, nb1, nb2, nb3};
size_t cnb[] = {nb0, nb1, nb2, nb3};
size_t cnb0[] = {nb00, nb01, nb02, nb03};
size_t cnb1[] = {nb10, nb11, nb12, nb13}; size_t cnb1[] = {nb10, nb11, nb12, nb13};
auto collapse = [](int64_t cne[]) { auto collapse = [](int64_t cne[]) {
cne[0] *= cne[1]; cne[0] *= cne[1];
cne[1] = cne[2]; cne[1] = cne[2];
@ -118,32 +124,47 @@ struct bin_bcast_cuda {
cnb[3] *= cne[3]; cnb[3] *= cne[3];
}; };
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
if (nr[i] != 1) { if (nr[i] != 1) {
break; break;
} }
if (i > 0) { if (i > 0) {
collapse_nb(cnb, cne);
collapse_nb(cnb0, cne0); collapse_nb(cnb0, cne0);
collapse_nb(cnb1, cne1); collapse_nb(cnb1, cne1);
collapse(cne);
collapse(cne0); collapse(cne0);
collapse(cne1); collapse(cne1);
} }
} }
}
{ {
int64_t ne0 = cne0[0]; int64_t ne0 = cne[0];
int64_t ne1 = cne0[1]; int64_t ne1 = cne[1];
int64_t ne2 = cne0[2]; int64_t ne2 = cne[2];
int64_t ne3 = cne0[3]; int64_t ne3 = cne[3];
//int64_t ne00 = cne0[0]; GGML_UNUSED(ne00);
//int64_t ne01 = cne0[1]; GGML_UNUSED(ne01);
//int64_t ne02 = cne0[2]; GGML_UNUSED(ne02);
//int64_t ne03 = cne0[3]; GGML_UNUSED(ne03);
int64_t ne10 = cne1[0]; int64_t ne10 = cne1[0];
int64_t ne11 = cne1[1]; int64_t ne11 = cne1[1];
int64_t ne12 = cne1[2]; int64_t ne12 = cne1[2];
int64_t ne13 = cne1[3]; int64_t ne13 = cne1[3];
size_t nb0 = cnb0[0]; size_t nb0 = cnb[0];
size_t nb1 = cnb0[1]; size_t nb1 = cnb[1];
size_t nb2 = cnb0[2]; size_t nb2 = cnb[2];
size_t nb3 = cnb0[3]; size_t nb3 = cnb[3];
size_t nb00 = cnb0[0];
size_t nb01 = cnb0[1];
size_t nb02 = cnb0[2];
size_t nb03 = cnb0[3];
size_t nb10 = cnb1[0]; size_t nb10 = cnb1[0];
size_t nb11 = cnb1[1]; size_t nb11 = cnb1[1];
@ -160,7 +181,28 @@ struct bin_bcast_cuda {
size_t s12 = nb12 / sizeof(src1_t); size_t s12 = nb12 / sizeof(src1_t);
size_t s13 = nb13 / sizeof(src1_t); size_t s13 = nb13 / sizeof(src1_t);
size_t s00 = nb00 / sizeof(src0_t);
size_t s01 = nb01 / sizeof(src0_t);
size_t s02 = nb02 / sizeof(src0_t);
size_t s03 = nb03 / sizeof(src0_t);
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
GGML_ASSERT(s0 == 1); GGML_ASSERT(s0 == 1);
GGML_ASSERT(s00 == 1);
GGML_ASSERT(s10 == 1); GGML_ASSERT(s10 == 1);
const int block_size = 128; const int block_size = 128;
@ -179,13 +221,14 @@ struct bin_bcast_cuda {
); );
if (block_nums.z > 65535) { if (block_nums.z > 65535) {
// this is the maximum number of blocks in z direction, fallback to 1D grid kernel // this is the maximum number of blocks in z dimension, fallback to 1D grid kernel
int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size; int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>( k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
src0_dd, src1_dd, dst_dd, src0_dd, src1_dd, dst_dd,
ne0, ne1, ne2, ne3, ne0, ne1, ne2, ne3,
ne10, ne11, ne12, ne13, ne10, ne11, ne12, ne13,
/* s0, */ s1, s2, s3, /* s0, */ s1, s2, s3,
/* s00, */ s01, s02, s03,
/* s10, */ s11, s12, s13); /* s10, */ s11, s12, s13);
} else { } else {
k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>( k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
@ -193,6 +236,7 @@ struct bin_bcast_cuda {
ne0, ne1, ne2, ne3, ne0, ne1, ne2, ne3,
ne10, ne11, ne12, ne13, ne10, ne11, ne12, ne13,
/* s0, */ s1, s2, s3, /* s0, */ s1, s2, s3,
/* s00, */ s01, s02, s03,
/* s10, */ s11, s12, s13); /* s10, */ s11, s12, s13);
} }
} }
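
Editor's note: with the separate s00..s03 strides for src0, the broadcast kernel no longer assumes dst and src0 share a layout: src0 is indexed with its own strides, dst gets its own, and src1 indices are wrapped with a modulo to implement broadcasting. A small CPU sketch of that indexing for a 2-D case; the real kernels work on flattened 4-D tensors and the names here are illustrative:

#include <cstddef>

// dst[i1][i0] = src0[i1][i0] + src1[i1 % ne11][i0 % ne10]
// Each tensor carries its own row stride, so src0 and dst may have different layouts.
static void add_bcast_2d(const float * src0, size_t s01,   // src0 row stride (elements)
                         const float * src1, size_t s11, int ne10, int ne11,
                         float * dst, size_t s1,           // dst row stride (elements)
                         int ne0, int ne1) {
    for (int i1 = 0; i1 < ne1; i1++) {
        for (int i0 = 0; i0 < ne0; i0++) {
            const size_t i_src0 = i1 * s01 + i0;                   // src0 uses its own strides
            const size_t i_src1 = (i1 % ne11) * s11 + (i0 % ne10); // broadcast src1
            const size_t i_dst  = i1 * s1 + i0;                    // dst strides are independent
            dst[i_dst] = src0[i_src0] + src1[i_src1];
        }
    }
}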


@ -45,6 +45,8 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
vals[ix] = x0[ix]; vals[ix] = x0[ix];
} }
__syncthreads();
#pragma unroll #pragma unroll
for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) { for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
if (need_check && i0 + iy + 2*threadIdx.x >= k) { if (need_check && i0 + iy + 2*threadIdx.x >= k) {


@ -88,7 +88,7 @@ typedef uint16_t ggml_fp16_internal_t;
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h> #include <intrin.h>
#else #else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
#if !defined(__riscv) #if !defined(__riscv)
#include <immintrin.h> #include <immintrin.h>
#endif #endif


@ -37,11 +37,15 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_DIV_ROW, GGML_METAL_KERNEL_TYPE_DIV_ROW,
GGML_METAL_KERNEL_TYPE_SCALE, GGML_METAL_KERNEL_TYPE_SCALE,
GGML_METAL_KERNEL_TYPE_SCALE_4, GGML_METAL_KERNEL_TYPE_SCALE_4,
GGML_METAL_KERNEL_TYPE_CLAMP,
GGML_METAL_KERNEL_TYPE_TANH, GGML_METAL_KERNEL_TYPE_TANH,
GGML_METAL_KERNEL_TYPE_RELU, GGML_METAL_KERNEL_TYPE_RELU,
GGML_METAL_KERNEL_TYPE_GELU, GGML_METAL_KERNEL_TYPE_GELU,
GGML_METAL_KERNEL_TYPE_GELU_4,
GGML_METAL_KERNEL_TYPE_GELU_QUICK, GGML_METAL_KERNEL_TYPE_GELU_QUICK,
GGML_METAL_KERNEL_TYPE_GELU_QUICK_4,
GGML_METAL_KERNEL_TYPE_SILU, GGML_METAL_KERNEL_TYPE_SILU,
GGML_METAL_KERNEL_TYPE_SILU_4,
GGML_METAL_KERNEL_TYPE_SOFT_MAX, GGML_METAL_KERNEL_TYPE_SOFT_MAX,
GGML_METAL_KERNEL_TYPE_SOFT_MAX_4, GGML_METAL_KERNEL_TYPE_SOFT_MAX_4,
GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,
@ -468,11 +472,15 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE, scale, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE, scale, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4, scale_4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4, scale_4, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP, clamp, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH, tanh, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH, tanh, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU, relu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU, relu, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4, gelu_4, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, gelu_quick_4, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4, silu_4, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX, soft_max, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX, soft_max, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_4, soft_max_4, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_4, soft_max_4, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, diag_mask_inf, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, diag_mask_inf, true);
@ -713,6 +721,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
case GGML_OP_MUL: case GGML_OP_MUL:
case GGML_OP_DIV: case GGML_OP_DIV:
case GGML_OP_SCALE: case GGML_OP_SCALE:
case GGML_OP_CLAMP:
case GGML_OP_SQR: case GGML_OP_SQR:
case GGML_OP_SUM_ROWS: case GGML_OP_SUM_ROWS:
return true; return true;
@ -1152,10 +1161,32 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&scale length:sizeof(scale) atIndex:2]; [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_CLAMP:
{
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CLAMP].pipeline;
float min;
float max;
memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float));
memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float));
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&min length:sizeof(min) atIndex:2];
[encoder setBytes:&max length:sizeof(max) atIndex:3];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
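
Editor's note: the new CLAMP case reads its min/max bounds out of dst->op_params, which is declared as an int32 array that ops pack floats into, hence the memcpy rather than a cast. A hedged C++ sketch of that pack/unpack convention (illustrative):

#include <cstdint>
#include <cstring>

// op_params is an int32_t array, but ops may store floats in it;
// memcpy is the portable way to reinterpret the bits (no strict-aliasing issues).
static void pack_clamp_params(int32_t * op_params, float min_v, float max_v) {
    std::memcpy(&op_params[0], &min_v, sizeof(float));
    std::memcpy(&op_params[1], &max_v, sizeof(float));
}

static void unpack_clamp_params(const int32_t * op_params, float & min_v, float & max_v) {
    std::memcpy(&min_v, &op_params[0], sizeof(float));
    std::memcpy(&max_v, &op_params[1], sizeof(float));
}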
case GGML_OP_UNARY: case GGML_OP_UNARY:
switch (ggml_get_unary_op(gf->nodes[i])) { switch (ggml_get_unary_op(gf->nodes[i])) {
// we are not taking into account the strides, so for now require contiguous tensors
GGML_ASSERT(ggml_is_contiguous(src0));
case GGML_UNARY_OP_TANH: case GGML_UNARY_OP_TANH:
{ {
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline; id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline;
@ -1182,42 +1213,60 @@ static enum ggml_status ggml_metal_graph_compute(
} break; } break;
case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU:
{ {
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline; int64_t n = ggml_nelements(dst);
id<MTLComputePipelineState> pipeline = nil;
if (n % 4 == 0) {
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_4].pipeline;
n /= 4;
} else {
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline;
}
[encoder setComputePipelineState:pipeline]; [encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst); [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
GGML_ASSERT(n % 4 == 0);
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_GELU_QUICK:
{ {
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline; int64_t n = ggml_nelements(dst);
id<MTLComputePipelineState> pipeline = nil;
if (n % 4 == 0) {
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK_4].pipeline;
n /= 4;
} else {
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline;
}
[encoder setComputePipelineState:pipeline]; [encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst); [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
GGML_ASSERT(n % 4 == 0);
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_SILU:
{ {
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline; int64_t n = ggml_nelements(dst);
id<MTLComputePipelineState> pipeline = nil;
if (n % 4 == 0) {
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU_4].pipeline;
n /= 4;
} else {
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline;
}
[encoder setComputePipelineState:pipeline]; [encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst); [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
GGML_ASSERT(n % 4 == 0);
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
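
Editor's note: GELU, GELU_QUICK and SILU now pick between a scalar kernel and a 4-wide variant at encode time: if the element count is divisible by 4, the _4 pipeline is used and the dispatch count is divided by 4; otherwise the scalar kernel handles the whole tensor. A small C++ sketch of that dispatch decision (names are illustrative):

#include <cstdint>

enum class kernel_variant { scalar, vec4 };

struct dispatch_plan {
    kernel_variant variant;
    int64_t        n_threadgroups; // one element (or one float4) per threadgroup, as in the Metal path
};

// Prefer the 4-wide kernel when the element count allows it.
static dispatch_plan plan_unary_dispatch(int64_t n_elements) {
    if (n_elements % 4 == 0) {
        return {kernel_variant::vec4, n_elements / 4};
    }
    return {kernel_variant::scalar, n_elements};
}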
default: default:
{ {
@ -1683,15 +1732,10 @@ static enum ggml_status ggml_metal_graph_compute(
} break; } break;
case GGML_OP_MUL_MAT_ID: case GGML_OP_MUL_MAT_ID:
{ {
//GGML_ASSERT(ne00 == ne10);
//GGML_ASSERT(ne03 == ne13);
const int n_as = src0->ne[2]; const int n_as = src0->ne[2];
// max size of the src1ids array in the kernel shared buffer
GGML_ASSERT(ne11 <= 4096);
// src2 = ids // src2 = ids
const int64_t ne20 = src2->ne[0]; GGML_UNUSED(ne20); const int64_t ne20 = src2->ne[0];
const int64_t ne21 = src2->ne[1]; const int64_t ne21 = src2->ne[1];
const int64_t ne22 = src2->ne[2]; GGML_UNUSED(ne22); const int64_t ne22 = src2->ne[2]; GGML_UNUSED(ne22);
const int64_t ne23 = src2->ne[3]; GGML_UNUSED(ne23); const int64_t ne23 = src2->ne[3]; GGML_UNUSED(ne23);
@ -1712,15 +1756,13 @@ static enum ggml_status ggml_metal_graph_compute(
// find the break-even point where the matrix-matrix kernel becomes more efficient compared // find the break-even point where the matrix-matrix kernel becomes more efficient compared
// to the matrix-vector kernel // to the matrix-vector kernel
int ne11_mm_min = n_as; // ne20 = n_used_experts
// ne21 = n_rows
const int dst_rows = ne20*ne21;
const int dst_rows_min = n_as;
const int idx = ((int32_t *) dst->op_params)[0]; // max size of the rowids array in the kernel shared buffer
GGML_ASSERT(dst_rows <= 2048);
// batch size
GGML_ASSERT(ne21 == ne11); // ?
GGML_ASSERT(ne12 == 1 && ne13 == 1); // no broadcasting
const uint r2 = 1;
const uint r3 = 1;
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
@ -1730,7 +1772,7 @@ static enum ggml_status ggml_metal_graph_compute(
// !!! // !!!
if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
ne00 % 32 == 0 && ne00 >= 64 && ne00 % 32 == 0 && ne00 >= 64 &&
ne11 > ne11_mm_min) { dst_rows > dst_rows_min) {
// some Metal matrix data types require aligned pointers // some Metal matrix data types require aligned pointers
// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5) // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
@ -1772,26 +1814,26 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:3]; [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3];
[encoder setBytes:&nb21 length:sizeof(nb21) atIndex:4]; [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:5]; [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:6]; [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:8];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:9]; [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:9];
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:10]; [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:10];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:11]; [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:12]; [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:13]; [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:14]; [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:15]; [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:16]; [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
[encoder setBytes:&r2 length:sizeof(r2) atIndex:17]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:17];
[encoder setBytes:&r3 length:sizeof(r3) atIndex:18]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:18];
[encoder setBytes:&idx length:sizeof(idx) atIndex:19]; [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:19];
[encoder setThreadgroupMemoryLength:GGML_PAD(8192 + 2*ne11, 16) atIndex:0]; [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + dst_rows*4/*sizeof(ushort2)*/, 16) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne01 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, n_as) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
} else { } else {
int nth0 = 32; int nth0 = 32;
int nth1 = 1; int nth1 = 1;
@ -1926,7 +1968,12 @@ static enum ggml_status ggml_metal_graph_compute(
{ {
nth0 = 4; nth0 = 4;
nth1 = 16; nth1 = 16;
#if QK_K == 64
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
#else
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
#endif
} break; } break;
default: default:
{ {
@ -1939,72 +1986,72 @@ static enum ggml_status ggml_metal_graph_compute(
GGML_ASSERT(ne00 >= nth0*nth1); GGML_ASSERT(ne00 >= nth0*nth1);
} }
const int64_t _ne1 = 1; // kernels needs a reference in constant memory
[encoder setComputePipelineState:pipeline]; [encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:3]; [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3];
[encoder setBytes:&nb21 length:sizeof(nb21) atIndex:4]; [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:5]; [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:6]; [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:7]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:7];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:8]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:8];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:9]; [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:9];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:10]; [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:10];
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11]; [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:11];
[encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:12]; [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:12];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:13];
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:14];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:15];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:16];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:17];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:18]; [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:18];
[encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:19]; [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:19];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:20]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:20];
[encoder setBytes:&r2 length:sizeof(r2) atIndex:21]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:21];
[encoder setBytes:&r3 length:sizeof(r3) atIndex:22]; [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:22];
[encoder setBytes:&idx length:sizeof(idx) atIndex:23];
const int64_t _ne1 = 1;
const int tgz = dst_rows;
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 ||
src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K ||
src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) { src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128; const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
[encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) { else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4; const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
[encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) { else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) {
const int mem_size = 32*sizeof(float); const int mem_size = 32*sizeof(float);
[encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_Q4_K) { else if (src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_Q3_K) { else if (src0t == GGML_TYPE_Q3_K) {
#ifdef GGML_QKK_64 #ifdef GGML_QKK_64
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#else #else
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#endif #endif
} }
else if (src0t == GGML_TYPE_Q5_K) { else if (src0t == GGML_TYPE_Q5_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_Q6_K) { else if (src0t == GGML_TYPE_Q6_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else { } else {
const int64_t ny = (_ne1 + nrows - 1)/nrows; const int64_t ny = (_ne1 + nrows - 1)/nrows; // = _ne1
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
} }
} break; } break;
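
Editor's note: the Metal MUL_MAT_ID path now sizes its work by dst_rows = ne20 * ne21 (used experts per token times number of rows) instead of by ne11, switches to the matrix-matrix kernel only when that row count exceeds the break-even point n_as, and reserves threadgroup memory of 8192 bytes plus one ushort2 (4 bytes) per dst row, which is why dst_rows is capped at 2048. A tiny arithmetic sketch of those sizes as they appear in the diff (illustrative only):

#include <cstddef>
#include <cstdint>

struct mmid_sizing {
    int64_t dst_rows;      // total rows the id-matmul will produce
    bool    use_mm_kernel; // matrix-matrix kernel only past the break-even point
    size_t  tg_mem_bytes;  // threadgroup memory: 8192 + one ushort2 (4 bytes) per row, padded to 16
};

static mmid_sizing plan_mul_mat_id(int64_t n_used_experts /*ne20*/, int64_t n_rows /*ne21*/,
                                   int64_t n_experts /*n_as*/) {
    mmid_sizing s;
    s.dst_rows      = n_used_experts * n_rows;
    s.use_mm_kernel = s.dst_rows > n_experts;
    const size_t pad = 16; // GGML_PAD(x, 16)
    s.tg_mem_bytes  = ((8192 + (size_t) s.dst_rows * 4) + pad - 1) / pad * pad;
    return s;
}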

File diff suppressed because it is too large


@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
} }
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
#if defined(__AVXVNNI__) || defined(__AVX512VNNI__) #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
const __m256i zero = _mm256_setzero_si256(); const __m256i zero = _mm256_setzero_si256();
const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
return _mm256_cvtepi32_ps(summed_pairs); return _mm256_cvtepi32_ps(summed_pairs);


@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
#define SYCL_SCALE_BLOCK_SIZE 256 #define SYCL_SCALE_BLOCK_SIZE 256
#define SYCL_CLAMP_BLOCK_SIZE 256 #define SYCL_CLAMP_BLOCK_SIZE 256
#define SYCL_ROPE_BLOCK_SIZE 256 #define SYCL_ROPE_BLOCK_SIZE 256
#define SYCL_SOFT_MAX_BLOCK_SIZE 1024
#define SYCL_ALIBI_BLOCK_SIZE 32 #define SYCL_ALIBI_BLOCK_SIZE 32
#define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32 #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
#define SYCL_QUANTIZE_BLOCK_SIZE 256 #define SYCL_QUANTIZE_BLOCK_SIZE 256
@ -13080,11 +13079,13 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
const int nrows_y, const float scale, const float max_bias, const int nrows_y, const float scale, const float max_bias,
dpct::queue_ptr stream) { dpct::queue_ptr stream) {
int nth = WARP_SIZE; int nth = WARP_SIZE;
while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2; int max_block_size = g_work_group_size;
while (nth < ncols_x && nth < max_block_size) nth *= 2;
if (nth>max_block_size) nth = max_block_size;
const sycl::range<3> block_dims(1, 1, nth); const sycl::range<3> block_dims(1, 1, nth);
const sycl::range<3> block_nums(1, 1, nrows_x); const sycl::range<3> block_nums(1, 1, nrows_x);
const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE); const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
const uint32_t n_head_kv = nrows_x/nrows_y; const uint32_t n_head_kv = nrows_x/nrows_y;
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
@ -13094,6 +13095,12 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>(); const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
if (n_local_scratch*sizeof(float) < local_mem_size) { if (n_local_scratch*sizeof(float) < local_mem_size) {
if (ncols_x > max_block_size) {
soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
return;
}
switch (ncols_x) { switch (ncols_x) {
case 32: case 32:
soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale, soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
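
Editor's note: the SYCL soft-max no longer hard-codes a 1024 block size: nth grows by powers of two until it covers the row or hits the device work-group limit, and rows wider than that limit fall back to the generic (non-templated) submitter. A short C++ sketch of the block-size selection; max_block_size stands in for the queried device limit, and the names are illustrative:

// Pick the number of work-items per row: the smallest power of two that covers
// ncols, but never more than the device's maximum work-group size.
static int pick_block_size(int ncols, int warp_size, int max_block_size) {
    int nth = warp_size;
    while (nth < ncols && nth < max_block_size) {
        nth *= 2;
    }
    if (nth > max_block_size) {
        nth = max_block_size;
    }
    return nth;
}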
@ -15989,73 +15996,76 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
const ggml_tensor *src1, const ggml_tensor *src1,
ggml_tensor *dst) try { ggml_tensor *dst) try {
#if 0 GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT &&
ggml_sycl_mul_mat_id_sycl(dst); "mul_mat_id does not support split buffers");
// TODO: mmq/mmv support const ggml_tensor *ids = dst->src[2];
#endif
const int64_t nb11 = src1->nb[1];
const int64_t nb1 = dst->nb[1];
const struct ggml_tensor * ids = src0;
const int32_t id = ((int32_t *) dst->op_params)[0];
const int32_t n_as = ((int32_t *) dst->op_params)[1];
std::vector<char> ids_host(ggml_nbytes(ids));
const dpct::queue_ptr stream = g_syclStreams[g_main_device][0]; const dpct::queue_ptr stream = g_syclStreams[g_main_device][0];
if (ids->backend == GGML_BACKEND_TYPE_GPU) { const size_t nb11 = src1->nb[1];
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; const size_t nb1 = dst->nb[1];
const int32_t id = ((int32_t *)dst->op_params)[0];
const int32_t n_as = src0->ne[2];
std::vector<char> ids_host(ggml_nbytes(ids));
const char *ids_dev = (const char *)ids->data;
SYCL_CHECK(CHECK_TRY_ERROR( SYCL_CHECK(CHECK_TRY_ERROR(
stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)).wait())); stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
// SYCL_CHECK(CHECK_TRY_ERROR(stream->wait())); SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
} else {
memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
}
const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra; const ggml_tensor_extra_gpu *src0_extra =
const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra; (const ggml_tensor_extra_gpu *)src0->extra;
const ggml_tensor_extra_gpu *src1_extra =
(const ggml_tensor_extra_gpu *)src1->extra;
const ggml_tensor_extra_gpu *dst_extra =
(const ggml_tensor_extra_gpu *)dst->extra;
ggml_tensor_extra_gpu src0_row_extra;
ggml_tensor_extra_gpu src1_row_extra; ggml_tensor_extra_gpu src1_row_extra;
ggml_tensor_extra_gpu dst_row_extra; ggml_tensor_extra_gpu dst_row_extra;
ggml_tensor src0_row = *src0;
ggml_tensor src1_row = *src1; ggml_tensor src1_row = *src1;
ggml_tensor dst_row = *dst; ggml_tensor dst_row = *dst;
src1_row.backend = GGML_BACKEND_TYPE_GPU; src1_row.backend = GGML_BACKEND_TYPE_GPU;
dst_row.backend = GGML_BACKEND_TYPE_GPU; dst_row.backend = GGML_BACKEND_TYPE_GPU;
src0_row.extra = &src0_row_extra;
src1_row.extra = &src1_row_extra; src1_row.extra = &src1_row_extra;
dst_row.extra = &dst_row_extra; dst_row.extra = &dst_row_extra;
char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ? char *src0_original = src1->backend == GGML_BACKEND_TYPE_CPU
(char *) src1->data : (char *) src1_extra->data_device[g_main_device]; ? (char *)src0->data
char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ? : (char *)src0_extra->data_device[g_main_device];
(char *) dst->data : (char *) dst_extra->data_device[g_main_device]; char *src1_original = src1->backend == GGML_BACKEND_TYPE_CPU
? (char *)src1->data
: (char *)src1_extra->data_device[g_main_device];
char *dst_original = dst->backend == GGML_BACKEND_TYPE_CPU
? (char *)dst->data
: (char *)dst_extra->data_device[g_main_device];
src0_row.ne[2] = 1;
src0_row.ne[3] = 1;
src0_row.nb[3] = src0->nb[2];
if (src1->ne[1] == 1) { if (src1->ne[1] == 1) {
GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
//int32_t row_id; const int32_t row_id =
//SYCL_CHECK(syclMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), syclMemcpyDeviceToHost, g_syclStreams[g_main_device][0])); *(const int32_t *)(ids_host.data() + i01 * ids->nb[1] +
//SYCL_CHECK(syclStreamSynchronize(g_syclStreams[g_main_device][0])); id * ids->nb[0]);
const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(row_id >= 0 && row_id < n_as); GGML_ASSERT(row_id >= 0 && row_id < n_as);
const struct ggml_tensor * src0_row = dst->src[row_id + 2]; src0_row_extra.data_device[g_main_device] =
src0_original + row_id * src0->nb[2];
src1_row_extra.data_device[g_main_device] =
src1_original + i01 * src1->nb[1];
dst_row_extra.data_device[g_main_device] =
dst_original + i01 * dst->nb[1];
src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1]; ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
} }
} else { } else {
sycl_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1)); sycl_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
@ -16065,8 +16075,6 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
dst_row_extra.data_device[g_main_device] = dst_contiguous.get(); dst_row_extra.data_device[g_main_device] = dst_contiguous.get();
for (int32_t row_id = 0; row_id < n_as; ++row_id) { for (int32_t row_id = 0; row_id < n_as; ++row_id) {
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
int64_t num_src1_rows = 0; int64_t num_src1_rows = 0;
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
@ -16079,7 +16087,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
SYCL_CHECK(CHECK_TRY_ERROR( SYCL_CHECK(CHECK_TRY_ERROR(
stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11, stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11,
src1_original + i01 * nb11, nb11).wait())); src1_original + i01 * nb11, nb11)));
num_src1_rows++; num_src1_rows++;
} }
@ -16087,6 +16095,9 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
continue; continue;
} }
src0_row_extra.data_device[g_main_device] =
src0_original + row_id * src0->nb[2];
src1_row.ne[1] = num_src1_rows; src1_row.ne[1] = num_src1_rows;
dst_row.ne[1] = num_src1_rows; dst_row.ne[1] = num_src1_rows;
@ -16098,7 +16109,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
dst_row.nb[2] = num_src1_rows*nb1; dst_row.nb[2] = num_src1_rows*nb1;
dst_row.nb[3] = num_src1_rows*nb1; dst_row.nb[3] = num_src1_rows*nb1;
ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row); ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
num_src1_rows = 0; num_src1_rows = 0;
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
@ -16112,7 +16123,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy( SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
dst_original + i01 * nb1, dst_original + i01 * nb1,
dst_contiguous.get() + num_src1_rows * nb1, nb1).wait())); dst_contiguous.get() + num_src1_rows * nb1, nb1)));
num_src1_rows++; num_src1_rows++;
} }
} }
@ -16814,11 +16825,13 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
const dpct::queue_ptr stream = g_syclStreams[ctx->device][0]; const dpct::queue_ptr stream = g_syclStreams[ctx->device][0];
SYCL_CHECK( SYCL_CHECK(
CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw())); CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
char* host_buf = (char*)malloc(size);
memcpy(host_buf, data, size);
SYCL_CHECK( SYCL_CHECK(
CHECK_TRY_ERROR((*stream) CHECK_TRY_ERROR((*stream)
.memcpy((char *)tensor->data + offset, data, size) .memcpy((char *)tensor->data + offset, host_buf, size)
.wait())); .wait()));
free(host_buf);
} }
catch (sycl::exception const &exc) { catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@ -17739,7 +17752,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) { GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
const int min_batch_size = 32; const int min_batch_size = 32;
return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS; return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID;
GGML_UNUSED(backend); GGML_UNUSED(backend);
} }
ggml.c
View file
@ -4,6 +4,7 @@
#include "ggml-impl.h" #include "ggml-impl.h"
#include "ggml-quants.h" #include "ggml-quants.h"
#include "ggml.h" #include "ggml.h"
#include "sgemm.h"
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW #include <malloc.h> // using malloc.h with MSC/MINGW
@ -32,6 +33,10 @@
#include <unistd.h> #include <unistd.h>
#endif #endif
#ifdef __ARM_FEATURE_MATMUL_INT8
#undef GGML_USE_LLAMAFILE
#endif
#if defined(_MSC_VER) #if defined(_MSC_VER)
// disable "possible loss of data" to avoid hundreds of casts // disable "possible loss of data" to avoid hundreds of casts
// we should just be careful :) // we should just be careful :)
@ -4573,21 +4578,32 @@ void ggml_mul_mat_set_prec(
// ggml_mul_mat_id // ggml_mul_mat_id
// NOTE: id will be removed in the future and instead all the experts listed in ids will be computed /*
// this will allow computing all the used experts in a single matrix multiplication c = ggml_mul_mat_id(ctx, as, b, ids);
as -> [cols, rows, n_expert]
ids -> [n_experts_used, n_tokens] (i32)
b -> [cols, n_expert_used, n_tokens]
c -> [cols, n_expert_used, n_tokens]
in b, n_experts_used can be broadcasted to match the n_expert_used of ids
c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
*/
struct ggml_tensor * ggml_mul_mat_id( struct ggml_tensor * ggml_mul_mat_id(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * as, struct ggml_tensor * as,
struct ggml_tensor * ids, struct ggml_tensor * b,
int id, struct ggml_tensor * ids) {
struct ggml_tensor * b) { GGML_ASSERT(!ggml_is_transposed(as));
GGML_ASSERT(ids->type == GGML_TYPE_I32); GGML_ASSERT(ids->type == GGML_TYPE_I32);
GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert)
GGML_ASSERT(b->ne[3] == 1); // b is 3d
GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast
bool is_node = false; bool is_node = false;
@ -4595,11 +4611,9 @@ struct ggml_tensor * ggml_mul_mat_id(
is_node = true; is_node = true;
} }
const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] }; const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
ggml_set_op_params_i32(result, 0, id);
result->op = GGML_OP_MUL_MAT_ID; result->op = GGML_OP_MUL_MAT_ID;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = as; result->src[0] = as;
@ -10810,6 +10824,28 @@ static void ggml_compute_forward_mul_mat(
} }
#endif #endif
#if GGML_USE_LLAMAFILE
if (nb10 == ggml_type_size(src1->type)) {
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(src0->type),
(const char *)src1->data + i12*nb12 + i13*nb13,
nb11/ggml_type_size(src1->type),
(char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type),
ith, nth,
params->type,
src0->type,
src1->type,
dst->type))
goto UseGgmlGemm1;
return;
}
UseGgmlGemm1:;
#endif
if (params->type == GGML_TASK_TYPE_INIT) { if (params->type == GGML_TASK_TYPE_INIT) {
if (ith != 0) { if (ith != 0) {
return; return;
@ -10841,6 +10877,30 @@ static void ggml_compute_forward_mul_mat(
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10); const size_t row_size = ggml_row_size(vec_dot_type, ne10);
#if GGML_USE_LLAMAFILE
if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) {
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(src0->type),
(const char *)wdata + ggml_row_size(vec_dot_type,
nb12/ggml_type_size(src1->type)*i12 +
nb13/ggml_type_size(src1->type)*i13),
row_size/ggml_type_size(vec_dot_type),
(char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type),
ith, nth,
params->type,
src0->type,
vec_dot_type,
dst->type))
goto UseGgmlGemm2;
return;
}
UseGgmlGemm2:;
#endif
const int64_t nr0 = ne01; // src0 rows const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = ne1*ne12*ne13; // src1 rows const int64_t nr1 = ne1*ne12*ne13; // src1 rows
@ -10958,11 +11018,6 @@ static void ggml_compute_forward_mul_mat_id(
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);
// we don't support permuted src0 or src1 // we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type)); GGML_ASSERT(nb00 == ggml_type_size(type));
GGML_ASSERT(nb10 == ggml_type_size(src1->type)); GGML_ASSERT(nb10 == ggml_type_size(src1->type));
@ -10973,22 +11028,21 @@ static void ggml_compute_forward_mul_mat_id(
GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3); GGML_ASSERT(nb2 <= nb3);
// broadcast is not supported with mmid
assert(ne12 == 1);
assert(ne13 == 1);
// row groups // row groups
const int id = ggml_get_op_params_i32(dst, 0); const int n_ids = ids->ne[0]; // n_expert_used
const int n_as = src0->ne[2]; const int n_as = ne02; // n_expert
char * wdata_src1_end = (src1->type == vec_dot_type) ? char * wdata_src1_end = (src1->type == vec_dot_type) ?
(char *) params->wdata : (char *) params->wdata :
(char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t)); (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] struct mmid_row_mapping {
int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11] int32_t i1;
int32_t i2;
};
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
if (params->type == GGML_TASK_TYPE_INIT) { if (params->type == GGML_TASK_TYPE_INIT) {
if (ith != 0) { if (ith != 0) {
@ -11012,16 +11066,20 @@ static void ggml_compute_forward_mul_mat_id(
} }
// initialize matrix_row_counts // initialize matrix_row_counts
GGML_ASSERT(wdata == wdata_src1_end);
memset(matrix_row_counts, 0, n_as*sizeof(int64_t)); memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
// group rows by src0 matrix #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(row_id >= 0 && row_id < n_as); // group rows by src0 matrix
MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01; for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
matrix_row_counts[row_id] += 1; for (int id = 0; id < n_ids; ++id) {
const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
assert(i02 >= 0 && i02 < n_as);
MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
matrix_row_counts[i02] += 1;
}
} }
return; return;
@ -11039,15 +11097,13 @@ static void ggml_compute_forward_mul_mat_id(
continue; continue;
} }
size_t src0_offset = cur_a*src0->nb[2]; const char * src0_cur = (const char *) src0->data + cur_a*nb02;
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10); const size_t row_size = ggml_row_size(vec_dot_type, ne10);
const int64_t nr0 = ne01; // src0 rows const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = cne1*ne12*ne13; // src1 rows const int64_t nr1 = cne1; // src1 rows
//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
// distribute the thread work across the inner or outer loop based on which one is larger // distribute the thread work across the inner or outer loop based on which one is larger
@ -11066,13 +11122,11 @@ static void ggml_compute_forward_mul_mat_id(
const int64_t ir110 = dr1*ith1; const int64_t ir110 = dr1*ith1;
const int64_t ir111 = MIN(ir110 + dr1, nr1); const int64_t ir111 = MIN(ir110 + dr1, nr1);
//printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
// threads with no work simply yield (not sure if it helps) // threads with no work simply yield (not sure if it helps)
if (ir010 >= ir011 || ir110 >= ir111) { //if (ir010 >= ir011 || ir110 >= ir111) {
sched_yield(); // sched_yield();
continue; // continue;
} //}
// block-tiling attempt // block-tiling attempt
const int64_t blck_0 = 16; const int64_t blck_0 = 16;
@ -11084,20 +11138,16 @@ static void ggml_compute_forward_mul_mat_id(
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix const int64_t _i12 = ir1; // logical row index for this expert
const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
// broadcast src0 into src1 struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
//const int64_t i03 = i13/r3; const int id = row_mapping.i1; // selected expert index
//const int64_t i02 = i12/r2;
const int64_t i1 = i11; const int64_t i11 = id % ne11;
const int64_t i2 = i12; const int64_t i12 = row_mapping.i2; // row index in src1
const int64_t i3 = i13;
const char * src0_row = (const char *) src0->data + src0_offset; const int64_t i1 = id; // selected expert index
const int64_t i2 = i12; // row
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@ -11105,18 +11155,19 @@ static void ggml_compute_forward_mul_mat_id(
// TODO: this is a bit of a hack, we should probably have a better way to handle this // TODO: this is a bit of a hack, we should probably have a better way to handle this
const char * src1_col = (const char *) wdata + const char * src1_col = (const char *) wdata +
(src1_cont || src1->type != vec_dot_type (src1_cont || src1->type != vec_dot_type
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size ? (i11 + i12*ne11)*row_size
: (i11*nb11 + i12*nb12 + i13*nb13)); : (i11*nb11 + i12*nb12));
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
//} //}
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1); vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
} }
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
} }
} }
@ -18462,7 +18513,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
const int n_as = src0->ne[2]; const int n_as = src0->ne[2];
cur += GGML_PAD(cur, sizeof(int64_t)); // align cur += GGML_PAD(cur, sizeof(int64_t)); // align
cur += n_as * sizeof(int64_t); // matrix_row_counts cur += n_as * sizeof(int64_t); // matrix_row_counts
cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
} break; } break;
case GGML_OP_OUT_PROD: case GGML_OP_OUT_PROD:
{ {
@ -20888,12 +20939,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
ok = ok && cur != NULL; ok = ok && cur != NULL;
ggml_set_name(cur, ctx->infos[i].name.data);
if (!ok) { if (!ok) {
break; break;
} }
ggml_set_name(cur, ctx->infos[i].name.data);
// point the data member to the appropriate location in the binary blob using the tensor infos // point the data member to the appropriate location in the binary blob using the tensor infos
if (!params.no_alloc) { if (!params.no_alloc) {
//cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
ggml.h
View file
@ -1161,13 +1161,11 @@ extern "C" {
enum ggml_prec prec); enum ggml_prec prec);
// indirect matrix multiplication // indirect matrix multiplication
// ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
GGML_API struct ggml_tensor * ggml_mul_mat_id( GGML_API struct ggml_tensor * ggml_mul_mat_id(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * as, struct ggml_tensor * as,
struct ggml_tensor * ids, struct ggml_tensor * b,
int id, struct ggml_tensor * ids);
struct ggml_tensor * b);
// A: m columns, n rows, // A: m columns, n rows,
// B: p columns, n rows, // B: p columns, n rows,
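For readers following the `ggml_mul_mat_id` signature change above, here is a minimal NumPy sketch of the semantics described in the new ggml.c comment (illustrative only, not part of this change; arrays are written row-major, i.e. with axes reversed relative to ggml's `ne[]`, and all names are hypothetical):

```
import numpy as np

def mul_mat_id_ref(as_, b, ids):
    # as_ : (n_expert, rows, cols)      one weight matrix per expert
    # ids : (n_tokens, n_expert_used)   int32 expert indices per token
    # b   : (n_tokens, n_expert_used_b, cols), n_expert_used_b is 1 or n_expert_used
    # returns c : (n_tokens, n_expert_used, rows)
    n_tokens, n_expert_used = ids.shape
    rows = as_.shape[1]
    c = np.zeros((n_tokens, n_expert_used, rows), dtype=np.float32)
    for t in range(n_tokens):
        for e in range(n_expert_used):
            i = ids[t, e]                 # expert selected for this (token, slot)
            x = b[t, e % b.shape[1]]      # broadcast b when it carries a single row per token
            c[t, e] = as_[i] @ x          # (rows, cols) @ (cols,) -> (rows,)
    return c
```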
View file
@ -21,6 +21,8 @@ pip install gguf
[scripts/gguf-convert-endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-convert-endian.py) — Allows converting the endianness of GGUF files. [scripts/gguf-convert-endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-convert-endian.py) — Allows converting the endianness of GGUF files.
[scripts/gguf-new-metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-new-metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
## Development ## Development
Maintainers who participate in development of this package are advised to install it in editable mode: Maintainers who participate in development of this package are advised to install it in editable mode:
View file
@ -90,6 +90,13 @@ class Keys:
HF_JSON = "tokenizer.huggingface.json" HF_JSON = "tokenizer.huggingface.json"
RWKV = "tokenizer.rwkv.world" RWKV = "tokenizer.rwkv.world"
CHAT_TEMPLATE = "tokenizer.chat_template" CHAT_TEMPLATE = "tokenizer.chat_template"
CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
CHAT_TEMPLATES = "tokenizer.chat_templates"
# FIM/Infill special tokens constants
PREFIX_ID = "tokenizer.ggml.prefix_token_id"
SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
MIDDLE_ID = "tokenizer.ggml.middle_token_id"
EOT_ID = "tokenizer.ggml.eot_token_id"
# #
@ -115,6 +122,7 @@ class MODEL_ARCH(IntEnum):
STABLELM = auto() STABLELM = auto()
QWEN = auto() QWEN = auto()
QWEN2 = auto() QWEN2 = auto()
QWEN2MOE = auto()
PHI2 = auto() PHI2 = auto()
PLAMO = auto() PLAMO = auto()
CODESHELL = auto() CODESHELL = auto()
@ -126,6 +134,8 @@ class MODEL_ARCH(IntEnum):
MAMBA = auto() MAMBA = auto()
XVERSE = auto() XVERSE = auto()
COMMAND_R = auto() COMMAND_R = auto()
DBRX = auto()
OLMO = auto()
class MODEL_TENSOR(IntEnum): class MODEL_TENSOR(IntEnum):
@ -146,6 +156,7 @@ class MODEL_TENSOR(IntEnum):
ATTN_OUT_NORM = auto() ATTN_OUT_NORM = auto()
ATTN_ROT_EMBD = auto() ATTN_ROT_EMBD = auto()
FFN_GATE_INP = auto() FFN_GATE_INP = auto()
FFN_GATE_INP_SHEXP = auto()
FFN_NORM = auto() FFN_NORM = auto()
FFN_GATE = auto() FFN_GATE = auto()
FFN_DOWN = auto() FFN_DOWN = auto()
@ -154,6 +165,9 @@ class MODEL_TENSOR(IntEnum):
FFN_GATE_EXP = auto() FFN_GATE_EXP = auto()
FFN_DOWN_EXP = auto() FFN_DOWN_EXP = auto()
FFN_UP_EXP = auto() FFN_UP_EXP = auto()
FFN_GATE_SHEXP = auto()
FFN_DOWN_SHEXP = auto()
FFN_UP_SHEXP = auto()
ATTN_Q_NORM = auto() ATTN_Q_NORM = auto()
ATTN_K_NORM = auto() ATTN_K_NORM = auto()
LAYER_OUT_NORM = auto() LAYER_OUT_NORM = auto()
@ -184,6 +198,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.STABLELM: "stablelm",
MODEL_ARCH.QWEN: "qwen", MODEL_ARCH.QWEN: "qwen",
MODEL_ARCH.QWEN2: "qwen2", MODEL_ARCH.QWEN2: "qwen2",
MODEL_ARCH.QWEN2MOE: "qwen2moe",
MODEL_ARCH.PHI2: "phi2", MODEL_ARCH.PHI2: "phi2",
MODEL_ARCH.PLAMO: "plamo", MODEL_ARCH.PLAMO: "plamo",
MODEL_ARCH.CODESHELL: "codeshell", MODEL_ARCH.CODESHELL: "codeshell",
@ -195,6 +210,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.MAMBA: "mamba",
MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.XVERSE: "xverse",
MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.COMMAND_R: "command-r",
MODEL_ARCH.DBRX: "dbrx",
MODEL_ARCH.OLMO: "olmo",
} }
TENSOR_NAMES: dict[MODEL_TENSOR, str] = { TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -217,10 +234,14 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp",
MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
@ -438,6 +459,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
], ],
MODEL_ARCH.QWEN: [ MODEL_ARCH.QWEN: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
@ -467,6 +490,25 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
], ],
MODEL_ARCH.QWEN2MOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_INP_SHEXP,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
],
MODEL_ARCH.PLAMO: [ MODEL_ARCH.PLAMO: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
@ -642,6 +684,30 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ATTN_K_NORM, MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_Q_NORM, MODEL_TENSOR.ATTN_Q_NORM,
], ],
MODEL_ARCH.DBRX: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_OUT_NORM,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.OLMO: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
# TODO # TODO
} }
@ -870,3 +936,7 @@ KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID
KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
View file
@ -6,7 +6,8 @@ import struct
import tempfile import tempfile
from enum import Enum, auto from enum import Enum, auto
from io import BufferedWriter from io import BufferedWriter
from typing import IO, Any, Sequence from typing import IO, Any, Sequence, Mapping
from string import ascii_letters, digits
import numpy as np import numpy as np
@ -466,9 +467,47 @@ class GGUFWriter:
def add_add_space_prefix(self, value: bool) -> None: def add_add_space_prefix(self, value: bool) -> None:
self.add_bool(Keys.Tokenizer.ADD_PREFIX, value) self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
def add_chat_template(self, value: str) -> None: def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
if isinstance(value, list):
template_default = None
template_names = set()
for choice in value:
name = choice.get('name', '')
template = choice.get('template')
# Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it
name = ''.join((c if c in ascii_letters + digits else '_' for c in name))
if name and template is not None:
if name == 'default':
template_default = template
else:
template_names.add(name)
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE_N.format(name=name), template)
if template_names:
self.add_array(Keys.Tokenizer.CHAT_TEMPLATES, list(template_names))
if template_default is None:
return
value = template_default
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
def add_prefix_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.PREFIX_ID, id)
def add_suffix_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id)
def add_middle_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id)
def add_eot_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.EOT_ID, id)
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
pack_prefix = '' pack_prefix = ''
if not skip_pack_prefix: if not skip_pack_prefix:
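A minimal usage sketch (hypothetical file name and placeholder template strings) for the extended `add_chat_template()` above, which now also accepts a list of named templates:

```
import gguf

writer = gguf.GGUFWriter("out.gguf", arch="llama")
# The 'default' entry goes to tokenizer.chat_template; every other named entry
# is written as tokenizer.chat_template.<name>, and the non-default names are
# collected in the tokenizer.chat_templates array.
writer.add_chat_template([
    {"name": "default",  "template": "{{ messages }}"},
    {"name": "tool_use", "template": "{{ messages }}"},
])
# Plain strings still work as before:
# writer.add_chat_template("{{ messages }}")
```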
View file
@ -10,7 +10,7 @@ class TensorNameMap:
# Token embeddings # Token embeddings
MODEL_TENSOR.TOKEN_EMBD: ( MODEL_TENSOR.TOKEN_EMBD: (
"gpt_neox.embed_in", # gptneox "gpt_neox.embed_in", # gptneox
"transformer.wte", # gpt2 gpt-j mpt refact qwen "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx
"transformer.word_embeddings", # falcon "transformer.word_embeddings", # falcon
"word_embeddings", # bloom "word_embeddings", # bloom
"model.embed_tokens", # llama-hf "model.embed_tokens", # llama-hf
@ -48,7 +48,7 @@ class TensorNameMap:
# Output # Output
MODEL_TENSOR.OUTPUT: ( MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox "embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx
"output", # llama-pth bloom internlm2 "output", # llama-pth bloom internlm2
"word_embeddings_for_head", # persimmon "word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2 "lm_head.linear", # phi2
@ -60,7 +60,7 @@ class TensorNameMap:
"transformer.ln_f", # gpt2 gpt-j falcon "transformer.ln_f", # gpt2 gpt-j falcon
"model.norm", # llama-hf baichuan internlm2 "model.norm", # llama-hf baichuan internlm2
"norm", # llama-pth "norm", # llama-pth
"transformer.norm_f", # mpt "transformer.norm_f", # mpt dbrx
"ln_f", # refact bloom qwen gpt2 "ln_f", # refact bloom qwen gpt2
"language_model.encoder.final_layernorm", # persimmon "language_model.encoder.final_layernorm", # persimmon
"model.final_layernorm", # persimmon "model.final_layernorm", # persimmon
@ -96,6 +96,7 @@ class TensorNameMap:
"model.layers.{bid}.norm", # mamba-qbert "model.layers.{bid}.norm", # mamba-qbert
"backbone.layers.{bid}.norm", # mamba "backbone.layers.{bid}.norm", # mamba
"transformer.decoder_layer.{bid}.rms_norm", # Grok "transformer.decoder_layer.{bid}.rms_norm", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
), ),
# Attention norm 2 # Attention norm 2
@ -108,6 +109,7 @@ class TensorNameMap:
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
"transformer.h.{bid}.attn.c_attn", # gpt2 qwen "transformer.h.{bid}.attn.c_attn", # gpt2 qwen
"transformer.blocks.{bid}.attn.Wqkv", # mpt "transformer.blocks.{bid}.attn.Wqkv", # mpt
"transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", # dbrx
"transformer.h.{bid}.self_attention.query_key_value", # falcon "transformer.h.{bid}.self_attention.query_key_value", # falcon
"h.{bid}.self_attention.query_key_value", # bloom "h.{bid}.self_attention.query_key_value", # bloom
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
@ -168,7 +170,8 @@ class TensorNameMap:
"model.layers.layers.{bid}.self_attn.o_proj", # plamo "model.layers.layers.{bid}.self_attn.o_proj", # plamo
"model.layers.{bid}.attention.wo", # internlm2 "model.layers.{bid}.attention.wo", # internlm2
"encoder.layers.{bid}.attn.out_proj", # nomic-bert "encoder.layers.{bid}.attn.out_proj", # nomic-bert
"transformer.decoder_layer.{bid}.multi_head_attention.linear"# Grok "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
"transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
), ),
# Attention output norm # Attention output norm
@ -176,6 +179,7 @@ class TensorNameMap:
"encoder.layer.{bid}.attention.output.LayerNorm", # bert "encoder.layer.{bid}.attention.output.LayerNorm", # bert
"encoder.layers.{bid}.norm1", # nomic-bert "encoder.layers.{bid}.norm1", # nomic-bert
"transformer.decoder_layer.{bid}.rms_norm_1", # Grok "transformer.decoder_layer.{bid}.rms_norm_1", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
), ),
# Rotary embeddings # Rotary embeddings
@ -204,7 +208,13 @@ class TensorNameMap:
MODEL_TENSOR.FFN_GATE_INP: ( MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", # mixtral "layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral "model.layers.{bid}.block_sparse_moe.gate", # mixtral
"transformer.decoder_layer.{bid}.router" # Grok "model.layers.{bid}.mlp.gate", # qwen2moe
"transformer.decoder_layer.{bid}.router", # Grok
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
),
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
"model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe
), ),
# Feed-forward up # Feed-forward up
@ -233,6 +243,12 @@ class TensorNameMap:
MODEL_TENSOR.FFN_UP_EXP: ( MODEL_TENSOR.FFN_UP_EXP: (
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged) "layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe (merged)
),
MODEL_TENSOR.FFN_UP_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
), ),
# AWQ-activation gate # AWQ-activation gate
@ -252,7 +268,13 @@ class TensorNameMap:
MODEL_TENSOR.FFN_GATE_EXP: ( MODEL_TENSOR.FFN_GATE_EXP: (
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged) "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear" # Grok (merged) "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
"model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe (merged)
),
MODEL_TENSOR.FFN_GATE_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
), ),
# Feed-forward down # Feed-forward down
@ -280,6 +302,12 @@ class TensorNameMap:
MODEL_TENSOR.FFN_DOWN_EXP: ( MODEL_TENSOR.FFN_DOWN_EXP: (
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged) "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
"model.layers.{bid}.mlp.experts.down_proj", # qwen2moe (merged)
),
MODEL_TENSOR.FFN_DOWN_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
), ),
MODEL_TENSOR.ATTN_Q_NORM: ( MODEL_TENSOR.ATTN_Q_NORM: (
@ -358,7 +386,7 @@ class TensorNameMap:
if tensor not in MODEL_TENSORS[arch]: if tensor not in MODEL_TENSORS[arch]:
continue continue
# TODO: make this configurable # TODO: make this configurable
n_experts = 8 n_experts = 60
for xid in range(n_experts): for xid in range(n_experts):
tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid) tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
self.mapping[tensor_name] = (tensor, tensor_name) self.mapping[tensor_name] = (tensor, tensor_name)
View file
@ -141,7 +141,7 @@ class SpecialVocab:
with open(tokenizer_config_file, encoding = 'utf-8') as f: with open(tokenizer_config_file, encoding = 'utf-8') as f:
tokenizer_config = json.load(f) tokenizer_config = json.load(f)
chat_template = tokenizer_config.get('chat_template') chat_template = tokenizer_config.get('chat_template')
if chat_template is None or isinstance(chat_template, str): if chat_template is None or isinstance(chat_template, (str, list)):
self.chat_template = chat_template self.chat_template = chat_template
else: else:
print( print(
View file
@ -33,3 +33,4 @@ build-backend = "poetry.core.masonry.api"
gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint" gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint"
gguf-dump = "scripts:gguf_dump_entrypoint" gguf-dump = "scripts:gguf_dump_entrypoint"
gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint" gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint"
gguf-new-metadata = "scripts:gguf_new_metadata_entrypoint"
View file
@ -8,5 +8,6 @@ os.environ["NO_LOCAL_GGUF"] = "TRUE"
gguf_convert_endian_entrypoint = import_module("scripts.gguf-convert-endian").main gguf_convert_endian_entrypoint = import_module("scripts.gguf-convert-endian").main
gguf_dump_entrypoint = import_module("scripts.gguf-dump").main gguf_dump_entrypoint = import_module("scripts.gguf-dump").main
gguf_set_metadata_entrypoint = import_module("scripts.gguf-set-metadata").main gguf_set_metadata_entrypoint = import_module("scripts.gguf-set-metadata").main
gguf_new_metadata_entrypoint = import_module("scripts.gguf-new-metadata").main
del import_module, os del import_module, os
View file
@ -0,0 +1,190 @@
#!/usr/bin/env python3
import logging
import argparse
import os
import sys
import json
from pathlib import Path
import numpy as np
from typing import Any, Mapping, Sequence
# Necessary to load the local gguf package
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
sys.path.insert(0, str(Path(__file__).parent.parent))
import gguf
logger = logging.getLogger("gguf-new-metadata")
def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian:
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
# Host is little endian
host_endian = gguf.GGUFEndian.LITTLE
swapped_endian = gguf.GGUFEndian.BIG
else:
# Sorry PDP or other weird systems that don't use BE or LE.
host_endian = gguf.GGUFEndian.BIG
swapped_endian = gguf.GGUFEndian.LITTLE
if reader.byte_order == "S":
return swapped_endian
else:
return host_endian
def decode_field(field: gguf.ReaderField) -> Any:
if field and field.types:
main_type = field.types[0]
if main_type == gguf.GGUFValueType.ARRAY:
sub_type = field.types[-1]
if sub_type == gguf.GGUFValueType.STRING:
return [str(bytes(field.parts[idx]), encoding='utf8') for idx in field.data]
else:
return [pv for idx in field.data for pv in field.parts[idx].tolist()]
if main_type == gguf.GGUFValueType.STRING:
return str(bytes(field.parts[-1]), encoding='utf8')
else:
return field.parts[-1][0]
return None
def get_field_data(reader: gguf.GGUFReader, key: str) -> Any:
field = reader.get_field(key)
return decode_field(field)
def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: Mapping[str, str], remove_metadata: Sequence[str]) -> None:
for field in reader.fields.values():
# Suppress virtual fields and fields written by GGUFWriter
if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
logger.debug(f'Suppressing {field.name}')
continue
# Skip old chat templates if we have new ones
if field.name.startswith(gguf.Keys.Tokenizer.CHAT_TEMPLATE) and gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
logger.debug(f'Skipping {field.name}')
continue
if field.name in remove_metadata:
logger.debug(f'Removing {field.name}')
continue
old_val = decode_field(field)
val = new_metadata.get(field.name, old_val)
if field.name in new_metadata:
logger.debug(f'Modifying {field.name}: "{old_val}" -> "{val}"')
del new_metadata[field.name]
elif val is not None:
logger.debug(f'Copying {field.name}')
if val is not None:
writer.add_key(field.name)
writer.add_val(val, field.types[0])
if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
logger.debug('Adding chat template(s)')
writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE])
del new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE]
# TODO: Support other types than string?
for key, val in new_metadata.items():
logger.debug(f'Adding {key}: {val}')
writer.add_key(key)
writer.add_val(val, gguf.GGUFValueType.STRING)
for tensor in reader.tensors:
# Dimensions are written in reverse order, so flip them first
shape = np.flipud(tensor.shape)
writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_ti_data_to_file()
for tensor in reader.tensors:
writer.write_tensor_data(tensor.data)
writer.close()
def main() -> None:
parser = argparse.ArgumentParser(description="Make a copy of a GGUF file with new metadata")
parser.add_argument("input", type=Path, help="GGUF format model input filename")
parser.add_argument("output", type=Path, help="GGUF format model output filename")
parser.add_argument("--general-name", type=str, help="The models general.name")
parser.add_argument("--general-description", type=str, help="The models general.description")
parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)")
parser.add_argument("--chat-template-config", type=Path, help="Config file (tokenizer_config.json) containing chat template(s)")
parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model")
parser.add_argument("--force", action="store_true", help="Bypass warnings without confirmation")
parser.add_argument("--verbose", action="store_true", help="Increase output verbosity")
args = parser.parse_args(None if len(sys.argv) > 2 else ["--help"])
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
new_metadata = {}
remove_metadata = args.remove_metadata or []
if args.general_name:
new_metadata[gguf.Keys.General.NAME] = args.general_name
if args.general_description:
new_metadata[gguf.Keys.General.DESCRIPTION] = args.general_description
if args.chat_template:
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template
if args.chat_template_config:
with open(args.chat_template_config, 'r') as fp:
config = json.load(fp)
template = config.get('chat_template')
if template:
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = template
if remove_metadata:
logger.warning('*** Warning *** Warning *** Warning **')
logger.warning('* Most metadata is required for a fully functional GGUF file,')
logger.warning('* removing crucial metadata may result in a corrupt output file!')
if not args.force:
logger.warning('* Enter exactly YES if you are positive you want to proceed:')
response = input('YES, I am sure> ')
if response != 'YES':
logger.info("You didn't enter YES. Okay then, see ya!")
sys.exit(0)
logger.info(f'* Loading: {args.input}')
reader = gguf.GGUFReader(args.input, 'r')
arch = get_field_data(reader, gguf.Keys.General.ARCHITECTURE)
endianess = get_byteorder(reader)
if os.path.isfile(args.output) and not args.force:
logger.warning('*** Warning *** Warning *** Warning **')
logger.warning(f'* The "{args.output}" GGUF file already exists, it will be overwritten!')
logger.warning('* Enter exactly YES if you are positive you want to proceed:')
response = input('YES, I am sure> ')
if response != 'YES':
logger.info("You didn't enter YES. Okay then, see ya!")
sys.exit(0)
logger.info(f'* Writing: {args.output}')
writer = gguf.GGUFWriter(args.output, arch=arch, endianess=endianess)
alignment = get_field_data(reader, gguf.Keys.General.ALIGNMENT)
if alignment is not None:
logger.debug(f'Setting custom alignment: {alignment}')
writer.data_alignment = alignment
copy_with_new_metadata(reader, writer, new_metadata, remove_metadata)
if __name__ == '__main__':
main()
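As a usage note (file names here are placeholders, not from this change): the tool is exposed as the `gguf-new-metadata` entry point via pyproject.toml above, so a copy such as `gguf-new-metadata input.gguf output.gguf --chat-template-config tokenizer_config.json` replaces the chat template(s), `--remove-metadata <key>` drops a key, and `--force` skips the interactive YES confirmations.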
View file
@ -89,3 +89,13 @@ This guide provides a brief overview. Check out the GBNF files in this directory
``` ```
./main -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt' ./main -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt'
``` ```
## Troubleshooting
Grammars currently have performance gotchas (see https://github.com/ggerganov/llama.cpp/issues/4218).
### Efficient optional repetitions
A common pattern is to allow repetitions of a pattern `x` up to N times.
While semantically correct, the syntax `x? x? x? ... x?` (with N repetitions) results in extremely slow inference. Instead, you can write `(x (x (x ... (x)?...)?)?)?` (with N-deep nesting); for example, for N=3, `x? x? x?` becomes `(x (x (x)?)?)?`.
llama.cpp
File diff suppressed because it is too large
View file
@ -12,19 +12,7 @@ bench_args="${@:3}"
rm -f llama-bench.sqlite rm -f llama-bench.sqlite
backend="cpu" # to test a backend, call the script with the corresponding environment variable (e.g. LLAMA_CUDA=1 ./scripts/compare-commits.sh ...)
if [[ "$OSTYPE" == "darwin"* ]]; then
backend="metal"
elif command -v nvcc &> /dev/null; then
backend="cuda"
fi
make_opts=""
if [[ "$backend" == "cuda" ]]; then
make_opts="LLAMA_CUDA=1"
fi
git checkout $1 git checkout $1
make clean && make -j32 $make_opts llama-bench make clean && make -j32 $make_opts llama-bench
View file
@ -1,10 +1,11 @@
#!/bin/bash #!/bin/bash
wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip wikitext-2-raw-v1.zip
echo "Usage:" echo "Usage:"
echo "" echo ""
echo " ./perplexity -m model.gguf -f wiki.test.raw [other params]" echo " ./perplexity -m model.gguf -f wikitext-2-raw/wiki.test.raw [other params]"
echo "" echo ""
exit 0 exit 0
sgemm.cpp Normal file
File diff suppressed because it is too large
sgemm.h Normal file
View file
@ -0,0 +1,12 @@
#pragma once
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
bool llamafile_sgemm(int, int, int, const void *, int, const void *, int,
void *, int, int, int, int, int, int, int);
#ifdef __cplusplus
}
#endif
View file
@ -25,7 +25,7 @@ function(llama_test source)
add_executable(${TEST_TARGET} ${source} get-model.cpp) add_executable(${TEST_TARGET} ${source} get-model.cpp)
install(TARGETS ${TEST_TARGET} RUNTIME) install(TARGETS ${TEST_TARGET} RUNTIME)
target_link_libraries(${TEST_TARGET} PRIVATE common json-schema-to-grammar) target_link_libraries(${TEST_TARGET} PRIVATE common)
add_test( add_test(
NAME ${TEST_TARGET} NAME ${TEST_TARGET}
WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY} WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
View file
@ -101,7 +101,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
} else if (t->type == GGML_TYPE_I8) { } else if (t->type == GGML_TYPE_I8) {
tv.push_back((float)*(int8_t *) &buf[i]); tv.push_back((float)*(int8_t *) &buf[i]);
} else if (quantized) { } else if (quantized) {
tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type)); tt.to_float(&buf[i], vq.data(), bs);
tv.insert(tv.end(), vq.begin(), vq.end()); tv.insert(tv.end(), vq.begin(), vq.end());
} else { } else {
GGML_ASSERT(false); GGML_ASSERT(false);
@ -948,14 +948,14 @@ struct test_mul_mat_id : public test_case {
const ggml_type type_a; const ggml_type type_a;
const ggml_type type_b; const ggml_type type_b;
const int n_mats; const int n_mats;
const int id; const int n_used;
const bool b; // broadcast b matrix
const int64_t m; const int64_t m;
const int64_t n; const int64_t n;
const int64_t k; const int64_t k;
const bool v; // view (non-contiguous ids)
std::string vars() override { std::string vars() override {
return VARS_TO_STR8(type_a, type_b, n_mats, id, m, n, k, v); return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k);
} }
double max_nmse_err() override { double max_nmse_err() override {
@ -972,20 +972,22 @@ struct test_mul_mat_id : public test_case {
} }
test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32, test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
int n_mats = 2, int id = 0, int n_mats = 8, int n_used = 2, bool b = false,
int64_t m = 32, int64_t n = 32, int64_t k = 32, bool v = false) int64_t m = 32, int64_t n = 32, int64_t k = 32)
: type_a(type_a), type_b(type_b), n_mats(n_mats), id(id), : type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b),
m(m), n(n), k(k), v(v) {} m(m), n(n), k(k) {
GGML_ASSERT(n_used <= n_mats);
}
ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * build_graph(ggml_context * ctx) override {
// C^T = A * B^T: (k, m) * (k, n) => (m, n) // C^T = A * B^T: (k, m) * (k, n) => (m, n)
ggml_tensor * mats = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats); ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n); ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
if (v) { if (n_used != n_mats) {
ids = ggml_view_2d(ctx, ids, n_mats/2, ids->ne[1], ids->nb[1], 0); ids = ggml_view_2d(ctx, ids, n_used, n, ids->nb[1], 0);
} }
ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n); ggml_tensor * b = ggml_new_tensor_3d(ctx, type_b, k, this->b ? 1 : n_used, n);
ggml_tensor * out = ggml_mul_mat_id(ctx, mats, ids, v ? id/2 : id, b); ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);
return out; return out;
} }
@ -1611,7 +1613,6 @@ public:
} }
}; };
// Llama // Llama
struct test_llama : public test_llm { struct test_llama : public test_llm {
static constexpr float freq_base = 10000.0f; static constexpr float freq_base = 10000.0f;
@ -1875,9 +1876,29 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
}; };
const ggml_type base_types[] = {
GGML_TYPE_F32, GGML_TYPE_F16,
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_K,
GGML_TYPE_IQ2_XXS
};
const ggml_type other_types[] = {
GGML_TYPE_Q4_1,
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
GGML_TYPE_Q8_0,
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
GGML_TYPE_Q5_K,
GGML_TYPE_Q6_K,
GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
};
// unary ops // unary ops
for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) { for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
test_cases.emplace_back(new test_unary((ggml_unary_op) op)); test_cases.emplace_back(new test_unary((ggml_unary_op) op));
test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 7, 13, 19, 23 }));
} }
test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false)); test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));
@ -1982,7 +2003,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps)); test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
} }
for (ggml_type type_a : all_types) { for (ggml_type type_a : base_types) {
for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) { for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1})); test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1})); test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1}));
@ -2002,6 +2023,12 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
} }
} }
for (ggml_type type_a : other_types) {
for (ggml_type type_b : {GGML_TYPE_F32}) {
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
}
}
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 128, { 8, 1}, {1, 1})); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 128, { 8, 1}, {1, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 128, { 8, 1}, {4, 1})); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 128, { 8, 1}, {4, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 64, { 8, 1}, {4, 1})); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 64, { 8, 1}, {4, 1}));
@ -2009,12 +2036,32 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1})); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1})); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
for (ggml_type type_a : all_types) { for (ggml_type type_a : base_types) {
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) { for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
for (int n_mats : {2, 4, 8}) { for (int n_mats : {4, 8}) {
for (int id = 0; id < n_mats; id++) { for (int n_used : {1, 2, 4}) {
for (bool v : {false, true}) { for (bool b : {false, true}) {
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256, v)); for (int n : {1, 32}) {
int m = 512;
int k = 256;
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
}
}
}
}
}
}
for (ggml_type type_a : other_types) {
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
for (int n_mats : {4}) {
for (int n_used : {2}) {
for (bool b : {false}) {
for (int n : {1}) {
int m = 512;
int k = 256;
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
}
} }
} }
} }
View file
@ -45,6 +45,8 @@ int main(void) {
// Orca-Vicuna // Orca-Vicuna
// No template included in tokenizer_config.json, so this template likely needs to be manually set. // No template included in tokenizer_config.json, so this template likely needs to be manually set.
"{%- for message in messages %}{%- if message['role'] == 'system' -%}{{-'SYSTEM: ' + message['content'] + '\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}", "{%- for message in messages %}{%- if message['role'] == 'system' -%}{{-'SYSTEM: ' + message['content'] + '\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}",
// CohereForAI/c4ai-command-r-plus
"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}"
}; };
std::vector<std::string> expected_output = { std::vector<std::string> expected_output = {
// teknium/OpenHermes-2.5-Mistral-7B // teknium/OpenHermes-2.5-Mistral-7B
@ -69,6 +71,8 @@ int main(void) {
"You are a helpful assistant\n\nUSER: Hello\nASSISTANT: Hi there</s>\nUSER: Who are you\nASSISTANT: I am an assistant </s>\nUSER: Another question\nASSISTANT:", "You are a helpful assistant\n\nUSER: Hello\nASSISTANT: Hi there</s>\nUSER: Who are you\nASSISTANT: I am an assistant </s>\nUSER: Another question\nASSISTANT:",
// Orca-Vicuna // Orca-Vicuna
"SYSTEM: You are a helpful assistant\nUSER: Hello\nASSISTANT: Hi there</s>\nUSER: Who are you\nASSISTANT: I am an assistant </s>\nUSER: Another question\nASSISTANT:", "SYSTEM: You are a helpful assistant\nUSER: Hello\nASSISTANT: Hi there</s>\nUSER: Who are you\nASSISTANT: I am an assistant </s>\nUSER: Another question\nASSISTANT:",
// CohereForAI/c4ai-command-r-plus
"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
}; };
std::vector<char> formatted_chat(1024); std::vector<char> formatted_chat(1024);
int32_t res; int32_t res;
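Both new entries are exercised through llama_chat_apply_template, which recognizes the template text by substring matching and writes the result into formatted_chat. Below is a hedged, standalone sketch of calling that API directly; the conversation mirrors the messages implied by the expected outputs above, and the template string is only a placeholder to be replaced by one of the full Jinja templates listed earlier.

#include "llama.h"
#include <cstdio>
#include <vector>

int main(void) {
    llama_chat_message conv[] = {
        {"system",    "You are a helpful assistant"},
        {"user",      "Hello"},
        {"assistant", "Hi there"},
        {"user",      "Who are you"},
        {"assistant", "   I am an assistant   "},
        {"user",      "Another question"},
    };
    // Placeholder: paste one of the full template strings from the test above.
    // With a nullptr model the template must be passed explicitly.
    const char * tmpl = "<|START_OF_TURN_TOKEN|>...";

    std::vector<char> buf(1024);
    int32_t res = llama_chat_apply_template(/*model=*/nullptr, tmpl, conv, 6,
                                            /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (res > (int32_t) buf.size()) {
        // Output did not fit: grow the buffer and format again.
        buf.resize(res);
        res = llama_chat_apply_template(nullptr, tmpl, conv, 6, true, buf.data(), (int32_t) buf.size());
    }
    if (res >= 0) {
        printf("%.*s\n", res, buf.data());
    } else {
        printf("template not recognized\n");
    }
    return 0;
}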


@ -104,16 +104,16 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""( R"""(
array ::= "[" space ( value ("," space value)* )? "]" space array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space boolean ::= ("true" | "false") space
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
null ::= "null" space null ::= "null" space
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= object root ::= object
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] | value ::= object | array | string | number | boolean | null
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
value ::= object | array | string | number | boolean
)""" )"""
}); });
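The expected grammars in this file come from the JSON-schema-to-grammar converters that this test keeps in sync across languages. The sketch below shows, in hedged form, how such a grammar can be produced from the C++ side and sanity-checked with the grammar parser; the json_schema_to_grammar() helper, the vendored json.hpp header and grammar-parser.h are assumptions about the surrounding common/ code and are not shown in this diff.

#include "json-schema-to-grammar.h"
#include "grammar-parser.h"
#include "json.hpp"   // vendored nlohmann::json header (assumed location)
#include <cstdio>
#include <string>

int main(void) {
    // Minimal schema: an unconstrained object, similar to the first case above.
    nlohmann::ordered_json schema = nlohmann::ordered_json::parse(R"""({
        "type": "object"
    })""");

    // Assumed helper from common/: returns the GBNF text, e.g. the rules shown above.
    std::string grammar = json_schema_to_grammar(schema);
    printf("%s\n", grammar.c_str());

    // Sanity-check that the emitted grammar is well-formed GBNF.
    grammar_parser::parse_state state = grammar_parser::parse(grammar.c_str());
    if (state.rules.empty()) {
        printf("grammar failed to parse\n");
        return 1;
    }
    printf("parsed %zu rules, root symbol id %u\n",
           state.rules.size(), state.symbol_ids.at("root"));
    return 0;
}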
@ -133,10 +133,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
date-string ::= "\"" date "\"" space date-string ::= "\"" date "\"" space
date-time ::= date "T" time date-time ::= date "T" time
date-time-string ::= "\"" date-time "\"" space date-time-string ::= "\"" date-time "\"" space
root ::= "[" space date-string "," space uuid "," space time-string "," space date-time-string "]" space root ::= "[" space tuple-0 "," space uuid "," space tuple-2 "," space tuple-3 "]" space
space ::= " "? space ::= " "?
time ::= ([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] ) time ::= ([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )
time-string ::= "\"" time "\"" space time-string ::= "\"" time "\"" space
tuple-0 ::= date-string
tuple-2 ::= time-string
tuple-3 ::= date-time-string
uuid ::= "\"" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "\"" space uuid ::= "\"" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "\"" space
)""" )"""
}); });
@ -148,10 +151,65 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"type": "string" "type": "string"
})""", })""",
R"""( R"""(
root ::= "\"" ( char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
[^"\\] | root ::= "\"" char* "\"" space
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) space ::= " "?
)* "\"" space )"""
});
test({
SUCCESS,
"string w/ min length 1",
R"""({
"type": "string",
"minLength": 1
})""",
R"""(
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
root ::= "\"" char+ "\"" space
space ::= " "?
)"""
});
test({
SUCCESS,
"string w/ min length 3",
R"""({
"type": "string",
"minLength": 3
})""",
R"""(
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
root ::= "\"" char char char (char)* "\"" space
space ::= " "?
)"""
});
test({
SUCCESS,
"string w/ max length",
R"""({
"type": "string",
"maxLength": 3
})""",
R"""(
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
root ::= "\"" (char (char (char)?)?)? "\"" space
space ::= " "?
)"""
});
test({
SUCCESS,
"string w/ min & max length",
R"""({
"type": "string",
"minLength": 1,
"maxLength": 4
})""",
R"""(
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
root ::= "\"" char (char (char (char)?)?)? "\"" space
space ::= " "? space ::= " "?
)""" )"""
}); });
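The new minLength/maxLength cases here, the minItems/maxItems cases further down, and the regex {m,n} case all expand bounded repetition the same way: the minimum number of mandatory copies followed by nested optional copies up to the maximum, so minLength 1 with maxLength 4 becomes char (char (char (char)?)?)?. The helper below is a hedged, illustrative reimplementation of that expansion (repeat_rule is a made-up name, not the converter's actual function).

#include <cstdio>
#include <string>

// Build `item` repeated between min_times and max_times (max_times < 0 means unbounded),
// in the nested-optional form used by the expected grammars above.
static std::string repeat_rule(const std::string & item, int min_times, int max_times) {
    std::string out;
    for (int i = 0; i < min_times; i++) {
        if (!out.empty()) out += " ";
        out += item;
    }
    if (max_times < 0) {
        // Unbounded tail: e.g. minLength 3 gives "char char char (char)*".
        out += out.empty() ? item + "*" : " (" + item + ")*";
    } else {
        // Bounded tail: nest one optional copy per remaining allowed repetition.
        std::string tail;
        for (int i = min_times; i < max_times; i++) {
            tail = "(" + item + (tail.empty() ? std::string() : " " + tail) + ")?";
        }
        if (!tail.empty()) {
            out += out.empty() ? tail : " " + tail;
        }
    }
    return out;
}

int main(void) {
    printf("%s\n", repeat_rule("char", 1,  4).c_str());  // char (char (char (char)?)?)?
    printf("%s\n", repeat_rule("char", 3, -1).c_str());  // char char char (char)*
    printf("%s\n", repeat_rule("char", 0,  3).c_str());  // (char (char (char)?)?)?
    return 0;
}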
@ -175,7 +233,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"type": "integer" "type": "integer"
})""", })""",
R"""( R"""(
root ::= ("-"? ([0-9] | [1-9] [0-9]*)) space integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
root ::= ("-"? integral-part) space
space ::= " "? space ::= " "?
)""" )"""
}); });
@ -223,12 +282,10 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"prefixItems": [{ "type": "string" }] "prefixItems": [{ "type": "string" }]
})""", })""",
R"""( R"""(
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
root ::= "[" space string "]" space root ::= "[" space string "]" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -239,13 +296,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"prefixItems": [{ "type": "string" }, { "type": "number" }] "prefixItems": [{ "type": "string" }, { "type": "number" }]
})""", })""",
R"""( R"""(
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "[" space string "," space number "]" space root ::= "[" space string "," space number "]" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -256,7 +313,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"type": "number" "type": "number"
})""", })""",
R"""( R"""(
root ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
root ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
space ::= " "? space ::= " "?
)""" )"""
}); });
@ -272,7 +331,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""", })""",
R"""( R"""(
boolean ::= ("true" | "false") space boolean ::= ("true" | "false") space
root ::= "[" space boolean ( "," space boolean )( "," space boolean )* "]" space root ::= "[" space boolean "," space boolean ("," space boolean)* "]" space
space ::= " "? space ::= " "?
)""" )"""
}); });
@ -320,10 +379,12 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"maxItems": 5 "maxItems": 5
})""", })""",
R"""( R"""(
integer ::= ("-"? ([0-9] | [1-9] [0-9]*)) space decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integer ::= ("-"? integral-part) space
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
item ::= number | integer item ::= number | integer
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "[" space item ( "," space item )( "," space item )( "," space item )?( "," space item )? "]" space root ::= "[" space item "," space item "," space item ("," space item ("," space item)?)? "]" space
space ::= " "? space ::= " "?
)""" )"""
}); });
@ -372,11 +433,11 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"regexp", "regexp",
R"""({ R"""({
"type": "string", "type": "string",
"pattern": "^(\\([0-9]{1,3}\\))?[0-9]{3}-[0-9]{4} and...$" "pattern": "^(\\([0-9]{1,3}\\))?[0-9]{3}-[0-9]{4} a{3,5}nd...$"
})""", })""",
R"""( R"""(
dot ::= [\U00000000-\x09\x0B\x0C\x0E-\U0010FFFF] dot ::= [^\x0A\x0D]
root ::= "\"" ("(" root-1 root-1? root-1? ")")? root-1 root-1 root-1 "-" root-1 root-1 root-1 root-1 " and" dot dot dot "\"" space root ::= "\"" ("(" root-1 (root-1 (root-1)?)? ")")? root-1 root-1 root-1 "-" root-1 root-1 root-1 root-1 " " "aaa" ("a" ("a")?)? "nd" dot dot dot "\"" space
root-1 ::= [0-9] root-1 ::= [0-9]
space ::= " "? space ::= " "?
)""" )"""
@ -404,12 +465,10 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
a-kv ::= "\"a\"" space ":" space string a-kv ::= "\"a\"" space ":" space string
b-kv ::= "\"b\"" space ":" space string b-kv ::= "\"b\"" space ":" space string
c-kv ::= "\"c\"" space ":" space string c-kv ::= "\"c\"" space ":" space string
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
root ::= "{" space b-kv "," space c-kv "," space a-kv "}" space root ::= "{" space b-kv "," space c-kv "," space a-kv "}" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -426,12 +485,10 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""", })""",
R"""( R"""(
a-kv ::= "\"a\"" space ":" space string a-kv ::= "\"a\"" space ":" space string
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
root ::= "{" space (a-kv )? "}" space root ::= "{" space (a-kv )? "}" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -452,12 +509,10 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
b-kv ::= "\"b\"" space ":" space string b-kv ::= "\"b\"" space ":" space string
b-rest ::= ( "," space c-kv )? b-rest ::= ( "," space c-kv )?
c-kv ::= "\"c\"" space ":" space string c-kv ::= "\"c\"" space ":" space string
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
root ::= "{" space (a-kv a-rest | b-kv b-rest | c-kv )? "}" space root ::= "{" space (a-kv a-rest | b-kv b-rest | c-kv )? "}" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -478,14 +533,12 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
a-kv ::= "\"a\"" space ":" space string a-kv ::= "\"a\"" space ":" space string
b-kv ::= "\"b\"" space ":" space string b-kv ::= "\"b\"" space ":" space string
c-kv ::= "\"c\"" space ":" space string c-kv ::= "\"c\"" space ":" space string
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
d-kv ::= "\"d\"" space ":" space string d-kv ::= "\"d\"" space ":" space string
d-rest ::= ( "," space c-kv )? d-rest ::= ( "," space c-kv )?
root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -500,13 +553,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
additional-kv ::= string ":" space additional-value additional-kv ::= string ":" space additional-value
additional-kvs ::= additional-kv ( "," space additional-kv )* additional-kvs ::= additional-kv ( "," space additional-kv )*
additional-value ::= "[" space (number ("," space number)*)? "]" space additional-value ::= "[" space (number ("," space number)*)? "]" space
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space (additional-kvs )? "}" space root ::= "{" space (additional-kvs )? "}" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -520,16 +573,16 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""( R"""(
array ::= "[" space ( value ("," space value)* )? "]" space array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space boolean ::= ("true" | "false") space
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
null ::= "null" space null ::= "null" space
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= object root ::= object
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] | value ::= object | array | string | number | boolean | null
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
value ::= object | array | string | number | boolean
)""" )"""
}); });
@ -542,16 +595,16 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""( R"""(
array ::= "[" space ( value ("," space value)* )? "]" space array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space boolean ::= ("true" | "false") space
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
null ::= "null" space null ::= "null" space
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= object root ::= object
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] | value ::= object | array | string | number | boolean | null
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
value ::= object | array | string | number | boolean
)""" )"""
}); });
@ -583,13 +636,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
a-kv ::= "\"a\"" space ":" space number a-kv ::= "\"a\"" space ":" space number
additional-kv ::= string ":" space string additional-kv ::= string ":" space string
additional-kvs ::= additional-kv ( "," space additional-kv )* additional-kvs ::= additional-kv ( "," space additional-kv )*
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space a-kv ( "," space ( additional-kvs ) )? "}" space root ::= "{" space a-kv ( "," space ( additional-kvs ) )? "}" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -608,13 +661,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
a-rest ::= additional-kvs a-rest ::= additional-kvs
additional-kv ::= string ":" space number additional-kv ::= string ":" space number
additional-kvs ::= additional-kv ( "," space additional-kv )* additional-kvs ::= additional-kv ( "," space additional-kv )*
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space (a-kv a-rest | additional-kvs )? "}" space root ::= "{" space (a-kv a-rest | additional-kvs )? "}" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -636,13 +689,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
additional-kvs ::= additional-kv ( "," space additional-kv )* additional-kvs ::= additional-kv ( "," space additional-kv )*
b-kv ::= "\"b\"" space ":" space number b-kv ::= "\"b\"" space ":" space number
b-rest ::= additional-kvs b-rest ::= additional-kvs
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space a-kv ( "," space ( b-kv b-rest | additional-kvs ) )? "}" space root ::= "{" space a-kv ( "," space ( b-kv b-rest | additional-kvs ) )? "}" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -650,9 +703,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
SUCCESS, SUCCESS,
"top-level $ref", "top-level $ref",
R"""({ R"""({
"$ref": "#/definitions/MyType", "$ref": "#/definitions/foo",
"definitions": { "definitions": {
"MyType": { "foo": {
"type": "object", "type": "object",
"properties": { "properties": {
"a": { "a": {
@ -667,14 +720,12 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
} }
})""", })""",
R"""( R"""(
MyType ::= "{" space MyType-a-kv "}" space char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
MyType-a-kv ::= "\"a\"" space ":" space string foo ::= "{" space foo-a-kv "}" space
root ::= MyType foo-a-kv ::= "\"a\"" space ":" space string
root ::= foo
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -701,9 +752,11 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
alternative-1 ::= bar alternative-1 ::= bar
bar ::= "{" space (bar-b-kv )? "}" space bar ::= "{" space (bar-b-kv )? "}" space
bar-b-kv ::= "\"b\"" space ":" space number bar-b-kv ::= "\"b\"" space ":" space number
decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
foo ::= "{" space (foo-a-kv )? "}" space foo ::= "{" space (foo-a-kv )? "}" space
foo-a-kv ::= "\"a\"" space ":" space number foo-a-kv ::= "\"a\"" space ":" space number
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= alternative-0 | alternative-1 root ::= alternative-0 | alternative-1
space ::= " "? space ::= " "?
)""" )"""
@ -745,7 +798,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
c-kv ::= "\"c\"" space ":" space number c-kv ::= "\"c\"" space ":" space number
d-kv ::= "\"d\"" space ":" space number d-kv ::= "\"d\"" space ":" space number
d-rest ::= ( "," space c-kv )? d-rest ::= ( "," space c-kv )?
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
space ::= " "? space ::= " "?
)""" )"""
@ -786,7 +841,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"definitions": {} "definitions": {}
})""", })""",
R"""( R"""(
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
number- ::= "{" space number-number-kv "}" space number- ::= "{" space number-number-kv "}" space
number-kv ::= "\"number\"" space ":" space number- number-kv ::= "\"number\"" space ":" space number-
number-number ::= "{" space number-number-root-kv "}" space number-number ::= "{" space number-number-root-kv "}" space
@ -816,7 +873,7 @@ int main() {
test_all("Python", [](const TestCase & tc) { test_all("Python", [](const TestCase & tc) {
write("test-json-schema-input.tmp", tc.schema); write("test-json-schema-input.tmp", tc.schema);
tc.verify_status(std::system( tc.verify_status(std::system(
"python ./examples/json-schema-to-grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE); "python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
tc.verify(read("test-grammar-output.tmp")); tc.verify(read("test-grammar-output.tmp"));
}); });
} else { } else {