Merge branch 'ggerganov:master' into vulkan-build-integration

Commit 4eab311ed0 by bandoti, 2024-06-30 00:14:14 -03:00, committed by GitHub.
GPG key ID: B5690EEEBB952194 (no known key found for this signature in the database).
30 changed files with 879 additions and 337 deletions.


@@ -47,7 +47,7 @@ jobs:
           sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
     - name: Test
@@ -105,7 +105,7 @@ jobs:
           sysctl -a
           # Metal is disabled due to intermittent failures with Github runners not having a GPU:
           # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON
+          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
     - name: Test
@@ -222,7 +222,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
           cmake --build . --config Release -j $(nproc)
     - name: Test
@@ -801,6 +801,7 @@ jobs:
           7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
           $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
           cd build
+          $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
           & $sde -future -- ctest -L main -C Release --verbose --timeout 900
     - name: Determine tag name


@@ -79,14 +79,21 @@ set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
 set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
 set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
 set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
-set(GGML_LLAMAFILE ON)
-set(GGML_CUDA_USE_GRAPHS ON)
+# change the default for these ggml options
+if (NOT DEFINED GGML_LLAMAFILE)
+    set(GGML_LLAMAFILE ON)
+endif()
+
+if (NOT DEFINED GGML_CUDA_USE_GRAPHS)
+    set(GGML_CUDA_USE_GRAPHS ON)
+endif()
 
 # transition helpers
 function (llama_option_depr TYPE OLD NEW)
     if (${OLD})
         message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
-        set(${NEW} ON)
+        set(${NEW} ON PARENT_SCOPE)
     endif()
 endfunction()
@@ -96,7 +103,6 @@ llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
 llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
 llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
 llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
-llama_option_depr(WARNING LLAMA_OPENMP GGML_OPENMP)
 llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
 llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
 llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
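With the `NOT DEFINED` guards above, a value supplied by the user is no longer clobbered by the llama.cpp defaults. A minimal sketch of what that looks like from the command line (build directory name is illustrative):

```bash
# Explicitly turn off options whose default llama.cpp would otherwise switch on;
# thanks to the NOT DEFINED guards these -D values now survive into the ggml subproject.
cmake -B build -DGGML_LLAMAFILE=OFF -DGGML_CUDA_USE_GRAPHS=OFF
cmake --build build --config Release
```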


@@ -45,6 +45,7 @@ BUILD_TARGETS = \
 TEST_TARGETS = \
 	tests/test-autorelease \
 	tests/test-backend-ops \
+	tests/test-chat-template \
 	tests/test-double-float \
 	tests/test-grad0 \
 	tests/test-grammar-integration \
@@ -1092,6 +1093,7 @@ clean:
 	rm -rvf src/*.o
 	rm -rvf tests/*.o
 	rm -rvf examples/*.o
+	rm -rvf common/*.o
 	rm -rvf *.a
 	rm -rvf *.dll
 	rm -rvf *.so
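Since the chat-template test is now listed in `TEST_TARGETS`, it can be built and run on its own via the Makefile; a quick sketch (the `-j` value is illustrative):

```bash
# Build just the chat-template test binary and run it from a checkout of this repo
make -j8 tests/test-chat-template
./tests/test-chat-template
```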


@@ -1026,6 +1026,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.input_suffix = argv[i];
         return true;
     }
+    if (arg == "--spm-infill") {
+        params.spm_infill = true;
+        return true;
+    }
     if (arg == "--grammar") {
         CHECK_ARG
         sparams.grammar = argv[i];
@@ -1409,6 +1413,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
     options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
     options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
+    options.push_back({ "server infill",
+        " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
 
     options.push_back({ "sampling" });
     options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
@@ -2618,6 +2624,7 @@ std::string llama_chat_apply_template(const struct llama_model * model,
         const std::vector<llama_chat_msg> & msgs,
         bool add_ass) {
     int alloc_size = 0;
+    bool fallback = false; // indicate if we must fallback to default chatml
     std::vector<llama_chat_message> chat;
     for (auto & msg : msgs) {
         chat.push_back({msg.role.c_str(), msg.content.c_str()});
@@ -2630,10 +2637,26 @@ std::string llama_chat_apply_template(const struct llama_model * model,
     // run the first time to get the total output length
     int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 
+    // error: chat template is not supported
+    if (res < 0) {
+        if (ptr_tmpl != nullptr) {
+            // if the custom "tmpl" is not supported, we throw an error
+            // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
+            throw std::runtime_error("this custom template is not supported");
+        } else {
+            // If the built-in template is not supported, we default to chatml
+            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+            fallback = true;
+        }
+    }
+
     // if it turns out that our buffer is too small, we resize it
     if ((size_t) res > buf.size()) {
         buf.resize(res);
-        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        res = llama_chat_apply_template(
+            fallback ? nullptr : model,
+            fallback ? "chatml" : ptr_tmpl,
+            chat.data(), chat.size(), add_ass, buf.data(), buf.size());
     }
 
     std::string formatted_chat(buf.data(), res);
@@ -2804,125 +2827,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 //
 static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
-    int32_t n_tensors;
-    size_t n_bytes = 0;
-    uint32_t max_direction_layer = 0;
     llama_control_vector_data result = { -1, {} };
 
-    // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
-    {
-        struct ggml_init_params meta_params = {
-            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
-            /* .mem_buffer = */ nullptr,
-            /* .no_alloc   = */ true,
-        };
-        ggml_context * meta_ctx = ggml_init(meta_params);
-        struct gguf_init_params meta_gguf_params = {
-            /* .no_alloc = */ true,
-            /* .ctx      = */ &meta_ctx,
-        };
-        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
-        if (!meta_ctx_gguf) {
-            fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
-            ggml_free(meta_ctx);
-            return result;
-        }
-        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
-        for (int i = 0; i < n_tensors; i++) {
-            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
-            // split on '.'
-            size_t dotpos = name.find('.');
-            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
-                try {
-                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
-                    if (layer == 0) {
-                        fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                        ggml_free(meta_ctx);
-                        gguf_free(meta_ctx_gguf);
-                        return result;
-                    }
-                    if (layer > max_direction_layer) {
-                        max_direction_layer = layer;
-                    }
-                } catch (...) {
-                    fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                    ggml_free(meta_ctx);
-                    gguf_free(meta_ctx_gguf);
-                    return result;
-                }
-            }
-            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
-            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
-                fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
-                gguf_free(meta_ctx_gguf);
-                return result;
-            }
-            if (result.n_embd == -1) {
-                result.n_embd = ggml_nelements(tensor_meta);
-            } else if (ggml_nelements(tensor_meta) != result.n_embd) {
-                fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
-                gguf_free(meta_ctx_gguf);
-                return result;
-            }
-            n_bytes += ggml_nbytes(tensor_meta);
-        }
-        ggml_free(meta_ctx);
-        gguf_free(meta_ctx_gguf);
+    ggml_context * ctx = nullptr;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
+        return result;
     }
 
+    int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
     if (n_tensors == 0) {
         fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
+        return result;
     }
 
-    // load and scale tensors into final control vector context
-    struct ggml_init_params ggml_params = {
-        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
-        /* .mem_buffer = */ nullptr,
-        /* .no_alloc   = */ false,
-    };
-    struct ggml_context * ctx = ggml_init(ggml_params);
-    struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ &ctx,
-    };
-    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
-    if (!ctx_gguf) {
-        fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
-        ggml_free(ctx);
-        return result;
-    }
-    // do not store data for layer 0 (it's not used)
-    result.data.resize(result.n_embd * max_direction_layer);
-    for (uint32_t il = 1; il <= max_direction_layer; il++) {
-        const std::string name = "direction." + std::to_string(il);
-        const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
-        float * dst = result.data.data() + result.n_embd * (il - 1);
-        if (tensor) {
-            const float * src = (const float *) tensor->data;
-            for (int j = 0; j < result.n_embd; j++) {
-                dst[j] = src[j] * load_info.strength;
-            }
-        } else {
-            for (int j = 0; j < result.n_embd; j++) {
-                dst[j] = 0.0f;
-            }
-        }
-    }
+    for (int i = 0; i < n_tensors; i++) {
+        std::string name = gguf_get_tensor_name(ctx_gguf, i);
+
+        int layer_idx = -1;
+
+        // split on '.'
+        size_t dotpos = name.find('.');
+        if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+            try {
+                layer_idx = std::stoi(name.substr(dotpos + 1));
+            } catch (...) {
+                layer_idx = -1;
+            }
+        }
+        if (layer_idx < 0) {
+            fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        } else if (layer_idx == 0) {
+            fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+        if (tensor->type != GGML_TYPE_F32) {
+            fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+        if (ggml_n_dims(tensor) != 1) {
+            fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        if (result.n_embd == -1) {
+            result.n_embd = ggml_nelements(tensor);
+        } else if (ggml_nelements(tensor) != result.n_embd) {
+            fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        // extend if necessary - do not store data for layer 0 (it's not used)
+        result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
+
+        const float * src = (const float *) tensor->data;
+        float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
+        for (int j = 0; j < result.n_embd; j++) {
+            dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
+        }
+    }
+
+    if (result.n_embd == -1) {
+        fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
+        result.data.clear();
+    }
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
 
     return result;
 }
@@ -2933,16 +2918,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
         auto cur = llama_control_vector_load_one(info);
 
         if (cur.n_embd == -1) {
-            return result;
+            result.n_embd = -1;
+            break;
         }
-        if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
-            fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
-            return result;
+        if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
+            fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
+            result.n_embd = -1;
+            break;
         }
 
         if (result.n_embd == -1) {
             result = std::move(cur);
         } else {
+            result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
             for (size_t i = 0; i < cur.data.size(); i++) {
                 result.data[i] += cur.data[i];
             }
@@ -2950,7 +2938,8 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
     }
 
     if (result.n_embd == -1) {
-        fprintf(stderr, "%s: no vectors passed\n", __func__);
+        fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
+        result.data.clear();
     }
 
     return result;
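For context, this loader backs the existing control-vector CLI flags, and the summing logic above is what lets several vector files (or several directions for the same layer) be combined. A usage sketch with placeholder model and vector file names:

```bash
# Apply two control vectors together; overlapping layer directions are summed,
# with the second vector scaled by 0.8 (all file names here are illustrative)
./llama-cli -m ./models/llama-2-7b.Q4_K_M.gguf \
    --control-vector happy.gguf \
    --control-vector-scaled calm.gguf 0.8 \
    -p "I feel"
```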


@@ -250,6 +250,8 @@ struct gpt_params {
     std::string cvector_outfile = "control_vector.gguf";
     std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
     std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+
+    bool spm_infill = false; // suffix/prefix/middle pattern for infill
 };
 
 void gpt_params_handle_model_default(gpt_params & params);
@@ -380,6 +382,8 @@ struct llama_chat_msg {
 bool llama_chat_verify_template(const std::string & tmpl);
 
 // CPP wrapper for llama_chat_apply_template
+// If the built-in template is not supported, we default to chatml
+// If the custom "tmpl" is not supported, we throw an error
 std::string llama_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
         const std::vector<llama_chat_msg> & chat,


@@ -316,7 +316,7 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
 };
 
 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
-std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
+std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
 
 template <typename Iterator>
 std::string join(Iterator begin, Iterator end, const std::string & separator) {
@@ -720,7 +720,7 @@ private:
             }
             prop_names.push_back(prop_name);
         }
-        if (!(additional_properties.is_boolean() && !additional_properties.get<bool>())) {
+        if ((additional_properties.is_boolean() && additional_properties.get<bool>()) || additional_properties.is_object()) {
             std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
             std::string value_rule =
                 additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")


@@ -85,6 +85,7 @@ models = [
     {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
     {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
     {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
+    {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
 ]


@@ -487,6 +487,9 @@ class Model:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
+        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
+            # ref: https://huggingface.co/LumiOpen/Viking-7B
+            res = "viking"
 
         if res is None:
             logger.warning("\n")
@@ -2337,6 +2340,46 @@ class GemmaModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
+@Model.register("Gemma2ForCausalLM")
+class Gemma2Model(Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA2
+
+    def set_vocab(self):
+        self._set_vocab_llama_hf()
+        self.gguf_writer.add_add_space_prefix(False)
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_key_length(hparams["head_dim"])
+        self.gguf_writer.add_value_length(hparams["head_dim"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unusem
+
+        # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
+        # To prevent errors, skip loading lm_head.weight.
+        if name == "lm_head.weight":
+            logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
+            return []
+
+        # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
 @Model.register("Starcoder2ForCausalLM")
 class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2
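With the `Gemma2Model` handler registered above, conversion follows the usual `convert-hf-to-gguf.py` flow; a rough sketch, where the local checkpoint directory and output name are placeholders:

```bash
# Hypothetical paths; assumes a locally downloaded Gemma-2 Hugging Face checkout
# (config.json, tokenizer files and safetensors shards) in ./gemma-2-9b-it
python convert-hf-to-gguf.py ./gemma-2-9b-it --outfile gemma-2-9b-it-f16.gguf --outtype f16
```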


@@ -15,6 +15,7 @@ In this section, we cover the most commonly used options for running the `infill
 - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
 - `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
 
 ## Input Prompts
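A rough invocation sketch for the new flag (the model file is a placeholder, and the binary name assumes the `llama-infill` target built from this repo):

```bash
# Feed the tokens in Suffix/Prefix/Middle order instead of the default Prefix/Suffix/Middle
./llama-infill -m ./models/codellama-7b.Q4_K_M.gguf \
    --in-prefix "def add(a, b):" \
    --in-suffix "return c" \
    --spm-infill -n 32
```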


@@ -210,6 +210,7 @@ int main(int argc, char ** argv) {
         suff_rm_leading_spc = false;
     }
     std::vector<llama_token> embd_inp;
+    std::vector<llama_token> embd_end;
     std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
     std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
     const int space_token = 29871;
@@ -217,12 +218,13 @@ int main(int argc, char ** argv) {
         inp_sfx.erase(inp_sfx.begin());
     }
     inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
-    if (add_bos) {
-        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
-    }
     inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
-    embd_inp = inp_pfx;
-    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
+    embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
+    embd_end = params.spm_infill ? inp_pfx : inp_sfx;
+    if (add_bos) {
+        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+    }
+    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
 
     const llama_token middle_token = llama_token_middle(model);
     if (middle_token >= 0) {
@@ -526,14 +528,14 @@ int main(int argc, char ** argv) {
                 inp_sfx.erase(inp_sfx.begin());
             }
             inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
-            if (add_bos) {
-                inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
-            }
             inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
-            embd_inp = inp_pfx;
-            embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
+            embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
+            embd_end = params.spm_infill ? inp_pfx : inp_sfx;
+            if (add_bos) {
+                embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+            }
+            embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+
+            const llama_token middle_token = llama_token_middle(model);
             if (middle_token >= 0) {
                 embd_inp.push_back(middle_token);
             }


@@ -231,7 +231,7 @@ GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
 GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}
 
 NON_LITERAL_SET = set('|.()[]{}*+?')
-ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
+ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?')
 
 class SchemaConverter:
@@ -602,7 +602,7 @@ class SchemaConverter:
             else:
                 add_component(t, is_required=True)
 
-            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=[]))
+            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
 
         elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
             items = schema.get('items') or schema['prefixItems']
@@ -691,7 +691,7 @@ class SchemaConverter:
         required_props = [k for k in sorted_props if k in required]
         optional_props = [k for k in sorted_props if k not in required]
 
-        if additional_properties != False:
+        if additional_properties is not None and additional_properties != False:
             sub_name = f'{name}{"-" if name else ""}additional'
             value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \
                 self._add_primitive('value', PRIMITIVE_RULES['value'])


@@ -1,55 +0,0 @@ (file deleted)
# For more information about using CMake with Android Studio, read the
# documentation: https://d.android.com/studio/projects/add-native-code.html.
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
# Sets the minimum CMake version required for this project.
cmake_minimum_required(VERSION 3.22.1)
# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
# Since this is the top level CMakeLists.txt, the project name is also accessible
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
# build script scope).
project("llama-android")
## Fetch latest llama.cpp from GitHub
#include(FetchContent)
#FetchContent_Declare(
# llama
# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
# GIT_TAG master
#)
#
## Also provides "common"
#FetchContent_MakeAvailable(llama)
# llama.cpp CI uses the code from the current branch
# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
add_subdirectory(../../../../../../ build-llama)
# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.
# You can define multiple libraries, and CMake builds them for you.
# Gradle automatically packages shared libraries with your APK.
#
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
# is preferred for the same purpose.
#
# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
# for GameActivity/NativeActivity derived applications, the same library name must be
# used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED
# List C/C++ source files with relative paths to this CMakeLists.txt.
llama-android.cpp)
# Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this
# build script, prebuilt third-party libraries, or Android system libraries.
target_link_libraries(${CMAKE_PROJECT_NAME}
# List libraries link to the target library
llama
common
android
log)


@@ -11,15 +11,15 @@ cmake_minimum_required(VERSION 3.22.1)
 # build script scope).
 project("llama-android")
 
-include(FetchContent)
-FetchContent_Declare(
-        llama
-        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-        GIT_TAG master
-)
+#include(FetchContent)
+#FetchContent_Declare(
+#        llama
+#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+#        GIT_TAG master
+#)
 
 # Also provides "common"
-FetchContent_MakeAvailable(llama)
+#FetchContent_MakeAvailable(llama)
 
 # Creates and names a library, sets it as either STATIC
 # or SHARED, and provides the relative paths to its source code.
@@ -30,6 +30,10 @@ FetchContent_MakeAvailable(llama)
 # the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
 # is preferred for the same purpose.
 #
+#load local llama.cpp
+add_subdirectory(../../../../../../ build-llama)
+
 # In order to load a library into your app from Java/Kotlin, you must call
 # System.loadLibrary() and pass the name of the library defined here;
 # for GameActivity/NativeActivity derived applications, the same library name must be


@@ -5,7 +5,7 @@
 #include <string>
 #include <unistd.h>
 #include "llama.h"
-#include "common/common.h"
+#include "common.h"
 
 // Write C++ code here.
 //


@@ -1121,20 +1121,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             }
             if (n < 32)
                 hparams.image_grid_pinpoints[n] = 0;
-        } catch (std::runtime_error & e) {
+        } catch (std::runtime_error & /*e*/) {
             hparams.image_grid_pinpoints[0]=0;
         }
 
         try {
             int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
             strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx));
-        } catch (std::runtime_error & e) {
+        } catch (std::runtime_error & /*e*/) {
             strcpy(hparams.mm_patch_merge_type, "flat");
         }
 
         try {
             hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6
-        } catch(const std::exception& e) {
+        } catch(const std::exception& /*e*/) {
             hparams.image_crop_resolution = hparams.image_size;
         }
@@ -1173,7 +1173,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         try {
             vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
             new_clip->has_class_embedding = true;
-        } catch (const std::exception& e) {
+        } catch (const std::exception& /*e*/) {
             new_clip->has_class_embedding = false;
         }
@@ -1181,7 +1181,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
             vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
             new_clip->has_pre_norm = true;
-        } catch (std::exception & e) {
+        } catch (std::exception & /*e*/) {
             new_clip->has_pre_norm = false;
         }
@@ -1189,21 +1189,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
             vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
             new_clip->has_post_norm = true;
-        } catch (std::exception & e) {
+        } catch (std::exception & /*e*/) {
             new_clip->has_post_norm = false;
         }
 
         try {
             vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
             new_clip->has_patch_bias = true;
-        } catch (std::exception & e) {
+        } catch (std::exception & /*e*/) {
             new_clip->has_patch_bias = false;
         }
 
         try {
             vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
             vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
-        } catch(const std::exception& e) {
+        } catch(const std::exception& /*e*/) {
             LOG_TEE("%s: failed to load vision model tensors\n", __func__);
         }
@@ -1215,26 +1215,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             // Yi-type llava
             vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
             vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
-        } catch (std::runtime_error & e) { }
+        } catch (std::runtime_error & /*e*/) { }
         try {
             // missing in Yi-type llava
             vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
             vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
-        } catch (std::runtime_error & e) { }
+        } catch (std::runtime_error & /*e*/) { }
         try {
             // Yi-type llava
             vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
             vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
-        } catch (std::runtime_error & e) { }
+        } catch (std::runtime_error & /*e*/) { }
         try {
             // Yi-type llava
             vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
             vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
-        } catch (std::runtime_error & e) { }
+        } catch (std::runtime_error & /*e*/) { }
         try {
             vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
             // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
-        } catch (std::runtime_error & e) { }
+        } catch (std::runtime_error & /*e*/) { }
     } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
         // MobileVLM projection
         vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));


@@ -810,7 +810,7 @@ int main(int argc, char ** argv) {
                     is_antiprompt = true;
                 }
 
-                chat_add_and_format(model, chat_msgs, "system", assistant_ss.str());
+                chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
                 is_interacting = true;
                 printf("\n");
             }


@@ -73,6 +73,7 @@ The project is under active development, and we are [looking for feedback and co
 - `-fa`, `--flash-attn` : enable flash attention (default: disabled).
 - `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
 - `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
+- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
 
 **If compiled with `LLAMA_SERVER_SSL=ON`**
 
 - `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
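The flag only changes the order in which the server arranges the infill tokens; the `/infill` request shape stays the same. A hedged sketch, assuming the stock server defaults (port 8080, placeholder model path):

```bash
# Start the server with SPM ordering for infill requests
./llama-server -m ./models/codellama-7b.Q4_K_M.gguf --spm-infill &

# The /infill endpoint still takes input_prefix / input_suffix as before
curl http://localhost:8080/infill -d '{
    "input_prefix": "def add(a, b):",
    "input_suffix": "return c",
    "n_predict": 32
}'
```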


@@ -259,7 +259,7 @@ const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g;
 const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' };
 
 const NON_LITERAL_SET = new Set('|.()[]{}*+?');
-const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?');
+const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('^$.[]()|{}*+?');
 
 export class SchemaConverter {
   constructor(options) {
@@ -751,7 +751,7 @@ export class SchemaConverter {
     const requiredProps = sortedProps.filter(k => required.has(k));
     const optionalProps = sortedProps.filter(k => !required.has(k));
 
-    if (additionalProperties !== false) {
+    if (additionalProperties) {
       const subName = `${name ?? ''}${name ? '-' : ''}additional`;
       const valueRule =
         additionalProperties != null && typeof additionalProperties === 'object' ? this.visit(additionalProperties, `${subName}-value`)


@@ -2020,6 +2020,7 @@ struct server_context {
                 slot.t_start_generation = 0;
 
                 if (slot.infill) {
+                    const bool add_bos = llama_should_add_bos_token(model);
                     bool suff_rm_leading_spc = true;
                     if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
                         params.input_suffix.erase(0, 1);
@@ -2035,16 +2036,21 @@ struct server_context {
                     }
 
                     prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-                    prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
-                    prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
-                    prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
+                    suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
+
+                    auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
+                    auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
+                    if (add_bos) {
+                        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+                    }
+                    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
 
                     const llama_token middle_token = llama_token_middle(model);
                     if (middle_token >= 0) {
-                        prefix_tokens.push_back(middle_token);
+                        embd_inp.push_back(middle_token);
                     }
 
-                    prompt_tokens = prefix_tokens;
+                    prompt_tokens = embd_inp;
                 } else {
                     prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
                 }

flake.lock (generated)

@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1718318537,
-        "narHash": "sha256-4Zu0RYRcAY/VWuu6awwq4opuiD//ahpc2aFHg2CWqFY=",
+        "lastModified": 1718895438,
+        "narHash": "sha256-k3JqJrkdoYwE3fHE6xGDY676AYmyh4U2Zw+0Bwe5DLU=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "e9ee548d90ff586a6471b4ae80ae9cfcbceb3420",
+        "rev": "d603719ec6e294f034936c0d0dc06f689d91b6c3",
         "type": "github"
       },
       "original": {


@@ -2475,7 +2475,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
         const dim3 block_nums_mmq(nsm, 1, 1);
 
-        ggml_cuda_pool & pool = ctx.pool();
+        ggml_cuda_pool & pool = ctx.pool(id);
         ggml_cuda_pool_alloc<float> tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y);
 
         if (args.ne01 % mmq_y == 0) {


@@ -150,6 +150,7 @@ class MODEL_ARCH(IntEnum):
     INTERNLM2 = auto()
     MINICPM = auto()
     GEMMA = auto()
+    GEMMA2 = auto()
     STARCODER2 = auto()
     MAMBA = auto()
     XVERSE = auto()
@@ -180,10 +181,13 @@ class MODEL_TENSOR(IntEnum):
     ATTN_NORM = auto()
     ATTN_NORM_2 = auto()
     ATTN_OUT_NORM = auto()
+    ATTN_POST_NORM = auto()
     ATTN_ROT_EMBD = auto()
     FFN_GATE_INP = auto()
     FFN_GATE_INP_SHEXP = auto()
     FFN_NORM = auto()
+    FFN_PRE_NORM = auto()
+    FFN_POST_NORM = auto()
     FFN_GATE = auto()
     FFN_DOWN = auto()
     FFN_UP = auto()
@@ -270,6 +274,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.INTERNLM2: "internlm2",
     MODEL_ARCH.MINICPM: "minicpm",
     MODEL_ARCH.GEMMA: "gemma",
+    MODEL_ARCH.GEMMA2: "gemma2",
     MODEL_ARCH.STARCODER2: "starcoder2",
     MODEL_ARCH.MAMBA: "mamba",
     MODEL_ARCH.XVERSE: "xverse",
@@ -303,9 +308,12 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
     MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
     MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
+    MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm",
     MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
     MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
     MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
+    MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm",
+    MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm",
     MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
     MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
     MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
@@ -751,6 +759,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.FFN_NORM,
     ],
+    MODEL_ARCH.GEMMA2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_POST_NORM,
+        MODEL_TENSOR.FFN_PRE_NORM,
+        MODEL_TENSOR.FFN_POST_NORM,
+    ],
     MODEL_ARCH.STARCODER2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,


@@ -187,6 +187,10 @@ class TensorNameMap:
             "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
         ),
 
+        MODEL_TENSOR.ATTN_POST_NORM: (
+            "model.layers.{bid}.post_attention_layernorm", # gemma2
+        ),
+
         # Rotary embeddings
         MODEL_TENSOR.ATTN_ROT_EMBD: (
             "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
@@ -210,6 +214,16 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
         ),
 
+        # Post feed-forward norm
+        MODEL_TENSOR.FFN_PRE_NORM: (
+            "model.layers.{bid}.pre_feedforward_layernorm", # gemma2
+        ),
+
+        # Post feed-forward norm
+        MODEL_TENSOR.FFN_POST_NORM: (
+            "model.layers.{bid}.post_feedforward_layernorm", # gemma2
+        ),
+
         MODEL_TENSOR.FFN_GATE_INP: (
             "layers.{bid}.feed_forward.gate", # mixtral
             "model.layers.{bid}.block_sparse_moe.gate", # mixtral


@@ -126,19 +126,257 @@ You can use GBNF grammars:
- in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
- in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)

-Take a look at [tests](../../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
+Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).

-Here is also a non-exhaustive list of **unsupported** features:
-- `additionalProperties`: to be fixed in https://github.com/ggerganov/llama.cpp/pull/7840
-- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`
-- `integer` constraints to be implemented in https://github.com/ggerganov/llama.cpp/pull/7797
-- Remote `$ref`s in the C++ version (Python & JavaScript versions fetch https refs)
-- Mixing `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
-- `string` formats `uri`, `email`

```bash
llama-cli \
-hfr bartowski/Phi-3-medium-128k-instruct-GGUF \
-hff Phi-3-medium-128k-instruct-Q8_0.gguf \
-j '{
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"minLength": 1,
"maxLength": 100
},
"age": {
"type": "integer",
"minimum": 0,
"maximum": 150
}
},
"required": ["name", "age"],
"additionalProperties": false
},
"minItems": 10,
"maxItems": 100
}' \
-p 'Generate a {name, age}[] JSON array with famous actors of all ages.'
```
<details>

<summary>Show grammar</summary>

You can convert any schema in command-line with:

```bash
examples/json_schema_to_grammar.py name-age-schema.json
```

```
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
item ::= "{" space item-name-kv "," space item-age-kv "}" space
item-age ::= ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-4] [0-9] | [5] "0")) space
item-age-kv ::= "\"age\"" space ":" space item-age
item-name ::= "\"" char{1,100} "\"" space
item-name-kv ::= "\"name\"" space ":" space item-name
root ::= "[" space item ("," space item){9,99} "]" space
space ::= | " " | "\n" [ \t]{0,20}
```
</details>
Here is also a list of known limitations (contributions welcome):
- `additionalProperties` defaults to `false` (produces faster grammars + reduces hallucinations).
- `"additionalProperties": true` may produce keys that contain unescaped newlines.
- Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp).
- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
- [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works)
- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`: only supported for `"type": "integer"` for now, not `number`
- Nested `$ref`s are broken (https://github.com/ggerganov/llama.cpp/issues/8073)
- [pattern](https://json-schema.org/draft/2020-12/json-schema-validation#name-pattern)s must start with `^` and end with `$`
- Remote `$ref`s not supported in the C++ version (Python & JavaScript versions fetch https refs)
- `string` [formats](https://json-schema.org/draft/2020-12/json-schema-validation#name-defined-formats) lack `uri`, `email`
- No [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)
And a non-exhaustive list of other unsupported features that are unlikely to be implemented (hard and/or too slow to support w/ stateless grammars):
- [`uniqueItems`](https://json-schema.org/draft/2020-12/json-schema-validation#name-uniqueitems)
- [`contains`](https://json-schema.org/draft/2020-12/json-schema-core#name-contains) / `minContains`
- `$anchor` (cf. [dereferencing](https://json-schema.org/draft/2020-12/json-schema-core#name-dereferencing))
- [`not`](https://json-schema.org/draft/2020-12/json-schema-core#name-not)
- [Conditionals](https://json-schema.org/draft/2020-12/json-schema-core#name-keywords-for-applying-subsche) `if` / `then` / `else` / `dependentSchemas`
### A word about additionalProperties
> [!WARNING]
> The JSON schemas spec states `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties) by default.
> Since this is slow and seems prone to hallucinations, we default to no additional properties.
> You can set `"additionalProperties": true` in the schema of any object to explicitly allow additional properties.
If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can enable additional properties with the `extra` config on each model class:
```python
# pip install pydantic
import json
from typing import Annotated, List
from pydantic import BaseModel, Extra, Field
class QAPair(BaseModel):
class Config:
extra = 'allow' # triggers additionalProperties: true in the JSON schema
question: str
concise_answer: str
justification: str
class Summary(BaseModel):
class Config:
extra = 'allow'
key_facts: List[Annotated[str, Field(pattern='- .{5,}')]]
question_answers: List[Annotated[List[QAPair], Field(min_items=5)]]
print(json.dumps(Summary.model_json_schema(), indent=2))
```
<details>
<summary>Show JSON schema & grammar</summary>
```json
{
"$defs": {
"QAPair": {
"additionalProperties": true,
"properties": {
"question": {
"title": "Question",
"type": "string"
},
"concise_answer": {
"title": "Concise Answer",
"type": "string"
},
"justification": {
"title": "Justification",
"type": "string"
}
},
"required": [
"question",
"concise_answer",
"justification"
],
"title": "QAPair",
"type": "object"
}
},
"additionalProperties": true,
"properties": {
"key_facts": {
"items": {
"pattern": "^- .{5,}$",
"type": "string"
},
"title": "Key Facts",
"type": "array"
},
"question_answers": {
"items": {
"items": {
"$ref": "#/$defs/QAPair"
},
"minItems": 5,
"type": "array"
},
"title": "Question Answers",
"type": "array"
}
},
"required": [
"key_facts",
"question_answers"
],
"title": "Summary",
"type": "object"
}
```
```
QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv ( "," space ( QAPair-additional-kv ( "," space QAPair-additional-kv )* ) )? "}" space
QAPair-additional-k ::= ["] ( [c] ([o] ([n] ([c] ([i] ([s] ([e] ([_] ([a] ([n] ([s] ([w] ([e] ([r] char+ | [^"r] char*) | [^"e] char*) | [^"w] char*) | [^"s] char*) | [^"n] char*) | [^"a] char*) | [^"_] char*) | [^"e] char*) | [^"s] char*) | [^"i] char*) | [^"c] char*) | [^"n] char*) | [^"o] char*) | [j] ([u] ([s] ([t] ([i] ([f] ([i] ([c] ([a] ([t] ([i] ([o] ([n] char+ | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"a] char*) | [^"c] char*) | [^"i] char*) | [^"f] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"u] char*) | [q] ([u] ([e] ([s] ([t] ([i] ([o] ([n] char+ | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"e] char*) | [^"u] char*) | [^"cjq] char* )? ["] space
QAPair-additional-kv ::= QAPair-additional-k ":" space value
QAPair-concise-answer-kv ::= "\"concise_answer\"" space ":" space string
QAPair-justification-kv ::= "\"justification\"" space ":" space string
QAPair-question-kv ::= "\"question\"" space ":" space string
additional-k ::= ["] ( [k] ([e] ([y] ([_] ([f] ([a] ([c] ([t] ([s] char+ | [^"s] char*) | [^"t] char*) | [^"c] char*) | [^"a] char*) | [^"f] char*) | [^"_] char*) | [^"y] char*) | [^"e] char*) | [q] ([u] ([e] ([s] ([t] ([i] ([o] ([n] ([_] ([a] ([n] ([s] ([w] ([e] ([r] ([s] char+ | [^"s] char*) | [^"r] char*) | [^"e] char*) | [^"w] char*) | [^"s] char*) | [^"n] char*) | [^"a] char*) | [^"_] char*) | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"e] char*) | [^"u] char*) | [^"kq] char* )? ["] space
additional-kv ::= additional-k ":" space value
array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
dot ::= [^\x0A\x0D]
integral-part ::= [0] | [1-9] [0-9]{0,15}
key-facts ::= "[" space (key-facts-item ("," space key-facts-item)*)? "]" space
key-facts-item ::= "\"" "- " key-facts-item-1{5,} "\"" space
key-facts-item-1 ::= dot
key-facts-kv ::= "\"key_facts\"" space ":" space key-facts
null ::= "null" space
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
question-answers ::= "[" space (question-answers-item ("," space question-answers-item)*)? "]" space
question-answers-item ::= "[" space question-answers-item-item ("," space question-answers-item-item){4,} "]" space
question-answers-item-item ::= QAPair
question-answers-kv ::= "\"question_answers\"" space ":" space question-answers
root ::= "{" space key-facts-kv "," space question-answers-kv ( "," space ( additional-kv ( "," space additional-kv )* ) )? "}" space
space ::= | " " | "\n" [ \t]{0,20}
string ::= "\"" char* "\"" space
value ::= object | array | string | number | boolean | null
```
</details>
If you're using [Zod](https://zod.dev/), you can make your objects explicitly allow extra properties with `nonstrict()` / `passthrough()` (or explicitly disallow them with `z.object(...).strict()` or `z.strictObject(...)`), but note that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always sets `"additionalProperties": false` anyway.
```js
import { z } from 'zod';
import { zodToJsonSchema } from 'zod-to-json-schema';
const Foo = z.object({
age: z.number().positive(),
email: z.string().email(),
}).strict();
console.log(zodToJsonSchema(Foo));
```
<details>
<summary>Show JSON schema & grammar</summary>
```json
{
"type": "object",
"properties": {
"age": {
"type": "number",
"exclusiveMinimum": 0
},
"email": {
"type": "string",
"format": "email"
}
},
"required": [
"age",
"email"
],
"additionalProperties": false,
"$schema": "http://json-schema.org/draft-07/schema#"
}
```
```
age-kv ::= "\"age\"" space ":" space number
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
email-kv ::= "\"email\"" space ":" space string
integral-part ::= [0] | [1-9] [0-9]{0,15}
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space age-kv "," space email-kv "}" space
space ::= | " " | "\n" [ \t]{0,20}
string ::= "\"" char* "\"" space
```
</details>
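
Whichever of the above you use to produce a schema, it can be converted to a GBNF grammar offline with the bundled `examples/json_schema_to_grammar.py` script (the same converter the test suite below invokes). The following is a minimal sketch, assuming it is run from the repository root; the `schema.json` / `grammar.gbnf` file names are arbitrary placeholders:

```python
# Minimal sketch: dump a Pydantic model's JSON schema to a file, then convert
# it to a GBNF grammar with examples/json_schema_to_grammar.py.
# Assumes the working directory is the llama.cpp repository root; file names
# are placeholders.
import json
import subprocess

from pydantic import BaseModel


class Foo(BaseModel):
    age: int
    email: str


with open("schema.json", "w") as f:
    json.dump(Foo.model_json_schema(), f, indent=2)

# The converter prints the grammar to stdout, so redirect it into a file.
with open("grammar.gbnf", "w") as f:
    subprocess.run(
        ["python", "./examples/json_schema_to_grammar.py", "schema.json"],
        stdout=f, check=True)
```

The resulting grammar file can then be used wherever a GBNF grammar is accepted.
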

View file

@ -88,6 +88,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_DBRX = 13, LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14, LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
LLAMA_VOCAB_PRE_TYPE_PORO = 15, LLAMA_VOCAB_PRE_TYPE_PORO = 15,
LLAMA_VOCAB_PRE_TYPE_VIKING = 16,
}; };
// note: these values should be synchronized with ggml_rope // note: these values should be synchronized with ggml_rope

View file

@ -136,42 +136,41 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
# LICENSE -> LICENSE # LICENSE -> LICENSE
# scripts/gen-authors.sh -> scripts/gen-authors.sh # scripts/gen-authors.sh -> scripts/gen-authors.sh
cat ggml-src.patch | sed \ cat ggml-src.patch | sed -E \
-e 's/CMakeLists.txt/ggml\/CMakeLists.txt/g' \ -e 's/([[:space:]]|[ab]\/)CMakeLists.txt/\1ggml\/CMakeLists.txt/g' \
-e 's/src\/CMakeLists.txt/ggml\/src\/CMakeLists.txt/g' \ -e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \
-e 's/cmake\/FindSIMD.cmake/ggml\/cmake\/FindSIMD.cmake/g' \ -e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \
-e 's/src\/ggml\.c/ggml/src/ggml.c/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml\.c/\1ggml\/src\/ggml.c/g' \
-e 's/src\/ggml-alloc\.c/ggml/src/ggml-alloc.c/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \
-e 's/src\/ggml-backend-impl\.h/ggml/src/ggml-backend-impl.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \
-e 's/src\/ggml-backend\.c/ggml/src/ggml-backend.c/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.c/\1ggml\/src\/ggml-backend.c/g' \
-e 's/src\/ggml-common\.h/ggml/src/ggml-common.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-common\.h/\1ggml\/src\/ggml-common.h/g' \
-e 's/src\/ggml-cuda\//ggml-cuda\//g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \
-e 's/src\/ggml-cuda\.cu/ggml/src/ggml-cuda.cu/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\.cu/\1ggml\/src\/ggml-cuda.cu/g' \
-e 's/src\/ggml-impl\.h/ggml/src/ggml-impl.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-impl\.h/\1ggml\/src\/ggml-impl.h/g' \
-e 's/src\/ggml-kompute\.cpp/ggml/src/ggml-kompute.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-kompute\.cpp/\1ggml\/src\/ggml-kompute.cpp/g' \
-e 's/src\/ggml-metal\.m/ggml/src/ggml-metal.m/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-metal\.m/\1ggml\/src\/ggml-metal.m/g' \
-e 's/src\/ggml-quants\.c/ggml/src/ggml-quants.c/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.c/\1ggml\/src\/ggml-quants.c/g' \
-e 's/src\/ggml-quants\.h/ggml/src/ggml-quants.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.h/\1ggml\/src\/ggml-quants.h/g' \
-e 's/src\/ggml-rpc\.cpp/ggml/src/ggml-rpc.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\.cpp/\1ggml\/src\/ggml-rpc.cpp/g' \
-e 's/src\/ggml-sycl\.cpp/ggml/src/ggml-sycl.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \
-e 's/src\/ggml-vulkan\.cpp/ggml/src/ggml-vulkan.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \
-e 's/include\/ggml\.h/ggml/include/ggml.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \
-e 's/include\/ggml-alloc\.h/ggml/include/ggml-alloc.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \
-e 's/include\/ggml-backend\.h/ggml/include/ggml-backend.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \
-e 's/include\/ggml-blas\.h/ggml/include/ggml-blas.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-blas\.h/\1ggml\/include\/ggml-blas.h/g' \
-e 's/include\/ggml-cuda\.h/ggml/include/ggml-cuda.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-cuda\.h/\1ggml\/include\/ggml-cuda.h/g' \
-e 's/include\/ggml-kompute\.h/ggml/include/ggml-kompute.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-kompute\.h/\1ggml\/include\/ggml-kompute.h/g' \
-e 's/include\/ggml-metal\.h/ggml/include/ggml-metal.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-metal\.h/\1ggml\/include\/ggml-metal.h/g' \
-e 's/include\/ggml-rpc\.h/ggml/include/ggml-rpc.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-rpc\.h/\1ggml\/include\/ggml-rpc.h/g' \
-e 's/include\/ggml-sycl\.h/ggml/include/ggml-sycl.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-sycl\.h/\1ggml\/include\/ggml-sycl.h/g' \
-e 's/include\/ggml-vulkan\.h/ggml/include/ggml-vulkan.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-vulkan\.h/\1ggml\/include\/ggml-vulkan.h/g' \
-e 's/tests\/test-opt\.cpp/tests\/test-opt.cpp/g' \ -e 's/([[:space:]]|[ab]\/)examples\/common\.h/examples\/common.h/g' \
-e 's/tests\/test-grad0\.cpp/tests\/test-grad0.cpp/g' \ -e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/examples\/common.cpp/g' \
-e 's/tests\/test-quantize-fns\.cpp/tests\/test-quantize-fns.cpp/g' \ -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/examples\/common-ggml.h/g' \
-e 's/tests\/test-quantize-perf\.cpp/tests\/test-quantize-perf.cpp/g' \ -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.cpp/examples\/common-ggml.cpp/g' \
-e 's/tests\/test-backend-ops\.cpp/tests\/test-backend-ops.cpp/g' \ -e 's/([[:space:]]|[ab]\/)LICENSE/LICENSE/g' \
-e 's/LICENSE/LICENSE/g' \ -e 's/([[:space:]]|[ab]\/)scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \
-e 's/scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \
> ggml-src.patch.tmp > ggml-src.patch.tmp
mv ggml-src.patch.tmp ggml-src.patch mv ggml-src.patch.tmp ggml-src.patch

View file

@ -217,6 +217,7 @@ enum llm_arch {
LLM_ARCH_INTERNLM2, LLM_ARCH_INTERNLM2,
LLM_ARCH_MINICPM, LLM_ARCH_MINICPM,
LLM_ARCH_GEMMA, LLM_ARCH_GEMMA,
LLM_ARCH_GEMMA2,
LLM_ARCH_STARCODER2, LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA, LLM_ARCH_MAMBA,
LLM_ARCH_XVERSE, LLM_ARCH_XVERSE,
@ -257,6 +258,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_INTERNLM2, "internlm2" }, { LLM_ARCH_INTERNLM2, "internlm2" },
{ LLM_ARCH_MINICPM, "minicpm" }, { LLM_ARCH_MINICPM, "minicpm" },
{ LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_GEMMA, "gemma" },
{ LLM_ARCH_GEMMA2, "gemma2" },
{ LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" }, { LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" }, { LLM_ARCH_XVERSE, "xverse" },
@ -478,10 +480,12 @@ enum llm_tensor {
LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2, LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_OUT_NORM, LLM_TENSOR_ATTN_OUT_NORM,
LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_ATTN_ROT_EMBD, LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE_INP, LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_INP_SHEXP, LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_NORM, LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_POST_NORM,
LLM_TENSOR_FFN_GATE, LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP, LLM_TENSOR_FFN_UP,
@ -1004,6 +1008,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
}, },
}, },
{
LLM_ARCH_GEMMA2,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
},
},
{ {
LLM_ARCH_STARCODER2, LLM_ARCH_STARCODER2,
{ {
@ -2038,6 +2060,9 @@ enum e_model {
MODEL_8x22B, MODEL_8x22B,
MODEL_16x12B, MODEL_16x12B,
MODEL_10B_128x3_66B, MODEL_10B_128x3_66B,
MODEL_57B_A14B,
MODEL_9B,
MODEL_27B,
}; };
static const size_t kiB = 1024; static const size_t kiB = 1024;
@ -2214,6 +2239,7 @@ struct llama_layer {
struct ggml_tensor * attn_q_a_norm; struct ggml_tensor * attn_q_a_norm;
struct ggml_tensor * attn_kv_a_norm; struct ggml_tensor * attn_kv_a_norm;
struct ggml_tensor * attn_sub_norm; struct ggml_tensor * attn_sub_norm;
struct ggml_tensor * attn_post_norm;
struct ggml_tensor * ffn_sub_norm; struct ggml_tensor * ffn_sub_norm;
// attention // attention
@ -2237,6 +2263,7 @@ struct llama_layer {
// normalization // normalization
struct ggml_tensor * ffn_norm; struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b; struct ggml_tensor * ffn_norm_b;
struct ggml_tensor * ffn_post_norm;
struct ggml_tensor * layer_out_norm; struct ggml_tensor * layer_out_norm;
struct ggml_tensor * layer_out_norm_b; struct ggml_tensor * layer_out_norm_b;
struct ggml_tensor * ffn_norm_exps; struct ggml_tensor * ffn_norm_exps;
@ -4267,6 +4294,9 @@ static const char * llama_model_type_name(e_model type) {
case MODEL_8x22B: return "8x22B"; case MODEL_8x22B: return "8x22B";
case MODEL_16x12B: return "16x12B"; case MODEL_16x12B: return "16x12B";
case MODEL_10B_128x3_66B: return "10B+128x3.66B"; case MODEL_10B_128x3_66B: return "10B+128x3.66B";
case MODEL_57B_A14B: return "57B.A14B";
case MODEL_9B: return "9B";
case MODEL_27B: return "27B";
default: return "?B"; default: return "?B";
} }
} }
@ -4588,6 +4618,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 24: model.type = e_model::MODEL_A2_7B; break; case 24: model.type = e_model::MODEL_A2_7B; break;
case 28: model.type = e_model::MODEL_57B_A14B; break;
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
@ -4668,6 +4699,16 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_GEMMA2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 42: model.type = e_model::MODEL_9B; break;
case 46: model.type = e_model::MODEL_27B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_STARCODER2: case LLM_ARCH_STARCODER2:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@ -5067,6 +5108,9 @@ static void llm_load_vocab(
} else if ( } else if (
tokenizer_pre == "poro-chat") { tokenizer_pre == "poro-chat") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
} else if (
tokenizer_pre == "viking") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
} else { } else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
} }
@ -5152,10 +5196,10 @@ static void llm_load_vocab(
if (gen_name.find("code") != std::string::npos) { if (gen_name.find("code") != std::string::npos) {
if (model.arch == LLM_ARCH_LLAMA if (model.arch == LLM_ARCH_LLAMA
&& 32010 < vocab.id_to_token.size() && 32010 < vocab.id_to_token.size()
&& vocab.id_to_token[32007].text == "<PRE>" && vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
&& vocab.id_to_token[32008].text == "<SUF>" && vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
&& vocab.id_to_token[32009].text == "<MID>" && vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
&& vocab.id_to_token[32010].text == "<EOT>") { && vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
vocab.special_prefix_id = 32007; vocab.special_prefix_id = 32007;
vocab.special_suffix_id = 32008; vocab.special_suffix_id = 32008;
vocab.special_middle_id = 32009; vocab.special_middle_id = 32009;
@ -6506,6 +6550,40 @@ static bool llm_load_tensors(
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
} }
} break; } break;
case LLM_ARCH_GEMMA2:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
// output
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
const int64_t n_ff = hparams.n_ff;
const int64_t n_embd_head_k = hparams.n_embd_head_k;
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
for (uint32_t i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
layer.attn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
}
} break;
case LLM_ARCH_STARCODER2: case LLM_ARCH_STARCODER2:
{ {
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@ -10917,6 +10995,125 @@ struct llm_build_context {
return gf; return gf;
} }
struct ggml_cgraph * build_gemma2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head_k = hparams.n_embd_head_k;
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
for (int il = 0; il < n_layer; ++il) {
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il);
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
cb(Qcur, "Qcur_scaled", il);
Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
}
cur = llm_build_norm(ctx0, cur, hparams,
model.layers[il].attn_post_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_post_norm", il);
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
cb(sa_out, "sa_out", il);
cur = llm_build_norm(ctx0, sa_out, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
// feed-forward network
{
cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
cur = llm_build_norm(ctx0, cur, hparams,
model.layers[il].ffn_post_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "ffn_post_norm", -1);
cur = ggml_add(ctx0, cur, sa_out);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
struct ggml_cgraph * build_starcoder2() { struct ggml_cgraph * build_starcoder2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@ -12297,6 +12494,10 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_gemma(); result = llm.build_gemma();
} break; } break;
case LLM_ARCH_GEMMA2:
{
result = llm.build_gemma2();
} break;
case LLM_ARCH_STARCODER2: case LLM_ARCH_STARCODER2:
{ {
result = llm.build_starcoder2(); result = llm.build_starcoder2();
@ -13703,6 +13904,12 @@ struct llm_tokenizer_bpe {
" ?[^(\\s|.,!?…。,、।۔،)]+", " ?[^(\\s|.,!?…。,、।۔،)]+",
}; };
break; break;
case LLAMA_VOCAB_PRE_TYPE_VIKING:
regex_exprs = {
"\\p{N}",
" ?[^(\\s|.,!?…。,、।۔،)]+",
};
break;
default: default:
// default regex for BPE tokenization pre-processing // default regex for BPE tokenization pre-processing
regex_exprs = { regex_exprs = {
@ -17585,6 +17792,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_PHI2: case LLM_ARCH_PHI2:
case LLM_ARCH_PHI3: case LLM_ARCH_PHI3:
case LLM_ARCH_GEMMA: case LLM_ARCH_GEMMA:
case LLM_ARCH_GEMMA2:
case LLM_ARCH_STARCODER2: case LLM_ARCH_STARCODER2:
case LLM_ARCH_GPTNEOX: case LLM_ARCH_GPTNEOX:
return LLAMA_ROPE_TYPE_NEOX; return LLAMA_ROPE_TYPE_NEOX;
@ -19405,7 +19613,10 @@ static int32_t llama_chat_apply_template_internal(
std::string & dest, bool add_ass) { std::string & dest, bool add_ass) {
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527 // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
std::stringstream ss; std::stringstream ss;
if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) { auto tmpl_contains = [&tmpl](std::string haystack) -> bool {
return tmpl.find(haystack) != std::string::npos;
};
if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) {
// chatml template // chatml template
for (auto message : chat) { for (auto message : chat) {
ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n"; ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@ -19413,16 +19624,16 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<|im_start|>assistant\n"; ss << "<|im_start|>assistant\n";
} }
} else if (tmpl == "llama2" || tmpl == "mistral" || tmpl.find("[INST]") != std::string::npos) { } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl_contains("[INST]")) {
// llama2 template and its variants // llama2 template and its variants
// [variant] support system message // [variant] support system message
bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos || tmpl == "mistral"; bool support_system_message = tmpl_contains("<<SYS>>") || tmpl == "mistral";
// [variant] space before + after response // [variant] space before + after response
bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos; bool space_around_response = tmpl_contains("' ' + eos_token");
// [variant] add BOS inside history // [variant] add BOS inside history
bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos; bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
// [variant] trim spaces from the input message // [variant] trim spaces from the input message
bool strip_message = tmpl.find("content.strip()") != std::string::npos; bool strip_message = tmpl_contains("content.strip()");
// construct the prompt // construct the prompt
bool is_inside_turn = true; // skip BOS at the beginning bool is_inside_turn = true; // skip BOS at the beginning
ss << "[INST] "; ss << "[INST] ";
@ -19448,7 +19659,7 @@ static int32_t llama_chat_apply_template_internal(
} }
} }
// llama2 templates seem to not care about "add_generation_prompt" // llama2 templates seem to not care about "add_generation_prompt"
} else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) { } else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
// Phi 3 // Phi 3
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
@ -19457,7 +19668,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<|assistant|>\n"; ss << "<|assistant|>\n";
} }
} else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) { } else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) {
// zephyr template // zephyr template
for (auto message : chat) { for (auto message : chat) {
ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n"; ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@ -19465,7 +19676,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<|assistant|>\n"; ss << "<|assistant|>\n";
} }
} else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) { } else if (tmpl == "monarch" || tmpl_contains("bos_token + message['role']")) {
// mlabonne/AlphaMonarch-7B template (the <s> is included inside history) // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
for (auto message : chat) { for (auto message : chat) {
std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@ -19474,7 +19685,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<s>assistant\n"; ss << "<s>assistant\n";
} }
} else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) { } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl_contains("<start_of_turn>")) {
// google/gemma-7b-it // google/gemma-7b-it
std::string system_prompt = ""; std::string system_prompt = "";
for (auto message : chat) { for (auto message : chat) {
@ -19496,7 +19707,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<start_of_turn>model\n"; ss << "<start_of_turn>model\n";
} }
} else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) { } else if (tmpl == "orion" || tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
// OrionStarAI/Orion-14B-Chat // OrionStarAI/Orion-14B-Chat
std::string system_prompt = ""; std::string system_prompt = "";
for (auto message : chat) { for (auto message : chat) {
@ -19516,7 +19727,7 @@ static int32_t llama_chat_apply_template_internal(
ss << message->content << "</s>"; ss << message->content << "</s>";
} }
} }
} else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) { } else if (tmpl == "openchat" || tmpl_contains("GPT4 Correct ")) {
// openchat/openchat-3.5-0106, // openchat/openchat-3.5-0106,
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
@ -19530,13 +19741,13 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "GPT4 Correct Assistant:"; ss << "GPT4 Correct Assistant:";
} }
} else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) { } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: "))) {
// eachadea/vicuna-13b-1.1 (and Orca variant) // eachadea/vicuna-13b-1.1 (and Orca variant)
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
if (role == "system") { if (role == "system") {
// Orca-Vicuna variant uses a system prefix // Orca-Vicuna variant uses a system prefix
if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) { if (tmpl == "vicuna-orca" || tmpl_contains("SYSTEM: ")) {
ss << "SYSTEM: " << message->content << "\n"; ss << "SYSTEM: " << message->content << "\n";
} else { } else {
ss << message->content << "\n\n"; ss << message->content << "\n\n";
@ -19550,7 +19761,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "ASSISTANT:"; ss << "ASSISTANT:";
} }
} else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) { } else if (tmpl == "deepseek" || (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>"))) {
// deepseek-ai/deepseek-coder-33b-instruct // deepseek-ai/deepseek-coder-33b-instruct
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
@ -19565,7 +19776,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "### Response:\n"; ss << "### Response:\n";
} }
} else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) { } else if (tmpl == "command-r" || (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>"))) {
// CohereForAI/c4ai-command-r-plus // CohereForAI/c4ai-command-r-plus
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
@ -19580,7 +19791,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"; ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
} }
} else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) { } else if (tmpl == "llama3" || (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>"))) {
// Llama 3 // Llama 3
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
@ -19589,6 +19800,33 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
} }
} else if (tmpl == "minicpm" || tmpl_contains(u8"<用户>")) {
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
for (auto message : chat) {
std::string role(message->role);
if (role == "user") {
ss << u8"<用户>";
ss << trim(message->content);
ss << "<AI>";
} else {
ss << trim(message->content);
}
}
} else if (tmpl == "deepseek2" || tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
// DeepSeek-V2
for (auto message : chat) {
std::string role(message->role);
if (role == "system") {
ss << message->content << "\n\n";
} else if (role == "user") {
ss << "User: " << message->content << "\n\n";
} else if (role == "assistant") {
ss << "Assistant: " << message->content << u8"<end▁of▁sentence>";
}
}
if (add_ass) {
ss << "Assistant:";
}
} else { } else {
// template not supported // template not supported
return -1; return -1;

View file

@ -57,7 +57,11 @@ int main(void) {
//Phi-3-medium //Phi-3-medium
"{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}", "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
//Phi-3-vision //Phi-3-vision
"{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}" "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}",
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
u8"{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}",
// DeepSeek-V2
"{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
}; };
std::vector<std::string> expected_output = { std::vector<std::string> expected_output = {
// teknium/OpenHermes-2.5-Mistral-7B // teknium/OpenHermes-2.5-Mistral-7B
@ -94,6 +98,10 @@ int main(void) {
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
//Phi-3-vision //Phi-3-vision
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
u8"You are a helpful assistant<用户>Hello<AI>Hi there<用户>Who are you<AI>I am an assistant<用户>Another question<AI>",
// DeepSeek-V2
u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<end▁of▁sentence>User: Who are you\n\nAssistant: I am an assistant <end▁of▁sentence>User: Another question\n\nAssistant:",
}; };
std::vector<char> formatted_chat(1024); std::vector<char> formatted_chat(1024);
int32_t res; int32_t res;

View file

@ -993,6 +993,40 @@ static void test_json_schema() {
} }
); );
test_schema(
"simple pattern",
// Schema
R"""({
"pattern": "^[a-zA-Z0-9_-]*$"
})""",
// Passing strings
{
R"""("")""",
R"""("He_llo-12")""",
},
// Failing strings
{
R"""("!")""",
R"""("Hello World")""",
}
);
test_schema(
"pattern with escapes",
// Schema
R"""({
"pattern": "^a\\^\\$\\.\\[\\]\\(\\)\\|\\{\\}\\*\\+\\?b$"
})""",
// Passing strings
{
R"""("a^$.[]()|{}*+?b")""",
},
// Failing strings
{
R"""("ab")""",
}
);
test_schema( test_schema(
"", "",
// Schema // Schema
@ -1062,8 +1096,6 @@ static void test_json_schema() {
R"""({ "number": 1600, "street_name": "Pennsylvania" })""", R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
// "By extension, even an empty object is valid" // "By extension, even an empty object is valid"
R"""({})""", R"""({})""",
// "By default, providing additional properties is valid"
R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""", R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
}, },
// Failing strings // Failing strings
@ -1074,6 +1106,9 @@ static void test_json_schema() {
R"""({ "street_name": "Pennsylvania", "number": 1600 })""", R"""({ "street_name": "Pennsylvania", "number": 1600 })""",
// Reorder properties // Reorder properties
R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""", R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
// "Additional properties default to false for generation, even though the spec says true.
R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
} }
); );

View file

@ -1120,28 +1120,15 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""( R"""(
alternative-0 ::= foo alternative-0 ::= foo
alternative-1 ::= bar alternative-1 ::= bar
array ::= "[" space ( value ("," space value)* )? "]" space bar ::= "{" space (bar-b-kv )? "}" space
bar ::= "{" space (bar-b-kv bar-b-rest | bar-additional-kv ( "," space bar-additional-kv )* )? "}" space
bar-additional-k ::= ["] ( [b] char+ | [^"b] char* )? ["] space
bar-additional-kv ::= bar-additional-k ":" space value
bar-b-kv ::= "\"b\"" space ":" space number bar-b-kv ::= "\"b\"" space ":" space number
bar-b-rest ::= ( "," space bar-additional-kv )*
boolean ::= ("true" | "false") space
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16} decimal-part ::= [0-9]{1,16}
foo ::= "{" space (foo-a-kv foo-a-rest | foo-additional-kv ( "," space foo-additional-kv )* )? "}" space foo ::= "{" space (foo-a-kv )? "}" space
foo-a-kv ::= "\"a\"" space ":" space number foo-a-kv ::= "\"a\"" space ":" space number
foo-a-rest ::= ( "," space foo-additional-kv )*
foo-additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["] space
foo-additional-kv ::= foo-additional-k ":" space value
integral-part ::= [0] | [1-9] [0-9]{0,15} integral-part ::= [0] | [1-9] [0-9]{0,15}
null ::= "null" space
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= alternative-0 | alternative-1 root ::= alternative-0 | alternative-1
space ::= | " " | "\n" [ \t]{0,20} space ::= | " " | "\n" [ \t]{0,20}
string ::= "\"" char* "\"" space
value ::= object | array | string | number | boolean | null
)""" )"""
}); });
@ -1177,25 +1164,15 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""", })""",
R"""( R"""(
a-kv ::= "\"a\"" space ":" space number a-kv ::= "\"a\"" space ":" space number
additional-k ::= ["] ( [a] char+ | [b] char+ | [c] char+ | [d] char+ | [^"abcd] char* )? ["] space
additional-kv ::= additional-k ":" space value
array ::= "[" space ( value ("," space value)* )? "]" space
b-kv ::= "\"b\"" space ":" space number b-kv ::= "\"b\"" space ":" space number
boolean ::= ("true" | "false") space
c-kv ::= "\"c\"" space ":" space number c-kv ::= "\"c\"" space ":" space number
c-rest ::= ( "," space additional-kv )*
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
d-kv ::= "\"d\"" space ":" space number d-kv ::= "\"d\"" space ":" space number
d-rest ::= ( "," space c-kv )? c-rest d-rest ::= ( "," space c-kv )?
decimal-part ::= [0-9]{1,16} decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15} integral-part ::= [0] | [1-9] [0-9]{0,15}
null ::= "null" space
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv c-rest | additional-kv ( "," space additional-kv )* ) )? "}" space
space ::= | " " | "\n" [ \t]{0,20} space ::= | " " | "\n" [ \t]{0,20}
string ::= "\"" char* "\"" space
value ::= object | array | string | number | boolean | null
)""" )"""
}); });
@ -1262,26 +1239,30 @@ int main() {
} }
}); });
if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) { if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) {
test_all("Python", [](const TestCase & tc) { fprintf(stderr, "\033[33mWARNING: Skipping slow tests on emulator.\n\033[0m");
write("test-json-schema-input.tmp", tc.schema);
tc.verify_status(std::system(
"python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
tc.verify(read("test-grammar-output.tmp"));
});
} else { } else {
fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m"); if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) {
} test_all("Python", [](const TestCase & tc) {
write("test-json-schema-input.tmp", tc.schema);
tc.verify_status(std::system(
"python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
tc.verify(read("test-grammar-output.tmp"));
});
} else {
fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m");
}
if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) { if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) {
test_all("JavaScript", [](const TestCase & tc) { test_all("JavaScript", [](const TestCase & tc) {
write("test-json-schema-input.tmp", tc.schema); write("test-json-schema-input.tmp", tc.schema);
tc.verify_status(std::system( tc.verify_status(std::system(
"node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE); "node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
tc.verify(read("test-grammar-output.tmp")); tc.verify(read("test-grammar-output.tmp"));
}); });
} else { } else {
fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m"); fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m");
}
} }
test_all("Check Expectations Validity", [](const TestCase & tc) { test_all("Check Expectations Validity", [](const TestCase & tc) {