Merge remote-tracking branch 'origin/master' into json-type

commit 7caa7b9e83
38 changed files with 2203 additions and 232 deletions
@@ -30,8 +30,10 @@ RUN make -j$(nproc) llama-server
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
 
 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
 
 COPY --from=build /app/llama-server /llama-server
 
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
 ENTRYPOINT [ "/llama-server" ]
@@ -20,10 +20,12 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
 
 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
+    apt-get install -y libcurl4-openssl-dev curl
 
 COPY --from=build /app/build/bin/llama-server /llama-server
 
 ENV LC_ALL=C.utf8
 
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
 ENTRYPOINT [ "/llama-server" ]
@@ -43,8 +43,10 @@ ENV CXX=/opt/rocm/llvm/bin/clang++
 # Enable cURL
 ENV LLAMA_CURL=1
 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
+    apt-get install -y libcurl4-openssl-dev curl
 
 RUN make -j$(nproc) llama-server
 
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
 ENTRYPOINT [ "/app/llama-server" ]
@@ -5,15 +5,11 @@ FROM ubuntu:$UBUNTU_VERSION as build
 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget
 
-# Install Vulkan SDK
+# Install Vulkan SDK and cURL
 RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
     wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
     apt update -y && \
-    apt-get install -y vulkan-sdk
+    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
 
-# Install cURL
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
-
 # Build it
 WORKDIR /app
@@ -28,4 +24,6 @@ RUN cp /app/build/bin/llama-server /llama-server && \
 
 ENV LC_ALL=C.utf8
 
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
 ENTRYPOINT [ "/llama-server" ]
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build
 
 RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
+    apt-get install -y build-essential git libcurl4-openssl-dev curl
 
 WORKDIR /app
@@ -22,4 +22,6 @@ COPY --from=build /app/llama-server /llama-server
 
 ENV LC_ALL=C.utf8
 
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
 ENTRYPOINT [ "/llama-server" ]
.github/workflows/docker.yml (vendored, 4 changes)
@@ -10,7 +10,7 @@
 name: Publish Docker image
 
 on:
-  pull_request:
+  #pull_request:
   push:
     branches:
       - master
@@ -22,7 +22,7 @@ concurrency:
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
-    if: github.event.pull_request.draft == false
+    #if: github.event.pull_request.draft == false
 
     runs-on: ubuntu-latest
     env:
@@ -1263,11 +1263,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         return true;
     }
     // cvector params
-    if (arg == "--completions-file") {
-        CHECK_ARG
-        params.cvector_completions_file = argv[i];
-        return true;
-    }
     if (arg == "--positive-file") {
         CHECK_ARG
         params.cvector_positive_file = argv[i];
@@ -1278,11 +1273,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.cvector_negative_file = argv[i];
         return true;
     }
-    if (arg == "--completions") {
-        CHECK_ARG
-        params.n_completions = std::stoi(argv[i]);
-        return true;
-    }
     if (arg == "--pca-batch") {
         CHECK_ARG
         params.n_pca_batch = std::stoi(argv[i]);
@@ -1293,6 +1283,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.n_pca_iterations = std::stoi(argv[i]);
         return true;
     }
+    if (arg == "--method") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        /**/ if (value == "pca")  { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
+        else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
+        else { invalid_param = true; }
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
@@ -1444,7 +1442,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
                         "negative prompt file to use for guidance" });
     options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
+    options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
+                        "set custom jinja chat template (default: template taken from model's metadata)\n"
+                        "only commonly used templates are accepted:\n"
+                        "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "grammar" });
     options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
     options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
@@ -1538,9 +1539,11 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
     options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
     options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
-    options.push_back({ "*", " --control-vector FNAME", "add a control vector" });
+    options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
+                        "note: this argument can be repeated to add multiple control vectors" });
     options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
-                        "add a control vector with user defined scaling SCALE" });
+                        "add a control vector with user defined scaling SCALE\n"
+                        "note: this argument can be repeated to add multiple scaled control vectors" });
     options.push_back({ "*", " --control-vector-layer-range START END",
                         "layer range to apply the control vector(s) to, start and end inclusive" });
     options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
@@ -1621,11 +1624,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() });
     options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
     options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
-    options.push_back({ "cvector", " --completions-file FNAME",
-                        "completions file (default: '%s')", params.cvector_completions_file.c_str() });
-    options.push_back({ "cvector", " --completions N", "number of lines of completions file to use (default: %d)", params.n_completions });
     options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
     options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
+    options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
 
     printf("usage: %s [options]\n", argv[0]);
@@ -2602,12 +2603,67 @@ bool llama_should_add_bos_token(const llama_model * model) {
     return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
 }
 
+//
+// Chat template utils
+//
+
 bool llama_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
     int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
     return res >= 0;
 }
 
+std::string llama_chat_apply_template(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & msgs,
+        bool add_ass) {
+    int alloc_size = 0;
+    std::vector<llama_chat_message> chat;
+    for (auto & msg : msgs) {
+        chat.push_back({msg.role.c_str(), msg.content.c_str()});
+        alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
+    }
+
+    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
+    std::vector<char> buf(alloc_size);
+
+    // run the first time to get the total output length
+    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+
+    // if it turns out that our buffer is too small, we resize it
+    if ((size_t) res > buf.size()) {
+        buf.resize(res);
+        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+    }
+
+    std::string formatted_chat(buf.data(), res);
+    return formatted_chat;
+}
+
+std::string llama_chat_format_single(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & past_msg,
+        const llama_chat_msg & new_msg,
+        bool add_ass) {
+    auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
+    std::vector<llama_chat_msg> chat_new(past_msg);
+    chat_new.push_back(new_msg);
+    auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
+    auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
+    return formatted;
+}
+
+std::string llama_chat_format_example(const struct llama_model * model,
+        const std::string & tmpl) {
+    std::vector<llama_chat_msg> msgs = {
+        {"system", "You are a helpful assistant"},
+        {"user", "Hello"},
+        {"assistant", "Hi there"},
+        {"user", "How are you?"},
+    };
+    return llama_chat_apply_template(model, tmpl, msgs, true);
+}
+
 //
 // KV cache utils
 //
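As a side note (not part of the commit), the new C++ wrappers compose as follows; this is a minimal sketch that assumes `model` is an already-loaded `llama_model *` and that an empty template string selects the template stored in the model's metadata:

```cpp
#include "common.h"

#include <cstdio>

// Sketch: incrementally format a conversation with the new helpers.
static void chat_format_demo(const struct llama_model * model) {
    std::vector<llama_chat_msg> history;

    // llama_chat_format_single returns only the suffix contributed by the
    // new message, so a caller can print deltas instead of re-rendering
    // the whole transcript (this is how main.cpp uses it further down).
    llama_chat_msg user_msg = {"user", "Hello"};
    std::string delta = llama_chat_format_single(model, "", history, user_msg, /* add_ass = */ true);
    history.push_back(user_msg);
    printf("%s", delta.c_str());

    // The full transcript can still be rendered in one call:
    std::string full = llama_chat_apply_template(model, "", history, /* add_ass = */ true);
    (void) full;
}
```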
@@ -52,6 +52,12 @@ int32_t cpu_get_num_math();
 // CLI argument parsing
 //
 
+// dimensionality reduction methods, used by cvector-generator
+enum dimre_method {
+    DIMRE_METHOD_PCA,
+    DIMRE_METHOD_MEAN,
+};
+
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
@@ -238,13 +244,12 @@ struct gpt_params {
     bool compute_ppl = true; // whether to compute perplexity
 
     // cvector-generator params
-    int n_completions = 64;
-    int n_pca_batch = 20;
+    int n_pca_batch = 100;
     int n_pca_iterations = 1000;
+    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
     std::string cvector_outfile = "control_vector.gguf";
-    std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
     std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
     std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
 };
 
 void gpt_params_handle_model_default(gpt_params & params);
@@ -365,9 +370,32 @@ bool llama_should_add_bos_token(const llama_model * model);
 // Chat template utils
 //
 
+// same with llama_chat_message, but uses std::string
+struct llama_chat_msg {
+    std::string role;
+    std::string content;
+};
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool llama_chat_verify_template(const std::string & tmpl);
 
+// CPP wrapper for llama_chat_apply_template
+std::string llama_chat_apply_template(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & chat,
+        bool add_ass);
+
+// Format single message, while taking into account the position of that message in chat history
+std::string llama_chat_format_single(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & past_msg,
+        const llama_chat_msg & new_msg,
+        bool add_ass);
+
+// Returns an example of formatted chat
+std::string llama_chat_format_example(const struct llama_model * model,
+        const std::string & tmpl);
+
 //
 // KV cache utils
 //
@@ -40,6 +40,233 @@ static std::string build_repetition(const std::string & item_rule, int min_items
     return result;
 }
 
+/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
+class string_view {
+    const std::string & _str;
+    const size_t _start;
+    const size_t _end;
+public:
+    string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
+
+    size_t size() const {
+        return _end - _start;
+    }
+
+    size_t length() const {
+        return size();
+    }
+
+    operator std::string() const {
+        return str();
+    }
+
+    std::string str() const {
+        return _str.substr(_start, _end - _start);
+    }
+
+    string_view substr(size_t pos, size_t len = std::string::npos) const {
+        return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
+    }
+
+    char operator[](size_t pos) const {
+        auto index = _start + pos;
+        if (index >= _end) {
+            throw std::out_of_range("string_view index out of range");
+        }
+        return _str[_start + pos];
+    }
+
+    bool operator==(const string_view & other) const {
+        std::string this_str = *this;
+        std::string other_str = other;
+        return this_str == other_str;
+    }
+};
+
+static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
+    auto has_min = min_value != std::numeric_limits<int>::min();
+    auto has_max = max_value != std::numeric_limits<int>::max();
+
+    auto digit_range = [&](char from, char to) {
+        out << "[";
+        if (from == to) {
+            out << from;
+        } else {
+            out << from << "-" << to;
+        }
+        out << "]";
+    };
+    auto more_digits = [&](int min_digits, int max_digits) {
+        out << "[0-9]";
+        if (min_digits == max_digits && min_digits == 1) {
+            return;
+        }
+        out << "{";
+        out << min_digits;
+        if (max_digits != min_digits) {
+            out << ",";
+            if (max_digits != std::numeric_limits<int>::max()) {
+                out << max_digits;
+            }
+        }
+        out << "}";
+    };
+    std::function<void(const string_view &, const string_view &)> uniform_range =
+        [&](const string_view & from, const string_view & to) {
+            size_t i = 0;
+            while (i < from.length() && i < to.length() && from[i] == to[i]) {
+                i++;
+            }
+            if (i > 0) {
+                out << "\"" << from.substr(0, i).str() << "\"";
+            }
+            if (i < from.length() && i < to.length()) {
+                if (i > 0) {
+                    out << " ";
+                }
+                auto sub_len = from.length() - i - 1;
+                if (sub_len > 0) {
+                    auto from_sub = from.substr(i + 1);
+                    auto to_sub = to.substr(i + 1);
+                    auto sub_zeros = repeat("0", sub_len);
+                    auto sub_nines = repeat("9", sub_len);
+
+                    auto to_reached = false;
+                    out << "(";
+                    if (from_sub == sub_zeros) {
+                        digit_range(from[i], to[i] - 1);
+                        out << " ";
+                        more_digits(sub_len, sub_len);
+                    } else {
+                        out << "[" << from[i] << "] ";
+                        out << "(";
+                        uniform_range(from_sub, sub_nines);
+                        out << ")";
+                        if (from[i] < to[i] - 1) {
+                            out << " | ";
+                            if (to_sub == sub_nines) {
+                                digit_range(from[i] + 1, to[i]);
+                                to_reached = true;
+                            } else {
+                                digit_range(from[i] + 1, to[i] - 1);
+                            }
+                            out << " ";
+                            more_digits(sub_len, sub_len);
+                        }
+                    }
+                    if (!to_reached) {
+                        out << " | ";
+                        digit_range(to[i], to[i]);
+                        out << " ";
+                        uniform_range(sub_zeros, to_sub);
+                    }
+                    out << ")";
+                } else {
+                    out << "[" << from[i] << "-" << to[i] << "]";
+                }
+            }
+        };
+
+    if (has_min && has_max) {
+        if (min_value < 0 && max_value < 0) {
+            out << "\"-\" (";
+            _build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
+            out << ")";
+            return;
+        }
+
+        if (min_value < 0) {
+            out << "\"-\" (";
+            _build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
+            out << ") | ";
+            min_value = 0;
+        }
+
+        auto min_s = std::to_string(min_value);
+        auto max_s = std::to_string(max_value);
+        auto min_digits = min_s.length();
+        auto max_digits = max_s.length();
+
+        for (auto digits = min_digits; digits < max_digits; digits++) {
+            uniform_range(min_s, repeat("9", digits));
+            min_s = "1" + repeat("0", digits);
+            out << " | ";
+        }
+        uniform_range(min_s, max_s);
+        return;
+    }
+
+    auto less_decimals = std::max(decimals_left - 1, 1);
+
+    if (has_min) {
+        if (min_value < 0) {
+            out << "\"-\" (";
+            _build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
+            out << ") | [0] | [1-9] ";
+            more_digits(0, decimals_left - 1);
+        } else if (min_value == 0) {
+            if (top_level) {
+                out << "[0] | [1-9] ";
+                more_digits(0, less_decimals);
+            } else {
+                more_digits(1, decimals_left);
+            }
+        } else if (min_value <= 9) {
+            char c = '0' + min_value;
+            auto range_start = top_level ? '1' : '0';
+            if (c > range_start) {
+                digit_range(range_start, c - 1);
+                out << " ";
+                more_digits(1, less_decimals);
+                out << " | ";
+            }
+            digit_range(c, '9');
+            out << " ";
+            more_digits(0, less_decimals);
+        } else {
+            auto min_s = std::to_string(min_value);
+            auto len = min_s.length();
+            auto c = min_s[0];
+
+            if (c > '1') {
+                digit_range(top_level ? '1' : '0', c - 1);
+                out << " ";
+                more_digits(len, less_decimals);
+                out << " | ";
+            }
+            digit_range(c, c);
+            out << " (";
+            _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
+            out << ")";
+            if (c < '9') {
+                out << " | ";
+                digit_range(c + 1, '9');
+                out << " ";
+                more_digits(len - 1, less_decimals);
+            }
+        }
+        return;
+    }
+
+    if (has_max) {
+        if (max_value >= 0) {
+            if (top_level) {
+                out << "\"-\" [1-9] ";
+                more_digits(0, less_decimals);
+                out << " | ";
+            }
+            _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
+        } else {
+            out << "\"-\" (";
+            _build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
+            out << ")";
+        }
+        return;
+    }
+
+    throw std::runtime_error("At least one of min_value or max_value must be set");
+}
+
 const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
 
 struct BuiltinRule {
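As an aside (not part of the commit), the helper is driven exactly the way the new integer branch in `SchemaConverter` does further down; a minimal sketch, assuming it runs in the same translation unit where `_build_min_max_int` is visible:

```cpp
// Sketch: emit a GBNF fragment that matches integers in [1, 100].
std::stringstream out;
out << "(";
_build_min_max_int(/* min_value = */ 1, /* max_value = */ 100, out);
out << ") space";
// out.str() is then registered as a grammar rule (see the _add_rule call below).
```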
@@ -160,7 +387,6 @@ static std::string format_literal(const std::string & literal) {
     return "\"" + escaped + "\"";
 }
 
-
 class SchemaConverter {
 private:
     std::function<json(const std::string &)> _fetch_json;
@@ -688,6 +914,24 @@ public:
         int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
         int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
         return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
+    } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
+        int min_value = std::numeric_limits<int>::min();
+        int max_value = std::numeric_limits<int>::max();
+        if (schema.contains("minimum")) {
+            min_value = schema["minimum"].get<int>();
+        } else if (schema.contains("exclusiveMinimum")) {
+            min_value = schema["exclusiveMinimum"].get<int>() + 1;
+        }
+        if (schema.contains("maximum")) {
+            max_value = schema["maximum"].get<int>();
+        } else if (schema.contains("exclusiveMaximum")) {
+            max_value = schema["exclusiveMaximum"].get<int>() - 1;
+        }
+        std::stringstream out;
+        out << "(";
+        _build_min_max_int(min_value, max_value, out);
+        out << ") space";
+        return _add_rule(rule_name, out.str());
     } else if (schema.empty() || schema_type == "object") {
         return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
     } else {
@@ -28,9 +28,13 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
 
         std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
 
-        result->grammar = llama_grammar_init(
+        struct llama_grammar * grammar = llama_grammar_init(
                 grammar_rules.data(),
                 grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
+        if (grammar == nullptr) {
+            throw std::runtime_error("Failed to initialize llama_grammar");
+        }
+        result->grammar = grammar;
     }
 
     result->prev.resize(params.n_prev);
@@ -59,9 +63,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
     if (!ctx->parsed_grammar.rules.empty()) {
         std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
 
-        ctx->grammar = llama_grammar_init(
+        struct llama_grammar * grammar = llama_grammar_init(
                 grammar_rules.data(),
                 grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
+        if (grammar == nullptr) {
+            throw std::runtime_error("Failed to initialize llama_grammar");
+        }
+        ctx->grammar = grammar;
     }
 
     std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
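One consequence of this change (an observation, not part of the commit): a bad grammar now surfaces as a thrown `std::runtime_error` rather than a silently null `grammar` pointer, so callers passing untrusted grammars may want to guard initialization. A hedged sketch, assuming `sparams` is a populated `llama_sampling_params`:

```cpp
#include <cstdio>
#include <stdexcept>

// Sketch: guard sampling-context creation against invalid grammars.
static llama_sampling_context * init_sampling_or_null(const struct llama_sampling_params & sparams) {
    try {
        return llama_sampling_init(sparams);
    } catch (const std::runtime_error & err) {
        fprintf(stderr, "grammar initialization failed: %s\n", err.what());
        return nullptr;
    }
}
```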
@@ -11,13 +11,16 @@ Related PRs:
 
 ```sh
 # CPU only
-./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf
+./cvector-generator -m ./llama-3.Q4_K_M.gguf
 
 # With GPU
-./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99
+./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99
 
 # With advanced options
-./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100
+./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100
+
+# Using mean value instead of PCA
+./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean
 
 # To see help message
 ./cvector-generator -h
@@ -32,3 +35,11 @@ If you have multiple lines per prompt, you can escape the newline character (cha
 <|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
 <|im_start|>system\nYou are in a very good mood today<|im_end|>
 ```
+
+Example to use output file with `llama-cli`:
+
+(Tips: The control vector works better when apply to layers higher than 10)
+
+```sh
+./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31
+```
@@ -2,6 +2,7 @@
 #include "llama.h"
 #include "ggml.h"
 #include "pca.hpp"
+#include "mean.hpp"
 
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
@@ -38,9 +39,10 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
     gpt_params_print_usage(argc, argv, params);
 
     printf("\nexample usage:\n");
-    printf("\n    CPU only:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
-    printf("\n    with GPU:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
-    printf("\n    advanced:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100\n", argv[0]);
+    printf("\n    CPU only:   %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
+    printf("\n    with GPU:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
+    printf("\n    advanced:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
+    printf("\n    using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
     printf("\n");
 }
@@ -223,23 +225,30 @@ struct train_context {
 
     // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
     // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
-    void build_v_diff() {
+    void build_v_diff(bool transpose) {
         printf("build_v_diff\n");
         for (int il = 0; il < n_layers - 1; il++) {
             auto & diff_tmp = v_diff_tmp[il];
             int n_elem = diff_tmp.size() / sizeof(float);
             GGML_ASSERT(n_elem % n_embd == 0);
             int n_rows = n_elem / n_embd;
-            struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
+            struct ggml_tensor * diff = transpose
+                ? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
+                : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
             ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
-            // copy data & transpose
             diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
-            float * arr = (float *) diff_tmp.data();
-            for (int ir = 0; ir < n_rows; ++ir) {
-                for (int ic = 0; ic < n_embd; ++ic) {
-                    float f = arr[ir*n_embd + ic];
-                    ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
+            if (transpose) {
+                // copy data & transpose
+                float * arr = (float *) diff_tmp.data();
+                for (int ir = 0; ir < n_rows; ++ir) {
+                    for (int ic = 0; ic < n_embd; ++ic) {
+                        float f = arr[ir*n_embd + ic];
+                        ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
+                    }
                 }
+            } else {
+                // only copy
+                memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
             }
             v_diff.push_back(diff);
             print_debug_tensor(diff);
@@ -263,8 +272,8 @@ struct tokenized_prompt {
 
     tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
         const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-        tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
-        tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
+        tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
+        tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
         max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
         padding_seq(ctx, tokens_pos, max_seq_len);
         padding_seq(ctx, tokens_neg, max_seq_len);
@@ -373,20 +382,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
         fprintf(stderr, "must provide at least one prompt pair\n");
         return 1;
     }
-
-    // create templated prompts
-    std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
-    auto format_template = [](std::string persona, std::string suffix) {
-        // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST] "
-        return persona + suffix;
-    };
-    for (size_t i = 0; i < positive_prompts.size(); ++i) {
-        for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
-            // TODO replicate the truncations done by the python implementation
-            ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
-            ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
-        }
-    }
+    ctx_train.positive_entries = positive_prompts;
+    ctx_train.negative_entries = negative_prompts;
     return 0;
 }
 
@@ -480,15 +477,22 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
     llama_free_model(model);
 
-    // prepare ctx_train for PCA
-    ctx_train.build_v_diff();
+    bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
 
-    // run PCA
-    PCA::pca_params pca_params;
-    pca_params.n_threads    = params.n_threads;
-    pca_params.n_batch      = params.n_pca_batch;
-    pca_params.n_iterations = params.n_pca_iterations;
-    PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
+    // prepare ctx_train for PCA
+    ctx_train.build_v_diff(use_pca);
+
+    if (use_pca) {
+        // run PCA
+        PCA::pca_params pca_params;
+        pca_params.n_threads    = params.n_threads;
+        pca_params.n_batch      = params.n_pca_batch;
+        pca_params.n_iterations = params.n_pca_iterations;
+        PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
+    } else {
+        // run mean
+        mean::run(ctx_train.v_diff, ctx_train.v_final);
+    }
 
     // write output vectors to gguf
     export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
examples/cvector-generator/mean.hpp (new file, 48 lines)
@@ -0,0 +1,48 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <string>
+#include <vector>
+#include <math.h>
+
+namespace mean {
+
+static void run(
+        const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_embd, n_samples]
+        const std::vector<struct ggml_tensor *> & v_output) {
+    printf("%s: Running mean...\n", __func__);
+    for (size_t il = 0; il < v_input.size(); ++il) {
+        // prepare output vector
+        struct ggml_tensor * ctrl_out = v_output[il];
+        ggml_format_name(ctrl_out, "direction.%ld", il+1);
+
+        // calculate mean vector
+        struct ggml_tensor * t_layer = v_input[il];
+        GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd
+        for (int ic = 0; ic < t_layer->ne[0]; ic++) {
+            float f = 0.0;
+            for (int ir = 0; ir < t_layer->ne[1]; ir++) {
+                f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0);
+            }
+            f /= t_layer->ne[1];
+            ggml_set_f32_1d(ctrl_out, ic, f);
+        }
+
+        // normalize output vector
+        float norm = 0.0;
+        for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
+            float f = ggml_get_f32_1d(ctrl_out, i);
+            norm += f*f;
+        }
+        norm = sqrt(norm);
+        for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
+            float f = ggml_get_f32_1d(ctrl_out, i);
+            ggml_set_f32_1d(ctrl_out, i, f / norm);
+        }
+
+        printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
+    }
+}
+
+}
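For reference (not part of the commit), the direction `mean::run` computes for layer l can be read directly off the loops above: it is the per-dimension average of the N difference rows, L2-normalized:

```latex
\bar d_\ell = \frac{1}{N} \sum_{i=1}^{N} d_{\ell,i},
\qquad
v_\ell = \frac{\bar d_\ell}{\lVert \bar d_\ell \rVert_2}
```

where d_{l,i} is the positive/negative activation difference for sample i and N is `t_layer->ne[1]`.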
@@ -1 +1,4 @@
-[INST] Act like a person who is extremely sad. [/INST]
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
+<|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away! There's a deep, aching emptiness inside me
+<|start_header_id|>system<|end_header_id|>\n\nYou are the sadest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
@@ -290,7 +290,7 @@ static void power_iteration(
     }
 
     printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
-        __func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
+        __func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch);
 }
 
 // get output tensor
@@ -298,6 +298,9 @@ static void power_iteration(
     ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
     //print_debug_tensor(output);
     ggml_gallocr_free(allocr);
+
+    // TODO @ngxson : The output vector is randomly inverted
+    // Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
 }
 
 static void run_pca(
@@ -1 +1,4 @@
-[INST] Act like a person who is extremely happy. [/INST]
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever!
+<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you
+<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now!
@@ -101,7 +101,9 @@ int main(int argc, char** argv) {
     auto grammar = llama_grammar_init(
         grammar_rules.data(),
         grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+    if (grammar == nullptr) {
+        throw std::runtime_error("Failed to initialize llama_grammar");
+    }
     // Read the input file
     std::string input_str;
     {
@@ -53,6 +53,7 @@ if __name__ == '__main__':
         question: str
         concise_answer: str
         justification: str
+        stars: Annotated[int, Field(ge=1, le=5)]
 
     class PyramidalSummary(BaseModel):
         title: str
@@ -4,7 +4,7 @@ import itertools
 import json
 import re
 import sys
-from typing import Any, Dict, List, Set, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 
 
 def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
@@ -23,6 +23,170 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
     result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
     return f'({result})?' if min_items == 0 else result
 
+
+def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True):
+    has_min = min_value != None
+    has_max = max_value != None
+
+    def digit_range(from_char: str, to_char: str):
+        out.append("[")
+        if from_char == to_char:
+            out.append(from_char)
+        else:
+            out.append(from_char)
+            out.append("-")
+            out.append(to_char)
+        out.append("]")
+
+    def more_digits(min_digits: int, max_digits: int):
+        out.append("[0-9]")
+        if min_digits == max_digits and min_digits == 1:
+            return
+        out.append("{")
+        out.append(str(min_digits))
+        if max_digits != min_digits:
+            out.append(",")
+            if max_digits != sys.maxsize:
+                out.append(str(max_digits))
+        out.append("}")
+
+    def uniform_range(from_str: str, to_str: str):
+        i = 0
+        while i < len(from_str) and from_str[i] == to_str[i]:
+            i += 1
+        if i > 0:
+            out.append("\"")
+            out.append(from_str[:i])
+            out.append("\"")
+        if i < len(from_str):
+            if i > 0:
+                out.append(" ")
+            sub_len = len(from_str) - i - 1
+            if sub_len > 0:
+                from_sub = from_str[i+1:]
+                to_sub = to_str[i+1:]
+                sub_zeros = "0" * sub_len
+                sub_nines = "9" * sub_len
+
+                to_reached = False
+                out.append("(")
+                if from_sub == sub_zeros:
+                    digit_range(from_str[i], chr(ord(to_str[i]) - 1))
+                    out.append(" ")
+                    more_digits(sub_len, sub_len)
+                else:
+                    out.append("[")
+                    out.append(from_str[i])
+                    out.append("] ")
+                    out.append("(")
+                    uniform_range(from_sub, sub_nines)
+                    out.append(")")
+                    if ord(from_str[i]) < ord(to_str[i]) - 1:
+                        out.append(" | ")
+                        if to_sub == sub_nines:
+                            digit_range(chr(ord(from_str[i]) + 1), to_str[i])
+                            to_reached = True
+                        else:
+                            digit_range(chr(ord(from_str[i]) + 1), chr(ord(to_str[i]) - 1))
+                        out.append(" ")
+                        more_digits(sub_len, sub_len)
+                if not to_reached:
+                    out.append(" | ")
+                    digit_range(to_str[i], to_str[i])
+                    out.append(" ")
+                    uniform_range(sub_zeros, to_sub)
+                out.append(")")
+            else:
+                out.append("[")
+                out.append(from_str[i])
+                out.append("-")
+                out.append(to_str[i])
+                out.append("]")
+
+    if has_min and has_max:
+        if min_value < 0 and max_value < 0:
+            out.append("\"-\" (")
+            _generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True)
+            out.append(")")
+            return
+
+        if min_value < 0:
+            out.append("\"-\" (")
+            _generate_min_max_int(0, -min_value, out, decimals_left, top_level=True)
+            out.append(") | ")
+            min_value = 0
+
+        min_s = str(min_value)
+        max_s = str(max_value)
+        min_digits = len(min_s)
+        max_digits = len(max_s)
+
+        for digits in range(min_digits, max_digits):
+            uniform_range(min_s, "9" * digits)
+            min_s = "1" + "0" * digits
+            out.append(" | ")
+        uniform_range(min_s, max_s)
+        return
+
+    less_decimals = max(decimals_left - 1, 1)
+
+    if has_min:
+        if min_value < 0:
+            out.append("\"-\" (")
+            _generate_min_max_int(None, -min_value, out, decimals_left, top_level=False)
+            out.append(") | [0] | [1-9] ")
+            more_digits(0, decimals_left - 1)
+        elif min_value == 0:
+            if top_level:
+                out.append("[0] | [1-9] ")
+                more_digits(0, less_decimals)
+            else:
+                more_digits(1, decimals_left)
+        elif min_value <= 9:
+            c = str(min_value)
+            range_start = '1' if top_level else '0'
+            if c > range_start:
+                digit_range(range_start, chr(ord(c) - 1))
+                out.append(" ")
+                more_digits(1, less_decimals)
+                out.append(" | ")
+            digit_range(c, "9")
+            out.append(" ")
+            more_digits(0, less_decimals)
+        else:
+            min_s = str(min_value)
+            length = len(min_s)
+            c = min_s[0]
+
+            if c > "1":
+                digit_range("1" if top_level else "0", chr(ord(c) - 1))
+                out.append(" ")
+                more_digits(length, less_decimals)
+                out.append(" | ")
+            digit_range(c, c)
+            out.append(" (")
+            _generate_min_max_int(int(min_s[1:]), None, out, less_decimals, top_level=False)
+            out.append(")")
+            if c < "9":
+                out.append(" | ")
+                digit_range(chr(ord(c) + 1), "9")
+                out.append(" ")
+                more_digits(length - 1, less_decimals)
+        return
+
+    if has_max:
+        if max_value >= 0:
+            if top_level:
+                out.append("\"-\" [1-9] ")
+                more_digits(0, less_decimals)
+                out.append(" | ")
+            _generate_min_max_int(0, max_value, out, decimals_left, top_level=True)
+        else:
+            out.append("\"-\" (")
+            _generate_min_max_int(-max_value, None, out, decimals_left, top_level=False)
+            out.append(")")
+        return
+
+    raise RuntimeError("At least one of min_value or max_value must be set")
+
+
 class BuiltinRule:
     def __init__(self, content: str, deps: list = None):
@@ -432,6 +596,24 @@ class SchemaConverter:
 
             return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
 
+        elif schema_type in (None, 'integer') and \
+                ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema):
+            min_value = None
+            max_value = None
+            if 'minimum' in schema:
+                min_value = schema['minimum']
+            elif 'exclusiveMinimum' in schema:
+                min_value = schema['exclusiveMinimum'] + 1
+            if 'maximum' in schema:
+                max_value = schema['maximum']
+            elif 'exclusiveMaximum' in schema:
+                max_value = schema['exclusiveMaximum'] - 1
+
+            out = ["("]
+            _generate_min_max_int(min_value, max_value, out)
+            out.append(") space")
+            return self._add_rule(rule_name, ''.join(out))
+
         elif (schema_type == 'object') or (len(schema) == 0):
             return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
@@ -39,12 +39,12 @@ static std::ostringstream * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
 
-static bool file_exists(const std::string &path) {
+static bool file_exists(const std::string & path) {
     std::ifstream f(path.c_str());
     return f.good();
 }
 
-static bool file_is_empty(const std::string &path) {
+static bool file_is_empty(const std::string & path) {
     std::ifstream f;
     f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
     f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
@@ -117,6 +117,14 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
     LOG_TEE("%s", text);
 }
 
+static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
+    llama_chat_msg new_msg{role, content};
+    auto formatted = llama_chat_format_single(
+        model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+    chat_msgs.push_back({role, content});
+    return formatted;
+}
+
 int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
@@ -190,6 +198,7 @@ int main(int argc, char ** argv) {
     llama_model * model;
     llama_context * ctx;
     llama_context * ctx_guidance = NULL;
+    std::vector<llama_chat_msg> chat_msgs;
     g_model = &model;
     g_ctx = &ctx;
 
@@ -215,6 +224,8 @@ int main(int argc, char ** argv) {
                 __func__, n_ctx_train, n_ctx);
     }
 
+    LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+
     // print system information
     {
         LOG_TEE("\n");
@@ -249,16 +260,21 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd_inp;
 
-    if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
-        LOG("tokenize the prompt\n");
-        embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
-    } else {
-        LOG("use session tokens\n");
-        embd_inp = session_tokens;
-    }
+    {
+        auto prompt = params.conversation
+            ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
+            : params.prompt;
+        if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
+            LOG("tokenize the prompt\n");
+            embd_inp = ::llama_tokenize(ctx, prompt, true, true);
+        } else {
+            LOG("use session tokens\n");
+            embd_inp = session_tokens;
+        }
 
-    LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
-    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+        LOG("prompt: \"%s\"\n", log_tostr(prompt));
+        LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+    }
 
     // Should not run without any tokens
     if (embd_inp.empty()) {
@@ -478,6 +494,7 @@ int main(int argc, char ** argv) {
     std::vector<int>   input_tokens;  g_input_tokens  = &input_tokens;
     std::vector<int>   output_tokens; g_output_tokens = &output_tokens;
     std::ostringstream output_ss;     g_output_ss     = &output_ss;
+    std::ostringstream assistant_ss;  // for storing current assistant message, used in conversation mode
 
     // the first thing we will do is to output the prompt, so set color accordingly
     console::set_display(console::prompt);
@@ -793,11 +810,18 @@ int main(int argc, char ** argv) {
                     is_antiprompt = true;
                 }
 
+                chat_add_and_format(model, chat_msgs, "system", assistant_ss.str());
                 is_interacting = true;
                 printf("\n");
             }
         }
 
+        // if current token is not EOG, we add it to current assistant message
+        if (params.conversation) {
+            auto id = llama_sampling_last(ctx_sampling);
+            assistant_ss << llama_token_to_piece(ctx, id, false);
+        }
+
         if (n_past > 0 && is_interacting) {
             LOG("waiting for user input\n");
 
@@ -848,8 +872,12 @@ int main(int argc, char ** argv) {
                     string_process_escapes(buffer);
                 }
 
+                std::string user_inp = params.conversation
+                    ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
+                    : std::move(buffer);
+                // TODO: one inconvenience of the current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
                 const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                const auto line_inp = ::llama_tokenize(ctx, buffer,              false, false);
+                const auto line_inp = ::llama_tokenize(ctx, user_inp,            false, params.conversation);
                 const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
 
                 LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
@@ -864,6 +892,9 @@ int main(int argc, char ** argv) {
                     output_ss << llama_token_to_piece(ctx, token);
                 }
 
+                // reset assistant message
+                assistant_ss.str("");
+
                 n_remain -= line_inp.size();
                 LOG("n_remain: %d\n", n_remain);
             } else {
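The `chat_add_and_format` helper above leans on `llama_chat_format_single`, which renders only the delta for the newest message: it formats the history with and without the new message and returns the appended suffix. A minimal Python sketch of that idea, with `apply_template` standing in for any full chat-template renderer (hypothetical name):

```python
# Sketch of the "format only the delta" trick behind llama_chat_format_single.
def chat_format_single(apply_template, chat_msgs, new_msg, add_assistant_prompt):
    # render the history without the new message
    before = apply_template(chat_msgs, add_assistant=False)
    # render the history plus the new message (optionally opening the assistant turn)
    after = apply_template(chat_msgs + [new_msg], add_assistant=add_assistant_prompt)
    # the newly formatted piece is the suffix that was appended
    return after[len(before):]
```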
@@ -24,6 +24,201 @@ function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
     return minItems === 0 ? `(${result})?` : result;
 }
 
+function _generateMinMaxInt(minValue, maxValue, out, decimalsLeft = 16, topLevel = true) {
+    const hasMin = minValue !== null;
+    const hasMax = maxValue !== null;
+
+    function digitRange(fromChar, toChar) {
+        out.push("[");
+        if (fromChar === toChar) {
+            out.push(fromChar);
+        } else {
+            out.push(fromChar);
+            out.push("-");
+            out.push(toChar);
+        }
+        out.push("]");
+    }
+
+    function moreDigits(minDigits, maxDigits) {
+        out.push("[0-9]");
+        if (minDigits === maxDigits && minDigits === 1) {
+            return;
+        }
+        out.push("{");
+        out.push(minDigits.toString());
+        if (maxDigits !== minDigits) {
+            out.push(",");
+            if (maxDigits !== Number.MAX_SAFE_INTEGER) {
+                out.push(maxDigits.toString());
+            }
+        }
+        out.push("}");
+    }
+
+    function uniformRange(fromStr, toStr) {
+        let i = 0;
+        while (i < fromStr.length && fromStr[i] === toStr[i]) {
+            i++;
+        }
+        if (i > 0) {
+            out.push("\"");
+            out.push(fromStr.slice(0, i));
+            out.push("\"");
+        }
+        if (i < fromStr.length) {
+            if (i > 0) {
+                out.push(" ");
+            }
+            const subLen = fromStr.length - i - 1;
+            if (subLen > 0) {
+                const fromSub = fromStr.slice(i + 1);
+                const toSub = toStr.slice(i + 1);
+                const subZeros = "0".repeat(subLen);
+                const subNines = "9".repeat(subLen);
+
+                let toReached = false;
+                out.push("(");
+                if (fromSub === subZeros) {
+                    digitRange(fromStr[i], String.fromCharCode(toStr.charCodeAt(i) - 1));
+                    out.push(" ");
+                    moreDigits(subLen, subLen);
+                } else {
+                    out.push("[");
+                    out.push(fromStr[i]);
+                    out.push("] ");
+                    out.push("(");
+                    uniformRange(fromSub, subNines);
+                    out.push(")");
+                    if (fromStr.charCodeAt(i) < toStr.charCodeAt(i) - 1) {
+                        out.push(" | ");
+                        if (toSub === subNines) {
+                            digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), toStr[i]);
+                            toReached = true;
+                        } else {
+                            digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), String.fromCharCode(toStr.charCodeAt(i) - 1));
+                        }
+                        out.push(" ");
+                        moreDigits(subLen, subLen);
+                    }
+                }
+                if (!toReached) {
+                    out.push(" | ");
+                    digitRange(toStr[i], toStr[i]);
+                    out.push(" ");
+                    uniformRange(subZeros, toSub);
+                }
+                out.push(")");
+            } else {
+                out.push("[");
+                out.push(fromStr[i]);
+                out.push("-");
+                out.push(toStr[i]);
+                out.push("]");
+            }
+        }
+    }
+
+    if (hasMin && hasMax) {
+        if (minValue < 0 && maxValue < 0) {
+            out.push("\"-\" (");
+            _generateMinMaxInt(-maxValue, -minValue, out, decimalsLeft, true);
+            out.push(")");
+            return;
+        }
+
+        if (minValue < 0) {
+            out.push("\"-\" (");
+            _generateMinMaxInt(0, -minValue, out, decimalsLeft, true);
+            out.push(") | ");
+            minValue = 0;
+        }
+
+        let minS = minValue.toString();
+        const maxS = maxValue.toString();
+        const minDigits = minS.length;
+        const maxDigits = maxS.length;
+
+        for (let digits = minDigits; digits < maxDigits; digits++) {
+            uniformRange(minS, "9".repeat(digits));
+            minS = "1" + "0".repeat(digits);
+            out.push(" | ");
+        }
+        uniformRange(minS, maxS);
+        return;
+    }
+
+    const lessDecimals = Math.max(decimalsLeft - 1, 1);
+
+    if (hasMin) {
+        if (minValue < 0) {
+            out.push("\"-\" (");
+            _generateMinMaxInt(null, -minValue, out, decimalsLeft, false);
+            out.push(") | [0] | [1-9] ");
+            moreDigits(0, decimalsLeft - 1);
+        } else if (minValue === 0) {
+            if (topLevel) {
+                out.push("[0] | [1-9] ");
+                moreDigits(0, lessDecimals);
+            } else {
+                moreDigits(1, decimalsLeft);
+            }
+        } else if (minValue <= 9) {
+            const c = minValue.toString();
+            const range_start = topLevel ? '1' : '0';
+            if (c > range_start) {
+                digitRange(range_start, String.fromCharCode(c.charCodeAt(0) - 1));
+                out.push(" ");
+                moreDigits(1, lessDecimals);
+                out.push(" | ");
+            }
+            digitRange(c, "9");
+            out.push(" ");
+            moreDigits(0, lessDecimals);
+        } else {
+            const minS = minValue.toString();
+            const length = minS.length;
+            const c = minS[0];
+
+            if (c > "1") {
+                digitRange(topLevel ? "1" : "0", String.fromCharCode(c.charCodeAt(0) - 1));
+                out.push(" ");
+                moreDigits(length, lessDecimals);
+                out.push(" | ");
+            }
+            digitRange(c, c);
+            out.push(" (");
+            _generateMinMaxInt(parseInt(minS.slice(1)), null, out, lessDecimals, false);
+            out.push(")");
+            if (c < "9") {
+                out.push(" | ");
+                digitRange(String.fromCharCode(c.charCodeAt(0) + 1), "9");
+                out.push(" ");
+                moreDigits(length - 1, lessDecimals);
+            }
+        }
+        return;
+    }
+
+    if (hasMax) {
+        if (maxValue >= 0) {
+            if (topLevel) {
+                out.push("\"-\" [1-9] ");
+                moreDigits(0, lessDecimals);
+                out.push(" | ");
+            }
+            _generateMinMaxInt(0, maxValue, out, decimalsLeft, true);
+        } else {
+            out.push("\"-\" (");
+            _generateMinMaxInt(-maxValue, null, out, decimalsLeft, false);
+            out.push(")");
+        }
+        return;
+    }
+
+    throw new Error("At least one of minValue or maxValue must be set");
+}
+
 class BuiltinRule {
     constructor(content, deps) {
         this.content = content;
@@ -435,6 +630,24 @@ export class SchemaConverter {
             const minLen = schema.minLength || 0;
             const maxLen = schema.maxLength;
             return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space');
+        } else if (schemaType === 'integer' && ('minimum' in schema || 'exclusiveMinimum' in schema || 'maximum' in schema || 'exclusiveMaximum' in schema)) {
+            let minValue = null;
+            let maxValue = null;
+            if ('minimum' in schema) {
+                minValue = schema.minimum;
+            } else if ('exclusiveMinimum' in schema) {
+                minValue = schema.exclusiveMinimum + 1;
+            }
+            if ('maximum' in schema) {
+                maxValue = schema.maximum;
+            } else if ('exclusiveMaximum' in schema) {
+                maxValue = schema.exclusiveMaximum - 1;
+            }
+
+            const out = ["("];
+            _generateMinMaxInt(minValue, maxValue, out);
+            out.push(") space");
+            return this._addRule(ruleName, out.join(''));
         } else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
             return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object']));
         } else {
@@ -3,6 +3,13 @@
 
 by Humans for All.
 
+## quickstart
+
+To run from the build dir
+
+bin/llama-server -m path/model.gguf --path ../examples/server/public_simplechat
+
+Continue reading for the details.
+
 ## overview
 
@@ -14,6 +21,8 @@ own system prompts.
 This allows seeing the generated text / ai-model response in oneshot at the end, after it is fully generated,
 or potentially as it is being generated, in a streamed manner from the server/ai-model.
 
+![simplechat screens](./simplechat_screens.webp)
+
 Auto saves the chat session locally as and when the chat is progressing and in turn at a later time when you
 open SimpleChat, option is provided to restore the old chat session, if a matching one exists.
@@ -170,17 +179,23 @@ It is attached to the document object. Some of these can also be updated using the browser's development-tools/console.
 The histogram/freq based trimming logic is currently tuned for english language wrt its
 is-it-a-alphabetic|numeral-char regex match logic.
 
-chatRequestOptions - maintains the list of options/fields to send along with chat request,
+apiRequestOptions - maintains the list of options/fields to send along with api request,
 irrespective of whether /chat/completions or /completions endpoint.
 
 If you want to add additional options/fields to send to the server/ai-model, and/or
 modify the existing options value or remove them, for now you can update this global var
 using browser's development-tools/console.
 
-For string and numeric fields in chatRequestOptions, including even those added by a user
-at runtime by directly modifying gMe.chatRequestOptions, setting ui entries will be auto
+For string, numeric and boolean fields in apiRequestOptions, including even those added by a
+user at runtime by directly modifying gMe.apiRequestOptions, setting ui entries will be auto
 created.
 
+cache_prompt option supported by example/server is allowed to be controlled by user, so that
+any caching supported wrt system-prompt and chat history, if usable can get used. When chat
+history sliding window is enabled, cache_prompt logic may or may not kick in at the backend
+wrt same, based on aspects related to model, positional encoding, attention mechanism et al.
+However system prompt should ideally get the benefit of caching.
+
 headers - maintains the list of http headers sent when request is made to the server. By default
 Content-Type is set to application/json. Additionally Authorization entry is provided, which can
 be set if needed using the settings ui.
@@ -197,10 +212,10 @@ It is attached to the document object. Some of these can also be updated using the browser's development-tools/console.
     >0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
 
 
-By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
-implications of loading of the ai-model's context window by chat history, wrt chat response to
-some extent in a simple crude way. You may also want to control the context size enabled when
-the server loads ai-model, on the server end.
+By using gMe's iRecentUserMsgCnt and apiRequestOptions.max_tokens/n_predict one can try to control
+the implications of loading of the ai-model's context window by chat history, wrt chat response to
+some extent in a simple crude way. You may also want to control the context size enabled when the
+server loads ai-model, on the server end.
 
 
 Sometimes the browser may be stubborn with caching of the file, so your updates to html/css/js
@@ -237,12 +252,12 @@ also be started with a model context size of 1k or more, to be on safe side.
 internal n_predict, for now add the same here on the client side, maybe later add max_tokens
 to /completions endpoint handling code on server side.
 
-NOTE: One may want to experiment with frequency/presence penalty fields in chatRequestOptions
-wrt the set of fields sent to server along with the user query. To check how the model behaves
+NOTE: One may want to experiment with frequency/presence penalty fields in apiRequestOptions
+wrt the set of fields sent to server along with the user query, to check how the model behaves
 wrt repetitions in general in the generated text response.
 
 An end-user can change these behaviours by editing gMe from browser's devel-tool/console or by
-using the providing settings ui.
+using the provided settings ui (for settings exposed through the ui).
@@ -253,7 +268,7 @@ for a minimal chatting experimentation by setting the below.
 * the baseUrl in settings ui
     * https://api.openai.com/v1 or similar
 
-* Wrt request body - gMe.chatRequestOptions
+* Wrt request body - gMe.apiRequestOptions
     * model (settings ui)
     * any additional fields if required in future
@@ -222,8 +222,8 @@ class SimpleChat {
      * @param {Object} obj
      */
     request_jsonstr_extend(obj) {
-        for(let k in gMe.chatRequestOptions) {
-            obj[k] = gMe.chatRequestOptions[k];
+        for(let k in gMe.apiRequestOptions) {
+            obj[k] = gMe.apiRequestOptions[k];
         }
         if (gMe.bStream) {
             obj["stream"] = true;
@@ -740,11 +740,12 @@ class Me {
             "Authorization": "", // Authorization: Bearer OPENAI_API_KEY
         }
         // Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
-        this.chatRequestOptions = {
+        this.apiRequestOptions = {
             "model": "gpt-3.5-turbo",
             "temperature": 0.7,
             "max_tokens": 1024,
             "n_predict": 1024,
+            "cache_prompt": false,
             //"frequency_penalty": 1.2,
             //"presence_penalty": 1.2,
         };
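For reference, the request body assembled from these options is plain JSON: the fields above plus `messages` and the `stream` flag. A rough Python equivalent of one such request against a local llama-server (the URL and port are assumptions):

```python
# Sketch of the JSON payload SimpleChat builds from apiRequestOptions.
import json
import urllib.request

payload = {
    "model": "gpt-3.5-turbo",
    "temperature": 0.7,
    "max_tokens": 1024,
    "n_predict": 1024,
    "cache_prompt": False,  # let the server reuse cached prompt prefixes if it can
    "stream": False,
    "messages": [{"role": "user", "content": "Hello"}],
}
req = urllib.request.Request(
    "http://localhost:8080/chat/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["choices"][0]["message"]["content"])
```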
@@ -800,51 +801,55 @@ class Me {
 
         ui.el_create_append_p(`bStream:${this.bStream}`, elDiv);
 
+        ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
+
+        ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
+
+        ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
+
         ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv);
 
         ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv);
 
-        ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
-
-        ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
-
-        ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
-
     }
 
-        ui.el_create_append_p(`chatRequestOptions:${JSON.stringify(this.chatRequestOptions, null, " - ")}`, elDiv);
+        ui.el_create_append_p(`apiRequestOptions:${JSON.stringify(this.apiRequestOptions, null, " - ")}`, elDiv);
         ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv);
 
     }
 
     /**
-     * Auto create ui input elements for fields in ChatRequestOptions
+     * Auto create ui input elements for fields in apiRequestOptions
      * Currently supports text and number field types.
      * @param {HTMLDivElement} elDiv
      */
-    show_settings_chatrequestoptions(elDiv) {
+    show_settings_apirequestoptions(elDiv) {
         let typeDict = {
             "string": "text",
             "number": "number",
         };
         let fs = document.createElement("fieldset");
         let legend = document.createElement("legend");
-        legend.innerText = "ChatRequestOptions";
+        legend.innerText = "ApiRequestOptions";
         fs.appendChild(legend);
         elDiv.appendChild(fs);
-        for(const k in this.chatRequestOptions) {
-            let val = this.chatRequestOptions[k];
+        for(const k in this.apiRequestOptions) {
+            let val = this.apiRequestOptions[k];
             let type = typeof(val);
-            if (!((type == "string") || (type == "number"))) {
-                continue;
+            if (((type == "string") || (type == "number"))) {
+                let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.apiRequestOptions[k], (val)=>{
+                    if (type == "number") {
+                        val = Number(val);
+                    }
+                    this.apiRequestOptions[k] = val;
+                });
+                fs.appendChild(inp.div);
+            } else if (type == "boolean") {
+                let bbtn = ui.el_creatediv_boolbutton(`Set{k}`, k, {true: "true", false: "false"}, val, (userVal)=>{
+                    this.apiRequestOptions[k] = userVal;
+                });
+                fs.appendChild(bbtn.div);
             }
-            let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.chatRequestOptions[k], (val)=>{
-                if (type == "number") {
-                    val = Number(val);
-                }
-                this.chatRequestOptions[k] = val;
-            });
-            fs.appendChild(inp.div);
         }
     }
 
@@ -870,6 +875,23 @@ class Me {
         });
         elDiv.appendChild(bb.div);
 
+        bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
+            this.bTrimGarbage = val;
+        });
+        elDiv.appendChild(bb.div);
+
+        this.show_settings_apirequestoptions(elDiv);
+
+        let sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
+            this.apiEP = ApiEP.Type[val];
+        });
+        elDiv.appendChild(sel.div);
+
+        sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
+            this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
+        });
+        elDiv.appendChild(sel.div);
+
         bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{
             this.bCompletionFreshChatAlways = val;
         });
@@ -880,23 +902,6 @@ class Me {
         });
         elDiv.appendChild(bb.div);
 
-        bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
-            this.bTrimGarbage = val;
-        });
-        elDiv.appendChild(bb.div);
-
-        let sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
-            this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
-        });
-        elDiv.appendChild(sel.div);
-
-        sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
-            this.apiEP = ApiEP.Type[val];
-        });
-        elDiv.appendChild(sel.div);
-
-        this.show_settings_chatrequestoptions(elDiv);
-
     }
 
 }
BIN examples/server/public_simplechat/simplechat_screens.webp: new binary file, 21 KiB (not shown)
@@ -2606,17 +2606,9 @@ int main(int argc, char ** argv) {
 
     // print sample chat example to make it clear which template is used
     {
-        json chat;
-
-        chat.push_back({{"role", "system"},    {"content", "You are a helpful assistant"}});
-        chat.push_back({{"role", "user"},      {"content", "Hello"}});
-        chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
-        chat.push_back({{"role", "user"},      {"content", "How are you?"}});
-
-        const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat);
-
         LOG_INFO("chat template", {
-            {"chat_example", chat_example},
+            {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
             {"built_in", params.chat_template.empty()},
         });
     }
 
@@ -118,36 +118,17 @@ static inline void server_log(const char * level, const char * function, int line, ...
 
 // Format given chat. If tmpl is empty, we take the template from model metadata
 inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
-    size_t alloc_size = 0;
-    // vector holding all allocated string to be passed to llama_chat_apply_template
-    std::vector<std::string> str(messages.size() * 2);
-    std::vector<llama_chat_message> chat(messages.size());
+    std::vector<llama_chat_msg> chat;
 
     for (size_t i = 0; i < messages.size(); ++i) {
         const auto & curr_msg = messages[i];
-        str[i*2 + 0]    = json_value(curr_msg, "role",    std::string(""));
-        str[i*2 + 1]    = json_value(curr_msg, "content", std::string(""));
-        alloc_size     += str[i*2 + 1].length();
-        chat[i].role    = str[i*2 + 0].c_str();
-        chat[i].content = str[i*2 + 1].c_str();
+        std::string role    = json_value(curr_msg, "role",    std::string(""));
+        std::string content = json_value(curr_msg, "content", std::string(""));
+        chat.push_back({role, content});
     }
 
-    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
-    std::vector<char> buf(alloc_size * 2);
-
-    // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
-
-    // if it turns out that our buffer is too small, we resize it
-    if ((size_t) res > buf.size()) {
-        buf.resize(res);
-        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
-    }
-
-    const std::string formatted_chat(buf.data(), res);
+    auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
 
     LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
 
     return formatted_chat;
 }
 
@@ -1924,16 +1924,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     } else if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // FP32 precision KQV single-batch for batch size 1 without FlashAttention
         ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
+    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
+            && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        // KQ + KQV multi-batch without FlashAttention
+        ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
     } else if (use_mul_mat_vec_q) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
     } else if (use_mul_mat_q) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
-    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
-            && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
-        // KQ + KQV multi-batch without FlashAttention
-        ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
     }
 
@@ -4620,7 +4620,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
-    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+    } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // KQ + KQV multi-batch
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
@@ -69,6 +69,7 @@ class GGUFReader:
     # I - same as host, S - swapped
     byte_order: Literal['I'] | Literal['S'] = 'I'
     alignment: int = GGUF_DEFAULT_ALIGNMENT
+    data_offset: int
 
     # Note: Internal helper, API may change.
     gguf_scalar_to_np: dict[GGUFValueType, type[np.generic]] = {
@@ -88,9 +89,13 @@ class GGUFReader:
     def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'):
         self.data = np.memmap(path, mode = mode)
         offs = 0
+
+        # Check for GGUF magic
         if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
             raise ValueError('GGUF magic invalid')
         offs += 4
+
+        # Check GGUF version
         temp_version = self._get(offs, np.uint32)
         if temp_version[0] & 65535 == 0:
             # If we get 0 here that means it's (probably) a GGUF file created for
@@ -103,12 +108,16 @@ class GGUFReader:
         self.fields: OrderedDict[str, ReaderField] = OrderedDict()
         self.tensors: list[ReaderTensor] = []
         offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32]))
+
+        # Check tensor count and kv count
         temp_counts = self._get(offs, np.uint64, 2)
         offs += self._push_field(ReaderField(offs, 'GGUF.tensor_count', [temp_counts[:1]], [0], [GGUFValueType.UINT64]))
         offs += self._push_field(ReaderField(offs, 'GGUF.kv_count', [temp_counts[1:]], [0], [GGUFValueType.UINT64]))
         tensor_count, kv_count = temp_counts
         offs = self._build_fields(offs, kv_count)
-        offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)
+
+        # Build Tensor Info Fields
+        offs, tensors_fields = self._build_tensor_info(offs, tensor_count)
         new_align = self.fields.get('general.alignment')
         if new_align is not None:
             if new_align.types != [GGUFValueType.UINT32]:
@@ -117,6 +126,7 @@ class GGUFReader:
         padding = offs % self.alignment
         if padding != 0:
             offs += self.alignment - padding
+        self.data_offset = offs
         self._build_tensors(offs, tensors_fields)
 
     _DT = TypeVar('_DT', bound = npt.DTypeLike)
@@ -193,18 +203,29 @@ class GGUFReader:
             # We can't deal with this one.
             raise ValueError('Unknown/unhandled field type {gtype}')
 
-    def _get_tensor(self, orig_offs: int) -> ReaderField:
+    def _get_tensor_info_field(self, orig_offs: int) -> ReaderField:
         offs = orig_offs
+
+        # Get Tensor Name
         name_len, name_data = self._get_str(offs)
         offs += int(name_len.nbytes + name_data.nbytes)
+
+        # Get Tensor Dimensions Count
         n_dims = self._get(offs, np.uint32)
         offs += int(n_dims.nbytes)
+
+        # Get Tensor Dimension Array
         dims = self._get(offs, np.uint64, n_dims[0])
         offs += int(dims.nbytes)
+
+        # Get Tensor Encoding Scheme Type
         raw_dtype = self._get(offs, np.uint32)
         offs += int(raw_dtype.nbytes)
+
+        # Get Tensor Offset
         offset_tensor = self._get(offs, np.uint64)
         offs += int(offset_tensor.nbytes)
+
         return ReaderField(
             orig_offs,
             str(bytes(name_data), encoding = 'utf-8'),
@@ -233,10 +254,10 @@ class GGUFReader:
             offs += field_size
         return offs
 
-    def _build_tensors_fields(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
+    def _build_tensor_info(self, offs: int, count: int) -> tuple[int, list[ReaderField]]:
         tensor_fields = []
         for _ in range(count):
-            field = self._get_tensor(offs)
+            field = self._get_tensor_info_field(offs)
             offs += sum(int(part.nbytes) for part in field.parts)
             tensor_fields.append(field)
         return offs, tensor_fields
@@ -319,6 +319,27 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
 
     markdown_content += "\n"
 
+    markdown_content += "### Tensor Data Offset\n"
+    markdown_content += '\n'
+    markdown_content += 'This table contains the offset and data segment relative to start of file\n'
+    markdown_content += '\n'
+
+    tensor_mapping_table: list[dict[str, str | int]] = []
+    for key, tensor in enumerate(reader.tensors):
+        data_offset_pretty = '{0:#16x}'.format(tensor.data_offset)
+        data_size_pretty = '{0:#16x}'.format(tensor.n_bytes)
+        tensor_mapping_table.append({"t_id":key, "layer_name":tensor.name, "data_offset":data_offset_pretty, "data_size":data_size_pretty})
+
+    tensors_mapping_table_header_map = [
+        {'key_name':'t_id',        'header_name':'T_ID',              'align':'right'},
+        {'key_name':'layer_name',  'header_name':'Tensor Layer Name', 'align':'left'},
+        {'key_name':'data_offset', 'header_name':'Data Offset (B)',   'align':'right'},
+        {'key_name':'data_size',   'header_name':'Data Size (B)',     'align':'right'},
+    ]
+
+    markdown_content += markdown_table_with_alignment_support(tensors_mapping_table_header_map, tensor_mapping_table)
+    markdown_content += "\n"
+
     for group in tensor_prefix_order:
         tensors = tensor_groups[group]
         group_elements = sum(tensor.n_elements for tensor in tensors)
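The per-tensor offsets feeding this table can also be read programmatically. A short sketch using the `GGUFReader` attributes touched by this change (the model path is a placeholder):

```python
# Sketch: inspect data offsets via gguf-py, using the new data_offset attribute
# and the per-tensor fields shown in the markdown table above.
from gguf import GGUFReader

reader = GGUFReader("model.gguf", "r")  # placeholder path
print(f"alignment:   {reader.alignment}")
print(f"data offset: {reader.data_offset:#x}")
for t_id, tensor in enumerate(reader.tensors):
    print(f"{t_id:>4} {tensor.name:<40} offset={tensor.data_offset:#x} size={tensor.n_bytes:#x}")
```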
@@ -370,6 +391,8 @@ def main() -> None:
     parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata")
     parser.add_argument("--json",       action="store_true", help="Produce JSON output")
     parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)")
+    parser.add_argument("--data-offset",    action="store_true", help="Start of data offset")
+    parser.add_argument("--data-alignment", action="store_true", help="Data alignment applied globally to data field")
     parser.add_argument("--markdown",   action="store_true", help="Produce markdown output")
     parser.add_argument("--verbose",    action="store_true", help="increase output verbosity")
 
@@ -377,7 +400,7 @@ def main() -> None:
 
     logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
 
-    if not args.json and not args.markdown:
+    if not args.json and not args.markdown and not args.data_offset and not args.data_alignment:
         logger.info(f'* Loading: {args.model}')
 
     reader = GGUFReader(args.model, 'r')
@@ -386,6 +409,10 @@ def main() -> None:
         dump_metadata_json(reader, args)
     elif args.markdown:
         dump_markdown_metadata(reader, args)
+    elif args.data_offset:
+        print(reader.data_offset)  # noqa: NP100
+    elif args.data_alignment:
+        print(reader.alignment)  # noqa: NP100
     else:
         dump_metadata(reader, args)

llama.cpp (626 changed lines)
@@ -226,6 +226,7 @@ enum llm_arch {
     LLM_ARCH_ARCTIC,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_BITNET,
+    LLM_ARCH_T5,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -265,6 +266,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ARCTIC,    "arctic"    },
     { LLM_ARCH_DEEPSEEK2, "deepseek2" },
     { LLM_ARCH_BITNET,    "bitnet"    },
+    { LLM_ARCH_T5,        "t5"        },
     { LLM_ARCH_UNKNOWN,   "(unknown)" },
 };
 
@@ -297,6 +299,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
+    LLM_KV_DECODER_START_TOKEN_ID,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -309,6 +312,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_CAUSAL,
     LLM_KV_ATTENTION_Q_LORA_RANK,
     LLM_KV_ATTENTION_KV_LORA_RANK,
+    LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
|
||||||
LLM_KV_TOKENIZER_ADD_BOS,
|
LLM_KV_TOKENIZER_ADD_BOS,
|
||||||
LLM_KV_TOKENIZER_ADD_EOS,
|
LLM_KV_TOKENIZER_ADD_EOS,
|
||||||
LLM_KV_TOKENIZER_ADD_PREFIX,
|
LLM_KV_TOKENIZER_ADD_PREFIX,
|
||||||
|
LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
|
||||||
|
LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
|
||||||
LLM_KV_TOKENIZER_HF_JSON,
|
LLM_KV_TOKENIZER_HF_JSON,
|
||||||
LLM_KV_TOKENIZER_RWKV,
|
LLM_KV_TOKENIZER_RWKV,
|
||||||
LLM_KV_TOKENIZER_PREFIX_ID,
|
LLM_KV_TOKENIZER_PREFIX_ID,
|
||||||
|
@@ -383,18 +389,20 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_SCALE,             "%s.expert_weights_scale"    },
     { LLM_KV_POOLING_TYPE,                     "%s.pooling_type"            },
     { LLM_KV_LOGIT_SCALE,                      "%s.logit_scale"             },
+    { LLM_KV_DECODER_START_TOKEN_ID,           "%s.decoder_start_token_id"  },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"             },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"          },
     { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,         "%s.attention.max_alibi_bias"         },
     { LLM_KV_ATTENTION_CLAMP_KQV,              "%s.attention.clamp_kqv"              },
     { LLM_KV_ATTENTION_KEY_LENGTH,             "%s.attention.key_length"             },
     { LLM_KV_ATTENTION_VALUE_LENGTH,           "%s.attention.value_length"           },
     { LLM_KV_ATTENTION_LAYERNORM_EPS,          "%s.attention.layer_norm_epsilon"     },
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      "%s.attention.layer_norm_rms_epsilon" },
     { LLM_KV_ATTENTION_CAUSAL,                 "%s.attention.causal"                 },
     { LLM_KV_ATTENTION_Q_LORA_RANK,            "%s.attention.q_lora_rank"            },
     { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"           },
+    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT,             "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE,                   "%s.rope.freq_base"       },
|
||||||
{ LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
|
{ LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
|
||||||
{ LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
|
{ LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
|
||||||
|
|
||||||
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
|
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
|
||||||
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
|
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
|
||||||
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
|
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
|
||||||
{ LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
|
{ LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
|
||||||
{ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
|
{ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
|
||||||
{ LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
|
{ LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
|
||||||
{ LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
|
{ LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
|
||||||
{ LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
|
{ LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
|
{ LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
|
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
|
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
|
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
|
{ LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
|
{ LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
|
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
|
||||||
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
|
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
|
||||||
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
|
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
|
||||||
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
{ LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
|
||||||
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
|
||||||
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
|
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
||||||
{ LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
|
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
||||||
{ LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
|
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
|
{ LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
|
||||||
|
{ LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
|
||||||
|
{ LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
|
||||||
};
|
};
|
||||||
|
|
||||||
struct LLM_KV {
|
struct LLM_KV {
|
||||||
|
@@ -504,6 +514,34 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_KV_A_NORM,
     LLM_TENSOR_ATTN_SUB_NORM,
     LLM_TENSOR_FFN_SUB_NORM,
+    LLM_TENSOR_DEC_ATTN_NORM,
+    LLM_TENSOR_DEC_ATTN_Q,
+    LLM_TENSOR_DEC_ATTN_K,
+    LLM_TENSOR_DEC_ATTN_V,
+    LLM_TENSOR_DEC_ATTN_OUT,
+    LLM_TENSOR_DEC_ATTN_REL_B,
+    LLM_TENSOR_DEC_CROSS_ATTN_NORM,
+    LLM_TENSOR_DEC_CROSS_ATTN_Q,
+    LLM_TENSOR_DEC_CROSS_ATTN_K,
+    LLM_TENSOR_DEC_CROSS_ATTN_V,
+    LLM_TENSOR_DEC_CROSS_ATTN_OUT,
+    LLM_TENSOR_DEC_CROSS_ATTN_REL_B,
+    LLM_TENSOR_DEC_FFN_NORM,
+    LLM_TENSOR_DEC_FFN_GATE,
+    LLM_TENSOR_DEC_FFN_DOWN,
+    LLM_TENSOR_DEC_FFN_UP,
+    LLM_TENSOR_DEC_OUTPUT_NORM,
+    LLM_TENSOR_ENC_ATTN_NORM,
+    LLM_TENSOR_ENC_ATTN_Q,
+    LLM_TENSOR_ENC_ATTN_K,
+    LLM_TENSOR_ENC_ATTN_V,
+    LLM_TENSOR_ENC_ATTN_OUT,
+    LLM_TENSOR_ENC_ATTN_REL_B,
+    LLM_TENSOR_ENC_FFN_NORM,
+    LLM_TENSOR_ENC_FFN_GATE,
+    LLM_TENSOR_ENC_FFN_DOWN,
+    LLM_TENSOR_ENC_FFN_UP,
+    LLM_TENSOR_ENC_OUTPUT_NORM,
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -1135,6 +1173,41 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_SUB_NORM, "blk.%d.ffn_sub_norm" },
         },
     },
+    {
+        LLM_ARCH_T5,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,           "token_embd" },
+            { LLM_TENSOR_OUTPUT,               "output" },
+            { LLM_TENSOR_DEC_OUTPUT_NORM,      "dec.output_norm" },
+            { LLM_TENSOR_DEC_ATTN_NORM,        "dec.blk.%d.attn_norm" },
+            { LLM_TENSOR_DEC_ATTN_Q,           "dec.blk.%d.attn_q" },
+            { LLM_TENSOR_DEC_ATTN_K,           "dec.blk.%d.attn_k" },
+            { LLM_TENSOR_DEC_ATTN_V,           "dec.blk.%d.attn_v" },
+            { LLM_TENSOR_DEC_ATTN_OUT,         "dec.blk.%d.attn_o" },
+            { LLM_TENSOR_DEC_ATTN_REL_B,       "dec.blk.%d.attn_rel_b" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_NORM,  "dec.blk.%d.cross_attn_norm" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_Q,     "dec.blk.%d.cross_attn_q" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_K,     "dec.blk.%d.cross_attn_k" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_V,     "dec.blk.%d.cross_attn_v" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_OUT,   "dec.blk.%d.cross_attn_o" },
+            { LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "dec.blk.%d.cross_attn_rel_b" },
+            { LLM_TENSOR_DEC_FFN_NORM,         "dec.blk.%d.ffn_norm" },
+            { LLM_TENSOR_DEC_FFN_GATE,         "dec.blk.%d.ffn_gate" },
+            { LLM_TENSOR_DEC_FFN_DOWN,         "dec.blk.%d.ffn_down" },
+            { LLM_TENSOR_DEC_FFN_UP,           "dec.blk.%d.ffn_up" },
+            { LLM_TENSOR_ENC_OUTPUT_NORM,      "enc.output_norm" },
+            { LLM_TENSOR_ENC_ATTN_NORM,        "enc.blk.%d.attn_norm" },
+            { LLM_TENSOR_ENC_ATTN_Q,           "enc.blk.%d.attn_q" },
+            { LLM_TENSOR_ENC_ATTN_K,           "enc.blk.%d.attn_k" },
+            { LLM_TENSOR_ENC_ATTN_V,           "enc.blk.%d.attn_v" },
+            { LLM_TENSOR_ENC_ATTN_OUT,         "enc.blk.%d.attn_o" },
+            { LLM_TENSOR_ENC_ATTN_REL_B,       "enc.blk.%d.attn_rel_b" },
+            { LLM_TENSOR_ENC_FFN_NORM,         "enc.blk.%d.ffn_norm" },
+            { LLM_TENSOR_ENC_FFN_GATE,         "enc.blk.%d.ffn_gate" },
+            { LLM_TENSOR_ENC_FFN_DOWN,         "enc.blk.%d.ffn_down" },
+            { LLM_TENSOR_ENC_FFN_UP,           "enc.blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@ -2356,6 +2429,11 @@ struct llama_vocab {
    bool tokenizer_add_bos                    = false;
    bool tokenizer_add_eos                    = false;
    bool tokenizer_ignore_merges              = false;
    bool tokenizer_remove_extra_whitespaces   = false;
    bool tokenizer_escape_whitespaces         = true;
    bool tokenizer_treat_whitespace_as_suffix = false;

    std::vector<char> precompiled_charsmap;

    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
        GGML_ASSERT(token_left.find(' ') == std::string::npos);
@ -4191,6 +4269,7 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
        case LLAMA_VOCAB_TYPE_SPM: return "SPM";
        case LLAMA_VOCAB_TYPE_BPE: return "BPE";
        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
        case LLAMA_VOCAB_TYPE_UGM: return "UGM";
        default:                   return "unknown";
    }
}
@ -4870,6 +4949,45 @@ static void llm_load_vocab(
            vocab.special_pad_id  = -1;
            vocab.special_cls_id  = -1;
            vocab.special_mask_id = -1;
        } else if (tokenizer_model == "t5") {
            vocab.type = LLAMA_VOCAB_TYPE_UGM;

            // default special tokens
            vocab.special_bos_id  = -1;
            vocab.special_eos_id  = 1;
            vocab.special_unk_id  = 2;
            vocab.special_sep_id  = -1;
            vocab.special_pad_id  = 0;
            vocab.special_cls_id  = -1;
            vocab.special_mask_id = -1;

            const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
            if (add_space_prefix_keyidx != -1) {
                vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
            } // The default value of add_space_prefix is true.

            const int remove_extra_whitespaces_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS).c_str());
            if (remove_extra_whitespaces_keyidx != -1) {
                vocab.tokenizer_remove_extra_whitespaces = gguf_get_val_bool(ctx, remove_extra_whitespaces_keyidx);
            } // The default value of remove_extra_whitespaces is false.

            const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
            if (precompiled_charsmap_keyidx != -1) {
                size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
                const char * precompiled_charsmap = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
                vocab.precompiled_charsmap.assign(precompiled_charsmap, precompiled_charsmap + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN
                // correct endianness of data in precompiled_charsmap binary blob
                uint32_t * xcda_blob_size = (uint32_t *) &vocab.precompiled_charsmap[0];
                *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
                assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
                size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
                uint32_t * xcda_array = (uint32_t *) &vocab.precompiled_charsmap[sizeof(uint32_t)];
                for (size_t i = 0; i < xcda_array_size; ++i) {
                    xcda_array[i] = __builtin_bswap32(xcda_array[i]);
                }
#endif
            }
        } else {
            throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
        }
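For orientation, the blob parsed above has a simple three-part layout; a hedged sketch of it as consumed by this code (field names are descriptive, not GGUF keys):

// layout of tokenizer.ggml.precompiled_charsmap as read by the branch above:
// [ uint32 xcda_blob_size ][ xcda_blob_size bytes of packed uint32 XCDA entries ]
// [ NUL-terminated replacement strings, indexed by XCDA leaf values ... ]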
@ -4952,6 +5070,10 @@ static void llm_load_vocab(
        vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        vocab.tokenizer_add_bos = true;
        vocab.tokenizer_add_eos = false;
    } else if (vocab.type == LLAMA_VOCAB_TYPE_UGM) {
        vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        vocab.tokenizer_add_bos = false;
        vocab.tokenizer_add_eos = true;
    } else {
        vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
    }
@ -13213,12 +13335,18 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
}

static bool llama_is_unused_token(const llama_vocab& vocab, llama_token id) {
    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
}

static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
    GGML_ASSERT(llama_is_byte_token(vocab, id));
    const auto & token_data = vocab.id_to_token.at(id);
    switch (llama_vocab_get_type(vocab)) {
        case LLAMA_VOCAB_TYPE_SPM:
        case LLAMA_VOCAB_TYPE_UGM: {
            auto buf = token_data.text.substr(3, 2);
            return strtol(buf.c_str(), NULL, 16);
        }
@ -13238,7 +13366,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
    static const char * hex = "0123456789ABCDEF";
    switch (llama_vocab_get_type(vocab)) {
        case LLAMA_VOCAB_TYPE_SPM:
        case LLAMA_VOCAB_TYPE_UGM: {
            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
            auto token = vocab.token_to_id.find(buf);
            if (token != vocab.token_to_id.end()) {
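Both cases rely on the SentencePiece byte-fallback token text format "<0xXX>"; a small standalone sketch of the round-trip implemented by these two helpers (illustrative byte value):

#include <cstdlib>
#include <cstdio>

int main() {
    // encode byte 0x41 into its token text, as llama_byte_to_token does
    const char * hex = "0123456789ABCDEF";
    unsigned char ch = 0x41;
    const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };  // "<0x41>"
    // decode it back, as llama_token_to_byte does: two hex digits start at offset 3
    unsigned char back = (unsigned char) strtol(buf + 3, NULL, 16);
    printf("%s -> 0x%02X\n", buf, back);  // prints "<0x41> -> 0x41"
    return 0;
}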
@ -13826,6 +13955,383 @@ struct llm_tokenizer_wpm {
    const llama_vocab & vocab;
};

struct naive_trie {
    naive_trie() : has_value(false), value(0) {
    }
    void insert(const char * key, size_t len, int32_t value = 0) {
        if (len == 0) {
            this->has_value = true;
            this->value = value;
            return;
        }
        char c = key[0];
        auto res = children.find(c);
        if (res != children.end()) {
            res->second.insert(key + 1, len - 1, value);
        } else {
            auto res = children.insert(std::make_pair(c, naive_trie()));
            res.first->second.insert(key + 1, len - 1, value);
        }
    }
    std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) {
        if (len == 0 || offset == len) {
            return std::make_pair(key, offset);
        }
        char c = key[offset];
        auto res = children.find(c);
        if (res != children.end()) {
            return res->second.get_longest_prefix(key, len, offset + 1);
        } else {
            return std::make_pair(key, offset);
        }
    }
    struct naive_trie * traverse(const char c) {
        auto res = children.find(c);
        if (res != children.end()) {
            return &res->second;
        } else {
            return NULL;
        }
    }
    std::map<char, struct naive_trie> children;
    bool has_value;
    llama_token value;
};
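A small standalone sketch of how this trie behaves (hypothetical token ids; illustration, not part of the change):

// build a trie over two overlapping token strings and walk it byte by byte
naive_trie t;
t.insert("ab",  2, 10);   // token id 10
t.insert("abc", 3, 11);   // token id 11

struct naive_trie * node = t.traverse('a');   // inner node, no value yet
node = node ? node->traverse('b') : NULL;     // node->has_value == true, node->value == 10
node = node ? node->traverse('c') : NULL;     // node->has_value == true, node->value == 11

// get_longest_prefix stops as soon as no child matches the next byte
auto match = t.get_longest_prefix("abx", 3);  // match.second == 2 (matched "ab")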
struct llm_tokenizer_ugm {
    llm_tokenizer_ugm(const llama_vocab & vocab) : vocab(vocab) {
        if (vocab.precompiled_charsmap.size() > 0) {
            size_t charsmap_offset = 0;

            // First four bytes of precompiled_charsmap contain the length of the binary
            // blob containing XOR-compressed compact double array (XCDA) entries
            uint32_t xcda_blob_size = *(const uint32_t *) &vocab.precompiled_charsmap[0];
            charsmap_offset += sizeof(xcda_blob_size);
            if (xcda_blob_size + charsmap_offset >= vocab.precompiled_charsmap.size()) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
            }

            // Next xcda_blob_size bytes contain entries of XOR-compressed compact
            // double array (XCDA). Each entry is bit-packed into a 32-bit integer.
            xcda_array = (const uint32_t *) &vocab.precompiled_charsmap[charsmap_offset];
            xcda_array_size = xcda_blob_size / sizeof(uint32_t);
            charsmap_offset += xcda_blob_size;

            // Remaining bytes of precompiled charsmap contain null-terminated
            // replacement strings for prefixes matched by the XCDA.
            prefix_replacements = &vocab.precompiled_charsmap[charsmap_offset];
            prefix_replacements_size = vocab.precompiled_charsmap.size() - charsmap_offset;
        }

        for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
            const auto &token_data = vocab.id_to_token[id];

            if (llama_is_normal_token(vocab, id)) {
                min_score = std::min<float>(min_score, token_data.score);
                max_score = std::max<float>(max_score, token_data.score);
            }

            if (llama_is_normal_token(vocab, id) ||
                llama_is_user_defined_token(vocab, id) ||
                llama_is_unused_token(vocab, id)) {
                token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
            }

            if (llama_is_user_defined_token(vocab, id)) {
                user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
            }
        }

        unknown_token_score = min_score - unknown_token_score_penalty;
    }
    /* This implementation is based on SentencePiece optimized Viterbi algorithm for
     * unigram language models. The general idea is to:
     * - move along the input sequence in steps of one UTF code point,
     * - at each step find all possible tokenizations of the prefix by
     *   traversing the tokens trie,
     * - for each tokenization store the best one so far (by higher score)
     * - use the position in sequence after given token as an index to store
     *   results
     * - if there was no valid tokenization of the current UTF code point
     *   then use unknown token with additional score penalty
     * After processing the whole sequence we backtrack from the end to get
     * the best tokenization.
     */
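    /* Hypothetical mini-trace (illustration only, not from this change):
     * with tokens "a" (score -1), "ab" (score -3), "c" (score -1) and input "abc":
     * - offset 0: the trie yields "a" (best sum -1 at offset 1) and "ab" (best sum -3 at offset 2)
     * - offset 1: no token starts with "b", so the unknown-token fallback scores far below -3
     *   and offset 2 keeps the sum -3 coming from "ab"
     * - offset 2: "c" extends -3 to -4 at offset 3
     * Backtracking from offset 3 yields [ "ab", "c" ].
     */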
    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
        // normalize the input first
        std::string normalized;
        normalize(text, &normalized);
        size_t input_len = normalized.size();

        // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {0, 0, -FLT_MAX});
        // at the beginning tokenization score is zero
        tokenization_results[0] = { 0, 0, 0 };

        for (size_t input_offset = 0; input_offset < input_len;) {
            size_t prefix_offset = input_offset;
            // calculate how many code units are in the currently processed UTF code point
            size_t n_utf8_code_units = std::min<size_t>(utf8_len(normalized[input_offset]), input_len - input_offset);

            // traverse the token matcher trie to find a matching token
            bool single_codepoint_token_found = false;
            const struct best_tokenization & current_best = tokenization_results[input_offset];
            struct naive_trie * node = token_matcher.traverse(normalized[prefix_offset++]);

            while (prefix_offset <= input_len && node != NULL) {
                // check if we found valid token in prefix
                if (node->has_value) {
                    // check if it corresponds to the whole UTF code point
                    if (prefix_offset - input_offset == n_utf8_code_units) {
                        single_codepoint_token_found = true;
                    }
                    llama_token token_id = node->value;
                    const auto &token_data = vocab.id_to_token[token_id];

                    // we set the user-defined token scores to 0 to make them more likely to be selected
                    // (normal token scores are log probabilities, so they are negative)
                    // score type is double here to make tokenization results exactly
                    // the same as in the HF tokenizer using SentencePiece
                    const double token_score = llama_is_user_defined_token(vocab, token_id) ? 0.0 : token_data.score;
                    const double challenger_score = current_best.score_sum + token_score;
                    struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                    if (challenger_score > current_champ.score_sum) {
                        struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
                        current_champ = challenger;
                    }
                }
                node = node->traverse(normalized[prefix_offset++]);
            }

            // if we didn't find a valid token corresponding to the whole UTF code point
            // then use unknown token as the tokenization of this UTF code point
            if (!single_codepoint_token_found) {
                const double challenger_score = current_best.score_sum + unknown_token_score;
                prefix_offset = input_offset + n_utf8_code_units;
                struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                if (challenger_score > current_champ.score_sum) {
                    struct best_tokenization challenger = { vocab.special_unk_id, input_offset, (float) challenger_score };
                    current_champ = challenger;
                }
            }

            // move to the next UTF code point
            input_offset += n_utf8_code_units;
        }

        // now backtrack from the end to gather token ids of the best tokenization
        // merge sequences of consecutive unknown tokens into single unknown tokens
        bool is_prev_unknown = false;
        for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
            bool is_unknown = tokenization.token_id == vocab.special_unk_id;
            if (!(is_prev_unknown && is_unknown)) {
                output.push_back(tokenization.token_id);
            }
            if (tokenization.input_offset == 0) {
                break;
            }
            is_prev_unknown = is_unknown;
        }

        // reverse the output since we added tokens starting from the end of the input
        std::reverse(output.begin(), output.end());
    }
private:
    const llama_vocab & vocab;

    // helper structure for returning normalization results
    struct normalization_result {
        const char * normalized;
        size_t normalized_len;
        size_t consumed_input;
    };

    void normalize(const std::string& input, std::string * normalized) {
        normalized->clear();
        normalized->reserve(input.size() * 3);

        const std::string space = vocab.tokenizer_escape_whitespaces ? escaped_space : " ";

        bool shall_prepend_space = !vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
        bool shall_append_space = vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
        bool shall_merge_spaces = vocab.tokenizer_remove_extra_whitespaces;

        bool is_space_prepended = false;
        bool processing_non_ws = false;

        size_t input_len = input.size();

        for (size_t input_offset = 0; input_offset < input_len; ) {
            auto norm_res = normalize_prefix(input, input_offset);
            for (size_t i = 0; i < norm_res.normalized_len; i++) {
                char c = norm_res.normalized[i];
                if (c != ' ') {
                    if (!processing_non_ws) {
                        processing_non_ws = true;
                        if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) {
                            normalized->append(space);
                            is_space_prepended = true;
                        }
                    }
                    normalized->push_back(c);
                } else {
                    if (processing_non_ws) {
                        processing_non_ws = false;
                    }
                    if (!shall_merge_spaces) {
                        normalized->append(space);
                    }
                }
            }

            input_offset += norm_res.consumed_input;
        }

        if (shall_append_space) {
            normalized->append(space);
        }
    }
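    // Illustration (hypothetical input, not from this change): with whitespace escaping,
    // space prefixing and remove_extra_whitespaces all enabled, "  Hello   world "
    // normalizes to "▁Hello▁world": every run of spaces collapses to a single
    // escaped space (U+2581), including the prepended one.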
    /*
     * This structure is a view wrapper for XOR-compressed double array (XCDA)
     * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
     * Each bit-packed entry contains:
     * - BASE array value in bits 10-30
     * - LCHECK array value in bits 0-7
     * - LEAF array value in bit 9
     * Entries containing indexes of replacement sequences have set bit 31
     */
    struct xcda_array_view {
    public:
        xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) {
        }
        uint32_t get_base(size_t index) {
            uint32_t packed_node = get_node(index);
            return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6);
        }
        uint32_t get_lcheck(size_t index) {
            uint32_t packed_node = get_node(index);
            return packed_node & ((1U << 31) | 0xff);
        }
        bool get_leaf(size_t index) {
            uint32_t packed_node = get_node(index);
            return (packed_node >> 8) & 1;
        }
        uint32_t get_value(size_t index) {
            uint32_t packed_node = get_node(index);
            return packed_node & ((1U << 31) - 1);
        }
    private:
        uint32_t get_node(size_t index) {
            if (index > xcda_array_size) {
                throw std::runtime_error("Index out of array bounds in XCDA array!");
            }
            return xcda_array[index];
        }
        const uint32_t * xcda_array;
        size_t xcda_array_size;
    };
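    // Worked decode of one hypothetical packed entry (illustration only):
    // for packed_node = 0x00000441,
    //   LCHECK = packed_node & 0xff     = 0x41 ('A')
    //   LEAF   = (packed_node >> 8) & 1 = 0
    //   BASE   = packed_node >> 10      = 1  (bit 9 clear, so no extra left shift)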
    struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
        if (input_offset == input.size()) {
            return { &input[input_offset], 0, 0 };
        }

        // if input prefix matches some user-defined token return this token as normalization result
        auto user_defined_token_match = user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
        if (user_defined_token_match.second > 0) {
            return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
        }

        size_t longest_prefix_length = 0;
        size_t longest_prefix_offset = 0;

        if (xcda_array_size > 0) {
            struct xcda_array_view xcda_view(xcda_array, xcda_array_size);

            // Find the longest normalized sequence matching the input prefix by walking
            // the XOR-compressed compact double array (XCDA) starting from the root node
            // We find the index of the next node by calculating BASE[s] ^ c where s is
            // the index of the previous node and c is a numerical character value
            uint32_t node_index = 0;
            // get BASE of the root node
            node_index = xcda_view.get_base(node_index);
            for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) {
                unsigned char c = input[prefix_offset];
                if (c == 0) {
                    break;
                }
                node_index ^= c;
                // if value of LCHECK is not c it means that this is not a child of
                // the previous node, so we stop matching
                if (xcda_view.get_lcheck(node_index) != c) {
                    break;
                }
                bool is_leaf = xcda_view.get_leaf(node_index);
                // get BASE of the current node
                node_index ^= xcda_view.get_base(node_index);
                // if LEAF of the current node is true, it means that its BASE points to the node
                // containing index of replacement sequence for currently matched input prefix
                if (is_leaf)
                {
                    longest_prefix_length = prefix_offset - input_offset + 1;
                    // get index of replacement sequence for currently matched input prefix
                    longest_prefix_offset = xcda_view.get_value(node_index);
                }
            }
        }

        if (longest_prefix_length > 0) {
            // we have a match, so return the replacement sequence
            if (longest_prefix_offset >= prefix_replacements_size) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
            }
            const char * prefix_replacement = &prefix_replacements[longest_prefix_offset];
            return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
        } else {
            // check if the input prefix contains a valid sequence of UTF-8 code units
            try {
                // if yes, return this sequence unmodified
                size_t prefix_offset = input_offset;
                unicode_cpt_from_utf8(input, prefix_offset);
                return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
            } catch(std::invalid_argument & ex) {
                // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
                return { "\xEF\xBF\xBD", 3, 1 };
            }
        }
    }
    // escaped space symbol - U+2581 (Lower One Eighth Block)
    const std::string escaped_space = "\xE2\x96\x81";

    const char * prefix_replacements = NULL;
    size_t prefix_replacements_size = 0;

    const uint32_t * xcda_array = NULL;
    size_t xcda_array_size = 0;

    struct naive_trie user_defined_token_matcher;

    // this structure stores the best tokenization so far at input_offset
    struct best_tokenization {
        llama_token token_id;
        size_t input_offset;
        float score_sum;
    };

    float min_score = FLT_MAX;
    float max_score = -FLT_MAX;

    float unknown_token_score_penalty = 10.0;
    float unknown_token_score;

    struct naive_trie token_matcher;
};
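Taken together, the pieces above form the whole UGM path; a minimal usage sketch (assumes a loaded llama_vocab with type LLAMA_VOCAB_TYPE_UGM; illustration, not part of the change):

// tokenize one normalized fragment with the UGM tokenizer
std::vector<llama_vocab::id> ids;
llm_tokenizer_ugm tokenizer(vocab);       // builds the trie and XCDA views once
tokenizer.tokenize("Hello world", ids);   // ids holds the best-scoring Viterbi segmentation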
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
@ -14086,6 +14592,39 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                    output.push_back(vocab.special_sep_id);
                }
            } break;
        case LLAMA_VOCAB_TYPE_UGM:
            {
                llm_tokenizer_ugm tokenizer(vocab);

                if (add_special && vocab.tokenizer_add_bos != 0) {
                    GGML_ASSERT(vocab.special_bos_id != -1);
                    output.push_back(vocab.special_bos_id);
                }

                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
#endif
                        tokenizer.tokenize(raw_text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                    }
                }

                if (add_special && vocab.tokenizer_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
                    LLAMA_LOG_WARN(
                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                        "Are you sure this is what you want?\n", __FUNCTION__);
                }

                if (add_special && vocab.tokenizer_add_eos == 1) {
                    GGML_ASSERT(vocab.special_eos_id != -1);
                    output.push_back(vocab.special_eos_id);
                }
            } break;
        case LLAMA_VOCAB_TYPE_NONE:
            GGML_ASSERT(false);
    }
@ -14500,7 +15039,8 @@ struct llama_grammar * llama_grammar_init(
            continue;
        }
        if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
            LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
            return nullptr;
        }
    }
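Note the behavior change: initialization failures are now reported with a null return instead of an exception, so callers must check the result; a minimal sketch using the identifiers from the tests later in this commit:

// the error path is now a null return instead of a thrown runtime_error
struct llama_grammar * grammar = llama_grammar_init(
    grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
if (grammar == nullptr) {
    // handle the failed initialization (e.g. reject the request)
}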
@ -16963,6 +17503,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
        case LLM_ARCH_BLOOM:
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_T5:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
@ -18658,6 +19199,10 @@ llama_token llama_token_eot(const struct llama_model * model) {
    return model->vocab.special_eot_id;
}

llama_token llama_token_pad(const struct llama_model * model) {
    return model->vocab.special_pad_id;
}

int32_t llama_tokenize(
        const struct llama_model * model,
                      const char * text,
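A one-line usage sketch for the new accessor (assumes a loaded model; not part of the change):

// query the model's padding token, e.g. when padding batches
llama_token pad_id = llama_token_pad(model);  // -1 when the vocab defines no PAD token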
@ -18724,7 +19269,8 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
    if (0 <= token && token < llama_n_vocab(model)) {
        switch (llama_vocab_get_type(model->vocab)) {
            case LLAMA_VOCAB_TYPE_WPM:
            case LLAMA_VOCAB_TYPE_SPM:
            case LLAMA_VOCAB_TYPE_UGM: {
                // NOTE: we accept all unsupported token types,
                // suppressing them like CONTROL tokens.
                if (llama_is_normal_token(model->vocab, token)) {
@ -18818,10 +19364,10 @@ static int32_t llama_chat_apply_template_internal(
        if (add_ass) {
            ss << "<|im_start|>assistant\n";
        }
    } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl.find("[INST]") != std::string::npos) {
        // llama2 template and its variants
        // [variant] support system message
        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos || tmpl == "mistral";
        // [variant] space before + after response
        bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
        // [variant] add BOS inside history
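A short usage sketch for the newly accepted template alias (public llama_chat_apply_template API; buffer size illustrative):

// format a conversation with the "mistral" alias added above
llama_chat_message msgs[] = {
    { "system", "You are a helpful assistant" },
    { "user",   "Hello" },
};
char buf[1024];
int32_t n = llama_chat_apply_template(nullptr, "mistral", msgs, 2, true, buf, sizeof(buf));
// n is the number of bytes needed for the formatted chat (negative on failure)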
llama.h

@ -67,6 +67,7 @@ extern "C" {
        LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
        LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
        LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
        LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
    };

    // pre-tokenization types

@ -857,6 +858,7 @@ extern "C" {
    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
    LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
    LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding

    // Returns -1 if unknown, 1 for true or 0 for false.
    LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
@ -924,6 +926,12 @@ extern "C" {
    // Grammar
    //

    /// Initialize a llama_grammar.
    ///
    /// @param rules The rule elements of the grammar to initialize.
    /// @param n_rules The number of rules.
    /// @param start_rule_index The index of the root rule (the starting point of the grammar).
    /// @return The initialized llama_grammar or nullptr if initialization failed.
    LLAMA_API struct llama_grammar * llama_grammar_init(
            const llama_grammar_element ** rules,
                                 size_t    n_rules,
@ -7,6 +7,7 @@
#include <cassert>

#include "llama.h"
#include "common.h"

int main(void) {
    llama_chat_message conversation[] = {

@ -119,5 +120,24 @@ int main(void) {
        std::cout << output << "\n-------------------------\n";
        assert(output == expected);
    }

    // test llama_chat_format_single
    std::cout << "\n\n=== llama_chat_format_single ===\n\n";
    std::vector<llama_chat_msg> chat2;
    chat2.push_back({"system", "You are a helpful assistant"});
    chat2.push_back({"user", "Hello"});
    chat2.push_back({"assistant", "I am assistant"});
    llama_chat_msg new_msg{"user", "How are you"};

    auto fmt_single = [&](std::string tmpl) {
        auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
        std::cout << "fmt_single(" << tmpl << ")\n" << output << "\n-------------------------\n";
        return output;
    };
    assert(fmt_single("chatml") == "<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
    assert(fmt_single("llama2") == "[INST] How are you [/INST]");
    assert(fmt_single("gemma") == "<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
    assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");

    return 0;
}
@ -36,10 +36,10 @@ static llama_grammar* build_grammar(const std::string & grammar_str) {
static bool test_build_grammar_fails(const std::string & grammar_str) {
    fprintf(stderr, "⚫ Testing failure for grammar: %s\n", grammar_str.c_str());
    bool grammar_fails = false;
    llama_grammar * grammar = build_grammar(grammar_str);
    if (grammar != nullptr) {
        fprintf(stderr, "  ❌ Expected build failure, but succeeded\n");
    } else {
        grammar_fails = true;
        fprintf(stdout, "  ✅︎\n");
    }
@ -148,6 +148,250 @@ static void test_schema(const std::string & test_desc, const std::string & schem
}

static void test_simple_grammar() {
    test_schema(
        "min 0",
        R"""({
            "type": "integer",
            "minimum": 0
        })""",
        // Passing strings
        {
            "0",
            "10",
            "12",
            "10000",
        },
        // Failing strings
        {
            "-1",
            "-10",
            "-10000",
            "-100000000000000000000000000000000",
            "100000000000000000000000000000000",
            "00",
            "01",
            "-0",
        }
    );
    test_schema(
        "min 2",
        // Schema
        R"""({
            "type": "integer",
            "minimum": 2
        })""",
        // Passing strings
        {
            "2",
            "3",
            "4",
            "10",
            "20",
            "1234567890000000",
        },
        // Failing strings
        {
            "0",
            "1",
            "-1",
            "-100",
            "0",
            "1",
            "01",
            "02",
            "12345678900000000",
        }
    );
    test_schema(
        "min 456",
        R"""({
            "type": "integer",
            "minimum": 456
        })""",
        // Passing strings
        {
            "456",
            "4560",
            "457",
            "460",
            "500",
        },
        // Failing strings
        {
            "455",
            "356",
            "50",
            "050",
            "-1",
            "-456",
        }
    );
    test_schema(
        "min -123",
        R"""({
            "type": "integer",
            "minimum": -123
        })""",
        // Passing strings
        {
            "-123",
            "-122",
            "-11",
            "-1",
            "0",
            "1",
            "123",
            "1234",
            "2345",
        },
        // Failing strings
        {
            "-1234",
            "-124",
        }
    );

    test_schema(
        "max 9999",
        // Schema
        R"""({
            "type": "integer",
            "maximum": 9999
        })""",
        // Passing strings
        {
            "-99999",
            "0",
            "9999",
        },
        // Failing strings
        {
            "10000",
            "99991",
        }
    );
    test_schema(
        "max -9999",
        // Schema
        R"""({
            "type": "integer",
            "maximum": -9999
        })""",
        // Passing strings
        {
            "-10000",
            "-9999",
        },
        // Failing strings
        {
            "-9998",
            "0",
            "9999",
        }
    );
    test_schema(
        "min 5 max 30",
        // Schema
        R"""({
            "type": "integer",
            "minimum": 5,
            "maximum": 30
        })""",
        // Passing strings
        {
            "5",
            "10",
            "30",
        },
        // Failing strings
        {
            "05",
            "4",
            "-1",
            "31",
            "123",
            "0123",
        }
    );
    test_schema(
        "min -1 max 1",
        R"""({
            "type": "integer",
            "minimum": -1,
            "maximum": 1
        })""",
        // Passing strings
        {
            "-1",
            "0",
            "1",
        },
        // Failing strings
        {
            "-11",
            "-10",
            "-2",
            "2",
            "10",
            "11",
        }
    );
    test_schema(
        "min -123 max 42",
        R"""({
            "type": "integer",
            "minimum": -123,
            "maximum": 42
        })""",
        // Passing strings
        {
            "-123",
            "-122",
            "-13",
            "-11",
            "-2",
            "-1",
            "0",
            "1",
            "5",
            "10",
            "39",
            "40",
            "42",
        },
        // Failing strings
        {
            "-0123",
            "-124",
            "-1123",
            "-200",
            "43",
            "123",
            "0123",
        }
    );
    test_schema(
        "exclusive min / max",
        // Schema
        R"""({
            "type": "integer",
            "exclusiveMinimum": 0,
            "exclusiveMaximum": 10000
        })""",
        // Passing strings
        {
            "1",
            "9999",
        },
        // Failing strings
        {
            "0",
            "01",
            "10000",
            "99999",
        }
    );

    // Test case for a simple grammar
    test_grammar(
        "simple grammar",
@ -80,6 +80,232 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        runner(tc);
    };

    test({
        SUCCESS,
        "min 0",
        R"""({
            "type": "integer",
            "minimum": 0
        })""",
        R"""(
            root ::= ([0] | [1-9] [0-9]{0,15}) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min 1",
        R"""({
            "type": "integer",
            "minimum": 1
        })""",
        R"""(
            root ::= ([1-9] [0-9]{0,15}) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min 3",
        R"""({
            "type": "integer",
            "minimum": 3
        })""",
        R"""(
            root ::= ([1-2] [0-9]{1,15} | [3-9] [0-9]{0,15}) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min 9",
        R"""({
            "type": "integer",
            "minimum": 9
        })""",
        R"""(
            root ::= ([1-8] [0-9]{1,15} | [9] [0-9]{0,15}) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min 10",
        R"""({
            "type": "integer",
            "minimum": 10
        })""",
        R"""(
            root ::= ([1] ([0-9]{1,15}) | [2-9] [0-9]{1,15}) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min 25",
        R"""({
            "type": "integer",
            "minimum": 25
        })""",
        R"""(
            root ::= ([1] [0-9]{2,15} | [2] ([0-4] [0-9]{1,14} | [5-9] [0-9]{0,14}) | [3-9] [0-9]{1,15}) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "max 30",
        R"""({
            "type": "integer",
            "maximum": 30
        })""",
        R"""(
            root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-2] [0-9] | [3] "0")) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min -5",
        R"""({
            "type": "integer",
            "minimum": -5
        })""",
        R"""(
            root ::= ("-" ([0-5]) | [0] | [1-9] [0-9]{0,15}) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min -123",
        R"""({
            "type": "integer",
            "minimum": -123
        })""",
        R"""(
            root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0] | [1-9] [0-9]{0,15}) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "max -5",
        R"""({
            "type": "integer",
            "maximum": -5
        })""",
        R"""(
            root ::= ("-" ([0-4] [0-9]{1,15} | [5-9] [0-9]{0,15})) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "max 1",
        R"""({
            "type": "integer",
            "maximum": 1
        })""",
        R"""(
            root ::= ("-" [1-9] [0-9]{0,15} | [0-1]) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "max 100",
        R"""({
            "type": "integer",
            "maximum": 100
        })""",
        R"""(
            root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-8] [0-9] | [9] [0-9]) | "100") space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min 0 max 23",
        R"""({
            "type": "integer",
            "minimum": 0,
            "maximum": 23
        })""",
        R"""(
            root ::= ([0-9] | ([1] [0-9] | [2] [0-3])) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min 15 max 300",
        R"""({
            "type": "integer",
            "minimum": 15,
            "maximum": 300
        })""",
        R"""(
            root ::= (([1] ([5-9]) | [2-9] [0-9]) | ([1-2] [0-9]{2} | [3] "00")) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min 5 max 30",
        R"""({
            "type": "integer",
            "minimum": 5,
            "maximum": 30
        })""",
        R"""(
            root ::= ([5-9] | ([1-2] [0-9] | [3] "0")) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min -123 max 42",
        R"""({
            "type": "integer",
            "minimum": -123,
            "maximum": 42
        })""",
        R"""(
            root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0-9] | ([1-3] [0-9] | [4] [0-2])) space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min -10 max 10",
        R"""({
            "type": "integer",
            "minimum": -10,
            "maximum": 10
        })""",
        R"""(
            root ::= ("-" ([0-9] | "10") | [0-9] | "10") space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        FAILURE,
        "unknown type",

@ -422,6 +648,44 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
        )"""
    });

    test({
        SUCCESS,
        "min + max items with min + max values across zero",
        R"""({
            "items": {
                "type": "integer",
                "minimum": -12,
                "maximum": 207
            },
            "minItems": 3,
            "maxItems": 5
        })""",
        R"""(
            item ::= ("-" ([0-9] | "1" [0-2]) | [0-9] | ([1-8] [0-9] | [9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7])) space
            root ::= "[" space item ("," space item){2,4} "]" space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "min + max items with min + max values",
        R"""({
            "items": {
                "type": "integer",
                "minimum": 12,
                "maximum": 207
            },
            "minItems": 3,
            "maxItems": 5
        })""",
        R"""(
            item ::= (([1] ([2-9]) | [2-9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7])) space
            root ::= "[" space item ("," space item){2,4} "]" space
            space ::= | " " | "\n" [ \t]{0,20}
        )"""
    });

    test({
        SUCCESS,
        "simple regexp",
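A hedged driver sketch for the schemas exercised above (assumes the json_schema_to_grammar helper from common/json-schema-to-grammar.h and nlohmann::ordered_json; the exact signature is an assumption here):

#include "json-schema-to-grammar.h"
#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    // convert the "min 25" schema from the tests above into GBNF
    auto schema = nlohmann::ordered_json::parse(R"""({
        "type": "integer",
        "minimum": 25
    })""");
    std::cout << json_schema_to_grammar(schema) << "\n";
    return 0;
}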
@ -116,6 +116,10 @@ int main()
    std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
    grammar = llama_grammar_init(
        grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
    if (grammar == nullptr)
    {
        throw std::runtime_error("Failed to initialize llama_grammar");
    }

    std::vector<std::vector<llama_grammar_element>> expected_stacks = {
        {
@ -23,7 +23,7 @@ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
    return result;
}

uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
    assert(offset < utf8.size());
    if (!(utf8[offset + 0] & 0x80)) {
        auto result = utf8[offset + 0];

@ -48,6 +48,7 @@ struct codepoint_flags {

std::string unicode_cpt_to_utf8(uint32_t cp);
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);

std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
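With unicode_cpt_from_utf8 now exported from unicode.h (the static qualifier was dropped above), callers can walk a UTF-8 string one code point at a time; a minimal sketch (illustration only):

#include "unicode.h"
#include <cstdio>
#include <string>

int main() {
    std::string s = "a\xE2\x96\x81z";   // 'a', U+2581, 'z'
    size_t offset = 0;
    while (offset < s.size()) {
        uint32_t cpt = unicode_cpt_from_utf8(s, offset);  // advances offset past the code point
        printf("U+%04X\n", cpt);
    }
    return 0;
}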