Merge branch 'master' into gg/llama-kv-cache

ggml-ci
2025-02-02 11:07:05 +02:00 · 2025-02-02 11:07:05 +02:00 · 74b0807245
commit 74b0807245
parent 3e23be7911 ff227703d6
36 changed files with 2392 additions and 157 deletions
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@ -65,6 +65,7 @@ add_library(${TARGET} STATIC
    console.h
    json-schema-to-grammar.cpp
    json.hpp
+    llguidance.cpp
    log.cpp
    log.h
    minja.hpp
@ -91,6 +92,33 @@ if (LLAMA_CURL)
    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
 endif ()

+if (LLAMA_LLGUIDANCE)
+    include(ExternalProject)
+    set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
+    set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
+    ExternalProject_Add(llguidance_ext
+        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
+        # v0.6.12:
+        GIT_TAG ced1c9023d47ec194fa977932d35ce65c2ebfc09
+        PREFIX ${CMAKE_BINARY_DIR}/llguidance
+        SOURCE_DIR ${LLGUIDANCE_SRC}
+        BUILD_IN_SOURCE TRUE
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND cargo build --release
+        INSTALL_COMMAND ""
+        BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/libllguidance.a ${LLGUIDANCE_PATH}/llguidance.h
+        UPDATE_COMMAND ""
+    )
+    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_LLGUIDANCE)
+
+    add_library(llguidance STATIC IMPORTED)
+    set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/libllguidance.a)
+    add_dependencies(llguidance llguidance_ext)
+
+    target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance)
+endif ()
+
 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features   (${TARGET} PUBLIC cxx_std_17)
 target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
--- a/common/chat-template.hpp
+++ b/common/chat-template.hpp
@ -283,10 +283,12 @@ class chat_template {
                    message["role"] = "user";
                    auto obj = json {
                        {"tool_response", {
-                            {"tool", message.at("name")},
                            {"content", message.at("content")},
                        }},
                    };
+                    if (message.contains("name")) {
+                        obj["tool_response"]["name"] = message.at("name");
+                    }
                    if (message.contains("tool_call_id")) {
                        obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
                    }
--- a/common/chat.cpp
+++ b/common/chat.cpp
@ -384,14 +384,19 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com
            tool_rules.push_back(
                builder.add_rule(
                    name + "-call",
-                    "\"{\" ( \"\\\"type\\\": \\\"function\\\", \" | space ) "
+                    "\"{\" space "
+                    "( \"\\\"type\\\":\" space \"\\\"function\\\",\" space )? "
                    "\"\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " +
                        builder.add_schema(name + "-args", parameters) +
                    " \"}\""));
            data.grammar_triggers.push_back({"{\"name\": \"" + name + "\"", /* .at_start = */ true});
        });
        data.grammar_triggers.push_back({"{\"name\":", /* .at_start = */ true});
+        data.grammar_triggers.push_back({"{\n  \"name\":", /* .at_start = */ true});
+        data.grammar_triggers.push_back({"{\n    \"name\":", /* .at_start = */ true});
        data.grammar_triggers.push_back({"{\"type\": \"function\"", /* .at_start = */ true});
+        data.grammar_triggers.push_back({"{\n  \"type\": \"function\"", /* .at_start = */ true});
+        data.grammar_triggers.push_back({"{\n    \"type\": \"function\"", /* .at_start = */ true});
        if (!builtin_tools.empty()) {
            data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false});
        }
@ -586,9 +591,17 @@ static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & in
        }
    }
    // TODO: tighten & simplify.
-    auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex);
-    res.content = content;
-    return res;
+    try {
+        auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex);
+        res.content = content + res.content;
+        return res;
+    } catch (const std::exception & e) {
+        LOG_ERR("Failed to parse functionary v3.2 input: %s\n", e.what());
+        common_chat_msg res;
+        res.role = "assistant";
+        res.content = input;
+        return res;
+    }
 }

 static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@ -991,7 +991,14 @@ public:
    }
 };

-std::string json_schema_to_grammar(const json & schema) {
+std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
+#ifdef LLAMA_USE_LLGUIDANCE
+    if (!force_gbnf) {
+        return "%llguidance {}\nstart: %json " + schema.dump();
+    }
+#else
+    (void)force_gbnf;
+#endif // LLAMA_USE_LLGUIDANCE
    return build_grammar([&](const common_grammar_builder & callbacks) {
        auto copy = schema;
        callbacks.resolve_refs(copy);
--- a/common/json-schema-to-grammar.h
+++ b/common/json-schema-to-grammar.h
@ -5,7 +5,8 @@
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"

-std::string json_schema_to_grammar(const nlohmann::ordered_json & schema);
+std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
+                                   bool force_gbnf = false);

 struct common_grammar_builder {
    std::function<std::string(const std::string &, const std::string &)> add_rule;
--- a/common/llguidance.cpp
+++ b/common/llguidance.cpp
@ -0,0 +1,270 @@
+#include "sampling.h"
+#include "log.h"
+
+#ifdef LLAMA_USE_LLGUIDANCE
+
+#    include "llguidance.h"
+#    include <cmath>
+
+struct llama_sampler_llg {
+    const llama_vocab * vocab;
+    std::string         grammar_kind;
+    std::string         grammar_data;
+    LlgTokenizer *      tokenizer;
+    LlgConstraint *     grammar;
+    LlgMaskResult       llg_res;
+    bool                has_llg_res;
+};
+
+static LlgConstraint * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
+                                             const char * grammar_data) {
+    LlgConstraintInit cinit;
+    llg_constraint_init_set_defaults(&cinit, tokenizer);
+    const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
+    if (log_level && *log_level) {
+        cinit.log_stderr_level = atoi(log_level);
+    }
+    auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
+    if (llg_get_error(c)) {
+        LOG_ERR("llg error: %s\n", llg_get_error(c));
+        llg_free_constraint(c);
+        return nullptr;
+    }
+    return c;
+}
+
+static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
+    return "llguidance";
+}
+
+static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
+    auto * ctx = (llama_sampler_llg *) smpl->ctx;
+    if (ctx->grammar) {
+        LlgCommitResult res;
+        llg_commit_token(ctx->grammar, token, &res);
+        ctx->has_llg_res = false;
+    }
+}
+
+static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_llg *) smpl->ctx;
+    if (ctx->grammar) {
+        if (!ctx->has_llg_res) {
+            if (llg_compute_mask(ctx->grammar, &ctx->llg_res) == 0) {
+                ctx->has_llg_res = true;
+            } else {
+                LOG_ERR("llg error: %s\n", llg_get_error(ctx->grammar));
+                llg_free_constraint(ctx->grammar);
+                ctx->grammar = nullptr;
+            }
+        }
+        if (ctx->has_llg_res) {
+            if (ctx->llg_res.is_stop) {
+                for (size_t i = 0; i < cur_p->size; ++i) {
+                    if (!llama_vocab_is_eog(ctx->vocab, cur_p->data[i].id)) {
+                        cur_p->data[i].logit = -INFINITY;
+                    }
+                }
+            } else {
+                const uint32_t * mask = ctx->llg_res.sample_mask;
+                for (size_t i = 0; i < cur_p->size; ++i) {
+                    auto token = cur_p->data[i].id;
+                    if ((mask[token / 32] & (1 << (token % 32))) == 0) {
+                        cur_p->data[i].logit = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void llama_sampler_llg_reset(llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_llg *) smpl->ctx;
+    if (!ctx->grammar) {
+        return;
+    }
+
+    auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
+    llg_free_constraint(ctx->grammar);
+    ctx->grammar     = grammar_new;
+    ctx->has_llg_res = false;
+}
+
+static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_llg *) smpl->ctx;
+
+    auto * result = llama_sampler_init_llg(ctx->vocab, nullptr, nullptr);
+
+    // copy the state
+    {
+        auto * result_ctx = (llama_sampler_llg *) result->ctx;
+
+        if (ctx->grammar) {
+            result_ctx->grammar_kind = ctx->grammar_kind;
+            result_ctx->grammar_data = ctx->grammar_data;
+            result_ctx->grammar      = llg_clone_constraint(ctx->grammar);
+            result_ctx->tokenizer    = llg_clone_tokenizer(ctx->tokenizer);
+        }
+    }
+
+    return result;
+}
+
+static void llama_sampler_llg_free(llama_sampler * smpl) {
+    const auto * ctx = (llama_sampler_llg *) smpl->ctx;
+
+    if (ctx->grammar) {
+        llg_free_constraint(ctx->grammar);
+        llg_free_tokenizer(ctx->tokenizer);
+    }
+
+    delete ctx;
+}
+
+static llama_sampler_i llama_sampler_llg_i = {
+    /* .name   = */ llama_sampler_llg_name,
+    /* .accept = */ llama_sampler_llg_accept_impl,
+    /* .apply  = */ llama_sampler_llg_apply,
+    /* .reset  = */ llama_sampler_llg_reset,
+    /* .clone  = */ llama_sampler_llg_clone,
+    /* .free   = */ llama_sampler_llg_free,
+};
+
+static size_t llama_sampler_llg_tokenize_fn(const void * user_data, const uint8_t * bytes, size_t bytes_len,
+                                            uint32_t * output_tokens, size_t output_tokens_len) {
+    const llama_vocab * vocab = (const llama_vocab *) user_data;
+    int                 r     = 0;
+    try {
+        r = llama_tokenize(vocab, (const char *) bytes, bytes_len, (int32_t *) output_tokens, output_tokens_len, false,
+                           true);
+    } catch (const std::exception & e) {
+        GGML_ABORT("llama_tokenize failed: %s\n", e.what());
+    }
+    if (r < 0) {
+        return -r;
+    }
+    return r;
+}
+
+static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab) {
+    // TODO store the tokenizer in the vocab somehow
+    static const llama_vocab * vocab_cache;
+    static LlgTokenizer *      tokenizer_cache;
+
+    if (vocab_cache == vocab) {
+        return llg_clone_tokenizer(tokenizer_cache);
+    }
+
+    auto tok_eos = llama_vocab_eot(vocab);
+    if (tok_eos == LLAMA_TOKEN_NULL) {
+        tok_eos = llama_vocab_eos(vocab);
+    }
+
+    size_t vocab_size = llama_vocab_n_tokens(vocab);
+
+    auto token_lens       = new uint32_t[vocab_size];
+    // we typically have ~7 bytes per token; let's go on the safe side here
+    auto token_bytes_size = vocab_size * 16 + 1024 * 1024;
+    auto token_bytes      = new uint8_t[token_bytes_size];
+
+    size_t offset = 0;
+    for (size_t i = 0; i < vocab_size; i++) {
+        size_t max_token = 1024;
+        if (token_bytes_size - offset < max_token) {
+            GGML_ABORT("token_bytes buffer too small\n");
+        }
+
+        llama_token token = i;
+        auto        dp    = (char *) token_bytes + offset;
+        auto        size  = llama_detokenize(vocab, &token, 1, dp, max_token, false, false);
+        if (size < 0) {
+            GGML_ABORT("llama_detokenize failed\n");
+        }
+        if (size == 0) {
+            size = llama_detokenize(vocab, &token, 1, dp + 1, max_token - 1, false, true);
+            if (size < 0) {
+                GGML_ABORT("llama_detokenize failed\n");
+            }
+            if (size != 0) {
+                *dp = '\xff';  // special token prefix marker
+                size += 1;
+            }
+        }
+
+        token_lens[i] = size;
+        offset += size;
+    }
+
+    LlgTokenizerInit tinit = {
+        /* .vocab_size                         = */ (uint32_t) vocab_size,
+        /* .tok_eos                            = */ (uint32_t) tok_eos,
+        /* .token_lens                         = */ token_lens,
+        /* .token_bytes                        = */ token_bytes,
+        /* .tokenizer_json                     = */ nullptr,
+        /* .tokenize_assumes_string            = */ true,
+        /* .tokenize_fn                        = */ llama_sampler_llg_tokenize_fn,
+        /* .use_approximate_greedy_tokenize_fn = */ false,
+        /* .tokenize_user_data                 = */ vocab,
+    };
+
+    char           error_buffer[1024];
+    LlgTokenizer * tokenizer = llg_new_tokenizer(&tinit, error_buffer, sizeof(error_buffer));
+
+    delete[] token_bytes;
+    delete[] token_lens;
+
+    if (tokenizer == nullptr) {
+        LOG_ERR("llg tokenizer error: %s\n", error_buffer);
+        return tokenizer;
+    }
+
+    if (tokenizer_cache) {
+        llg_free_tokenizer(tokenizer_cache);
+    }
+    vocab_cache     = vocab;
+    tokenizer_cache = tokenizer;
+
+    return llg_clone_tokenizer(tokenizer_cache);
+}
+
+llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * grammar_kind,
+                                       const char * grammar_data) {
+    auto * ctx = new llama_sampler_llg;
+
+    if (grammar_kind != nullptr && grammar_kind[0] != '\0') {
+        auto tokenizer = llama_sampler_llg_new_tokenizer(vocab);
+        *ctx           = {
+            /* .vocab        = */ vocab,
+            /* .grammar_kind = */ grammar_kind,
+            /* .grammar_data = */ grammar_data,
+            /* .tokenizer    = */ tokenizer,
+            /* .grammar      = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
+            /* .llg_res      = */ {},
+            /* .has_llg_res  = */ false,
+        };
+    } else {
+        *ctx = {
+            /* .vocab        = */ vocab,
+            /* .grammar_kind = */ {},
+            /* .grammar_data = */ {},
+            /* .tokenizer    = */ nullptr,
+            /* .grammar      = */ nullptr,
+            /* .llg_res      = */ {},
+            /* .has_llg_res  = */ false,
+        };
+    }
+
+    return new llama_sampler{
+        /* .iface = */ &llama_sampler_llg_i,
+        /* .ctx   = */ ctx,
+    };
+}
+
+#else
+
+llama_sampler * llama_sampler_init_llg(const llama_vocab *, const char *, const char *) {
+    LOG_WRN("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
+    return nullptr;
+}
+
+#endif  // LLAMA_USE_LLGUIDANCE
--- a/common/minja.hpp
+++ b/common/minja.hpp
@ -693,7 +693,7 @@ enum SpaceHandling { Keep, Strip, StripSpaces, StripNewline };

 class TemplateToken {
 public:
-    enum class Type { Text, Expression, If, Else, Elif, EndIf, For, EndFor, Generation, EndGeneration, Set, EndSet, Comment, Macro, EndMacro, Filter, EndFilter };
+    enum class Type { Text, Expression, If, Else, Elif, EndIf, For, EndFor, Generation, EndGeneration, Set, EndSet, Comment, Macro, EndMacro, Filter, EndFilter, Break, Continue };

    static std::string typeToString(Type t) {
        switch (t) {
@ -714,6 +714,8 @@ public:
            case Type::EndFilter: return "endfilter";
            case Type::Generation: return "generation";
            case Type::EndGeneration: return "endgeneration";
+            case Type::Break: return "break";
+            case Type::Continue: return "continue";
        }
        return "Unknown";
    }
@ -815,6 +817,22 @@ struct CommentTemplateToken : public TemplateToken {
    CommentTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Comment, location, pre, post), text(t) {}
 };

+enum class LoopControlType { Break, Continue };
+
+class LoopControlException : public std::runtime_error {
+public:
+    LoopControlType control_type;
+    LoopControlException(const std::string & message, LoopControlType control_type) : std::runtime_error(message), control_type(control_type) {}
+    LoopControlException(LoopControlType control_type)
+      : std::runtime_error((std::ostringstream() << (control_type == LoopControlType::Continue ? "continue" : "break") << " outside of a loop").str()),
+        control_type(control_type) {}
+};
+
+struct LoopControlTemplateToken : public TemplateToken {
+    LoopControlType control_type;
+    LoopControlTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, LoopControlType control_type) : TemplateToken(Type::Break, location, pre, post), control_type(control_type) {}
+};
+
 class TemplateNode {
    Location location_;
 protected:
@ -825,6 +843,12 @@ public:
    void render(std::ostringstream & out, const std::shared_ptr<Context> & context) const {
        try {
            do_render(out, context);
+        } catch (const LoopControlException & e) {
+            // TODO: make stack creation lazy. Only needed if it was thrown outside of a loop.
+            std::ostringstream err;
+            err << e.what();
+            if (location_.source) err << error_location_suffix(*location_.source, location_.pos);
+            throw LoopControlException(err.str(), e.control_type);
        } catch (const std::exception & e) {
            std::ostringstream err;
            err << e.what();
@ -897,6 +921,15 @@ public:
    }
 };

+class LoopControlNode : public TemplateNode {
+    LoopControlType control_type_;
+  public:
+    LoopControlNode(const Location & location, LoopControlType control_type) : TemplateNode(location), control_type_(control_type) {}
+    void do_render(std::ostringstream &, const std::shared_ptr<Context> &) const override {
+      throw LoopControlException(control_type_);
+    }
+};
+
 class ForNode : public TemplateNode {
    std::vector<std::string> var_names;
    std::shared_ptr<Expression> iterable;
@ -961,7 +994,12 @@ public:
                  loop.set("last", i == (n - 1));
                  loop.set("previtem", i > 0 ? filtered_items.at(i - 1) : Value());
                  loop.set("nextitem", i < n - 1 ? filtered_items.at(i + 1) : Value());
-                  body->render(out, loop_context);
+                  try {
+                      body->render(out, loop_context);
+                  } catch (const LoopControlException & e) {
+                      if (e.control_type == LoopControlType::Break) break;
+                      if (e.control_type == LoopControlType::Continue) continue;
+                  }
              }
          }
      };
@ -2159,7 +2197,7 @@ private:
      static std::regex comment_tok(R"(\{#([-~]?)(.*?)([-~]?)#\})");
      static std::regex expr_open_regex(R"(\{\{([-~])?)");
      static std::regex block_open_regex(R"(^\{%([-~])?[\s\n\r]*)");
-      static std::regex block_keyword_tok(R"((if|else|elif|endif|for|endfor|generation|endgeneration|set|endset|block|endblock|macro|endmacro|filter|endfilter)\b)");
+      static std::regex block_keyword_tok(R"((if|else|elif|endif|for|endfor|generation|endgeneration|set|endset|block|endblock|macro|endmacro|filter|endfilter|break|continue)\b)");
      static std::regex non_text_open_regex(R"(\{\{|\{%|\{#)");
      static std::regex expr_close_regex(R"([\s\n\r]*([-~])?\}\})");
      static std::regex block_close_regex(R"([\s\n\r]*([-~])?%\})");
@ -2291,6 +2329,9 @@ private:
            } else if (keyword == "endfilter") {
              auto post_space = parseBlockClose();
              tokens.push_back(std::make_unique<EndFilterTemplateToken>(location, pre_space, post_space));
+            } else if (keyword == "break" || keyword == "continue") {
+              auto post_space = parseBlockClose();
+              tokens.push_back(std::make_unique<LoopControlTemplateToken>(location, pre_space, post_space, keyword == "break" ? LoopControlType::Break : LoopControlType::Continue));
            } else {
              throw std::runtime_error("Unexpected block: " + keyword);
            }
@ -2414,6 +2455,8 @@ private:
              children.emplace_back(std::make_shared<FilterNode>(token->location, std::move(filter_token->filter), std::move(body)));
          } else if (dynamic_cast<CommentTemplateToken*>(token.get())) {
              // Ignore comments
+          } else if (auto ctrl_token = dynamic_cast<LoopControlTemplateToken*>(token.get())) {
+              children.emplace_back(std::make_shared<LoopControlNode>(token->location, ctrl_token->control_type));
          } else if (dynamic_cast<EndForTemplateToken*>(token.get())
                  || dynamic_cast<EndSetTemplateToken*>(token.get())
                  || dynamic_cast<EndMacroTemplateToken*>(token.get())
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -156,13 +156,25 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
    for (const auto & str : params.grammar_trigger_words) {
        trigger_words.push_back(str.word.c_str());
    }
+
+    struct llama_sampler * grmr;
+    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
+#ifdef LLAMA_USE_LLGUIDANCE
+        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
+#else
+        GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
+#endif // LLAMA_USE_LLGUIDANCE
+    } else {
+        grmr = params.grammar_lazy
+             ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
+                                               trigger_words.data(), trigger_words.size(),
+                                               params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
+             :      llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+    }
+
    auto * result = new common_sampler {
        /* .params = */ params,
-        /* .grmr   = */ params.grammar_lazy
-            ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
-                                              trigger_words.data(), trigger_words.size(),
-                                              params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
-            :      llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"),
+        /* .grmr   = */ grmr,
        /* .chain  = */ llama_sampler_chain_init(lparams),
        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
        /* .cur    = */ {},
--- a/common/sampling.h
+++ b/common/sampling.h
@ -102,3 +102,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr);

 std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
 std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
+
+llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
+                const char * grammar_kind, const char * grammar_data);