llama, main : constrain sampling to grammar
commit fd0eb663ce (parent 72ff5282bf)
9 changed files with 662 additions and 1 deletion
Makefile (5 changes)

@@ -250,6 +250,9 @@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

@@ -260,7 +263,7 @@ clean:
 # Examples
 #

-main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+main: examples/main/main.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
examples/CMakeLists.txt

@@ -13,6 +13,8 @@ set(TARGET common)
 add_library(${TARGET} OBJECT
     common.h
     common.cpp
+    grammar-parser.h
+    grammar-parser.cpp
     )

 if (BUILD_SHARED_LIBS)
examples/common.cpp

@@ -388,6 +388,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.input_suffix = argv[i];
+        } else if (arg == "--grammar") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.grammar = argv[i];
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             gpt_print_usage(argc, argv, default_params);
@@ -458,6 +464,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, " modifies the likelihood of token appearing in the completion,\n");
     fprintf(stderr, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
     fprintf(stderr, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
+    fprintf(stderr, " --grammar GRAMMAR BNF-like grammar (TODO explain) to constrain generations\n");
     fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     fprintf(stderr, " --no-penalize-nl do not penalize newline token\n");
examples/common.h

@@ -52,6 +52,7 @@ struct gpt_params {
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
     std::string input_prefix = "";      // string to prefix user inputs with
     std::string input_suffix = "";      // string to suffix user inputs with
+    std::string grammar = "";           // optional BNF-like grammar to constrain sampling
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

     std::string lora_adapter = ""; // lora adapter path
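An illustrative aside, not part of the diff: gpt_params::grammar holds the raw text given via --grammar, and (as the examples/main changes below show) it is parsed with grammar-parser and used to constrain sampling whenever it is non-empty. A minimal grammar that restricts output to a bare yes/no answer might look like the following; the accepted syntax is defined by examples/grammar-parser.cpp, and main looks up the start rule by the name "root":

    root ::= "yes" | "no"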
examples/grammar-parser.cpp (new file, 315 lines)

@@ -0,0 +1,315 @@
#include "grammar-parser.h"
#include <cstdint>
#include <cwchar>
#include <string>
#include <utility>

namespace grammar_parser {
    uint16_t get_symbol_id(parse_state & state, const char * src, size_t len) {
        uint16_t next_id = static_cast<uint16_t>(state.symbol_ids.size());
        auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
        return result.first->second;
    }

    uint16_t generate_symbol_id(parse_state & state, const std::string & base_name) {
        uint16_t next_id = static_cast<uint16_t>(state.symbol_ids.size());
        state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
        return next_id;
    }

    bool is_word_char(char c) {
        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
    }

    int hex_to_int(char c) {
        if ('a' <= c && c <= 'f') {
            return c - 'a' + 10;
        } else if ('A' <= c && c <= 'F') {
            return c - 'A' + 10;
        } else if ('0' <= c && c <= '9') {
            return c - '0';
        }
        return -1;
    }

    const char * parse_space(const char * src) {
        const char * pos = src;
        // TODO: support newlines in some cases
        while (*pos == ' ' || *pos == '\t') {
            pos++;
        }
        return pos;
    }

    std::pair<const char *, const char *> parse_name(const char * src) {
        const char * pos = src;
        while (is_word_char(*pos)) {
            pos++;
        }
        if (pos == src) {
            throw std::string("expecting name at ") + src;
        }
        return std::make_pair(pos, parse_space(pos));
    }

    std::pair<uint16_t, const char *> parse_char(const char * src) {
        if (*src == '\\') {
            char esc = src[1];
            if (esc == 'x') {
                int first = hex_to_int(src[2]);
                if (first > -1) {
                    int second = hex_to_int(src[3]);
                    if (second > -1) {
                        return std::make_pair((first << 4) + second, src + 4);
                    }
                }
                throw std::string("expecting \\xNN at ") + src;
            } else if (esc == '"' || esc == '[' || esc == ']') {
                return std::make_pair(esc, src + 2);
            } else if (esc == 'r') {
                return std::make_pair('\r', src + 2);
            } else if (esc == 'n') {
                return std::make_pair('\n', src + 2);
            } else if (esc == 't') {
                return std::make_pair('\t', src + 2);
            }
            throw std::string("unknown escape at ") + src;
        } else if (*src) {
            return std::make_pair(*src, src + 1);
        }
        throw std::string("unexpected end of input");
    }

    const char * parse_alternates(
            parse_state & state,
            const char * src,
            const std::string & rule_name,
            uint16_t rule_id);

    const char * parse_sequence(
            parse_state & state,
            const char * src,
            const std::string & rule_name,
            std::vector<uint16_t> & outbuf) {
        size_t out_start = outbuf.size();

        // sequence size, will be replaced at end when known
        outbuf.push_back(0);

        size_t last_sym_start = outbuf.size();
        const char * pos = src;
        while (*pos) {
            if (*pos == '"') { // literal string
                pos++;
                last_sym_start = outbuf.size();
                while (*pos != '"') {
                    auto char_pair = parse_char(pos);
                    pos = char_pair.second;

                    // each char of a literal is encoded as a "range" of char - char
                    outbuf.push_back(2);
                    outbuf.push_back(char_pair.first);
                    outbuf.push_back(char_pair.first);
                }
                pos = parse_space(pos + 1);
            } else if (*pos == '[') { // char range(s)
                pos++;
                last_sym_start = outbuf.size();
                // num chars in range - replaced at end of loop
                outbuf.push_back(0);
                while (*pos != ']') {
                    auto char_pair = parse_char(pos);
                    pos = char_pair.second;

                    outbuf.push_back(char_pair.first);
                    if (pos[0] == '-' && pos[1] != ']') {
                        auto endchar_pair = parse_char(pos + 1);
                        pos = endchar_pair.second;
                        outbuf.push_back(endchar_pair.first);
                    } else {
                        // chars that aren't part of a c1-c2 range are just doubled (i.e., c-c)
                        outbuf.push_back(char_pair.first);
                    }
                }
                // replace num chars with actual
                outbuf[last_sym_start] = static_cast<uint16_t>(outbuf.size() - last_sym_start - 1);
                pos = parse_space(pos + 1);
            } else if (is_word_char(*pos)) { // rule reference
                auto name_pair = parse_name(pos);
                uint16_t ref_rule_id = get_symbol_id(state, pos, name_pair.first - pos);
                pos = name_pair.second;
                last_sym_start = outbuf.size();
                outbuf.push_back(1);
                outbuf.push_back(ref_rule_id);
            } else if (*pos == '(') { // grouping
                // parse nested alternates into synthesized rule
                pos = parse_space(pos + 1);
                uint16_t sub_rule_id = generate_symbol_id(state, rule_name);
                pos = parse_alternates(state, pos, rule_name, sub_rule_id);
                last_sym_start = outbuf.size();
                // output reference to synthesized rule
                outbuf.push_back(1);
                outbuf.push_back(sub_rule_id);
                if (*pos != ')') {
                    throw std::string("expecting ')' at ") + pos;
                }
                pos = parse_space(pos + 1);
            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
                if (outbuf.size() - out_start - 1 == 0) {
                    throw std::string("expecting preceeding item to */+/? at ") + pos;
                }
                std::vector<uint16_t> & out_grammar = state.out_grammar;

                // apply transformation to previous symbol (last_sym_start -
                // end) according to rewrite rules:
                // S* --> S' ::= S S' |
                // S+ --> S' ::= S S' | S
                // S? --> S' ::= S |
                uint16_t sub_rule_id = generate_symbol_id(state, rule_name);
                out_grammar.push_back(sub_rule_id);
                size_t sub_rule_start = out_grammar.size();
                // placeholder for size of 1st alternate
                out_grammar.push_back(0);
                // add preceding symbol to generated rule
                out_grammar.insert(out_grammar.end(), outbuf.begin() + last_sym_start, outbuf.end());
                if (*pos == '*' || *pos == '+') {
                    // cause generated rule to recurse
                    out_grammar.push_back(1);
                    out_grammar.push_back(sub_rule_id);
                }
                // apply actual size
                out_grammar[sub_rule_start] = out_grammar.size() - sub_rule_start;
                // mark end of 1st alternate
                out_grammar.push_back(0);
                sub_rule_start = out_grammar.size();
                // placeholder for size of 2nd alternate
                out_grammar.push_back(0);
                if (*pos == '+') {
                    // add preceding symbol as alternate only for '+'
                    out_grammar.insert(out_grammar.end(), outbuf.begin() + last_sym_start, outbuf.end());
                }
                // apply actual size of 2nd alternate
                out_grammar[sub_rule_start] = out_grammar.size() - sub_rule_start;
                // mark end of 2nd alternate, then end of rule
                out_grammar.push_back(0);
                out_grammar.push_back(0);

                // in original rule, replace previous symbol with reference to generated rule
                outbuf.resize(last_sym_start);
                outbuf.push_back(1);
                outbuf.push_back(sub_rule_id);

                pos = parse_space(pos + 1);
            } else {
                break;
            }
        }
        // apply actual size of this alternate sequence
        outbuf[out_start] = static_cast<uint16_t>(outbuf.size() - out_start);
        // mark end of alternate
        outbuf.push_back(0);
        return pos;
    }

    const char * parse_alternates(
            parse_state & state,
            const char * src,
            const std::string & rule_name,
            uint16_t rule_id) {
        std::vector<uint16_t> outbuf;
        const char * pos = parse_sequence(state, src, rule_name, outbuf);
        while (*pos == '|') {
            pos = parse_space(pos + 1);
            pos = parse_sequence(state, pos, rule_name, outbuf);
        }
        state.out_grammar.push_back(rule_id);
        state.out_grammar.insert(state.out_grammar.end(), outbuf.begin(), outbuf.end());
        state.out_grammar.push_back(0);
        return pos;
    }

    const char * parse_rule(parse_state & state, const char * src) {
        auto name_pair = parse_name(src);
        const char * pos = name_pair.second;
        size_t name_len = name_pair.first - src;
        uint16_t rule_id = get_symbol_id(state, src, name_len);
        const std::string name(src, name_len);

        if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
            throw std::string("expecting ::= at ") + pos;
        }
        pos = parse_space(pos + 3);

        pos = parse_alternates(state, pos, name, rule_id);

        if (*pos == '\r') {
            pos += pos[1] == '\n' ? 2 : 1;
        } else if (*pos == '\n') {
            pos++;
        } else if (*pos) {
            throw std::string("expecting newline or end at ") + pos;
        }
        return parse_space(pos);
    }

    parse_state parse(const char * src) {
        parse_state state;
        const char * pos = parse_space(src);
        while (*pos) {
            pos = parse_rule(state, pos);
        }
        state.out_grammar.push_back(0xffff);
        return state;
    }

    const uint16_t * print_rule(
            FILE * file,
            const uint16_t * base,
            const uint16_t * src,
            const std::map<uint16_t, std::string> & symbol_id_names) {
        uint16_t rule_id = *src;
        fprintf(file, "<%zu>%s ::= ", src - base, symbol_id_names.at(rule_id).c_str());
        const uint16_t * pos = src + 1;
        while (*pos) {
            if (pos - 1 > src) {
                fprintf(file, "| ");
            }
            pos++; // sequence size, not needed here
            while (*pos) {
                if (*pos == 1) {
                    uint16_t ref_rule_id = pos[1];
                    fprintf(file, "<%zu>%s ", pos - base, symbol_id_names.at(ref_rule_id).c_str());
                    pos += 2;
                } else {
                    fprintf(file, "<%zu>[", pos - base);
                    uint16_t num_chars = *pos;
                    pos++;

                    for (uint16_t i = 0; i < num_chars; i += 2) {
                        fprintf(file, "%lc-", static_cast<wint_t>(pos[i])); // REVIEW
                        if (i + 1 < num_chars) {
                            fprintf(file, "%lc", static_cast<wint_t>(pos[i + 1]));
                        }
                    }
                    fprintf(file, "] ");
                    pos += num_chars;
                }
            }
            pos++;
        }
        fprintf(file, "\n");
        return pos + 1;
    }

    void print_grammar(FILE * file, const parse_state & state) {
        std::map<uint16_t, std::string> symbol_id_names;
        for (auto kv : state.symbol_ids) {
            symbol_id_names[kv.second] = kv.first;
        }
        const uint16_t * pos = state.out_grammar.data();
        while (*pos != 0xffff) {
            pos = print_rule(file, state.out_grammar.data(), pos, symbol_id_names);
        }
    }
}
examples/grammar-parser.h (new file, 26 lines)

@@ -0,0 +1,26 @@
// Implements a parser for an extended Backus-Naur form (BNF), producing the
// binary context-free grammar format specified by llama.h. Supports character
// ranges, grouping, and repetition operators. As an example, a grammar for
// arithmetic might look like:
//
// root  ::= expr
// expr  ::= term ([-+*/] term)*
// term  ::= num | "(" space expr ")" space
// num   ::= [0-9]+ space
// space ::= [ \t\n]*

#pragma once
#include <vector>
#include <map>
#include <cstdint>
#include <string>

namespace grammar_parser {
    struct parse_state {
        std::map<std::string, uint16_t> symbol_ids;
        std::vector<uint16_t> out_grammar;
    };

    parse_state parse(const char * src);
    void print_grammar(FILE * file, const parse_state & state);
}
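As a minimal usage sketch (mine, not part of the diff), assuming it is compiled and linked together with examples/grammar-parser.cpp: the arithmetic grammar from the header comment above is parsed into the binary form and dumped back in human-readable form with print_grammar. parse throws a std::string on malformed input.

    #include <cstdio>
    #include <string>
    #include "grammar-parser.h"

    int main() {
        // the arithmetic grammar from the examples/grammar-parser.h header comment
        const char * src =
            "root  ::= expr\n"
            "expr  ::= term ([-+*/] term)*\n"
            "term  ::= num | \"(\" space expr \")\" space\n"
            "num   ::= [0-9]+ space\n"
            "space ::= [ \\t\\n]*\n";
        try {
            grammar_parser::parse_state state = grammar_parser::parse(src);
            // dump the binary grammar back in a human-readable form
            grammar_parser::print_grammar(stderr, state);
        } catch (const std::string & err) {
            fprintf(stderr, "grammar parse error: %s\n", err.c_str());
            return 1;
        }
        return 0;
    }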
examples/main/main.cpp

@@ -6,6 +6,7 @@
 #include "common.h"
 #include "llama.h"
 #include "build-info.h"
+#include "grammar-parser.h"

 #include <cassert>
 #include <cinttypes>
@@ -291,6 +292,17 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
     fprintf(stderr, "\n\n");

+    grammar_parser::parse_state parsed_grammar;
+    llama_grammar * grammar = NULL;
+    if (!params.grammar.empty()) {
+        parsed_grammar = grammar_parser::parse(params.grammar.c_str());
+        fprintf(stderr, "%s: grammar:\n", __func__);
+        grammar_parser::print_grammar(stderr, parsed_grammar);
+        fprintf(stderr, "\n");
+        grammar = llama_grammar_init(
+            parsed_grammar.out_grammar.data(), parsed_grammar.symbol_ids.at("root"));
+    }
+
     // TODO: replace with ring-buffer
     std::vector<llama_token> last_n_tokens(n_ctx);
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
@@ -454,6 +466,10 @@ int main(int argc, char ** argv) {
                 logits[llama_token_nl()] = nl_logit;
             }

+            if (grammar != NULL) {
+                llama_sample_grammar(ctx, &candidates_p, grammar);
+            }
+
             if (temp <= 0) {
                 // Greedy sampling
                 id = llama_sample_token_greedy(ctx, &candidates_p);
@@ -479,6 +495,10 @@ int main(int argc, char ** argv) {
             }
             // printf("`%d`", candidates_p.size);

+            if (grammar != NULL) {
+                id = llama_grammar_accept_token(ctx, grammar, id);
+            }
+
             last_n_tokens.erase(last_n_tokens.begin());
             last_n_tokens.push_back(id);
         }
@@ -609,6 +629,17 @@ int main(int argc, char ** argv) {
         }

         if (n_past > 0) {
+            if (is_interacting) {
+                // reset grammar state if we're restarting generation
+                if (!params.grammar.empty()) {
+                    parsed_grammar = grammar_parser::parse(params.grammar.c_str());
+                    if (grammar != NULL) {
+                        llama_grammar_free(grammar);
+                    }
+                    grammar = llama_grammar_init(
+                        parsed_grammar.out_grammar.data(), parsed_grammar.symbol_ids.at("root"));
+                }
+            }
             is_interacting = false;
         }
     }
@@ -638,5 +669,9 @@ int main(int argc, char ** argv) {
     llama_print_timings(ctx);
     llama_free(ctx);

+    if (grammar != NULL) {
+        llama_grammar_free(grammar);
+    }
+
     return 0;
 }
llama.cpp (240 changes)

@@ -1821,6 +1821,168 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
     return output;
 }

+//
+// grammar - internal
+//
+
+struct llama_grammar {
+    const std::vector<const uint16_t *> rules;
+    std::vector<std::vector<const uint16_t *>> stacks;
+};
+
+// transforms a grammar pushdown stack into N possible stacks, all terminating
+// at a character range (terminal element)
+static void llama_grammar_advance_stack(
+        const std::vector<const uint16_t *> & rules,
+        const std::vector<const uint16_t *> & stack,
+        std::vector<std::vector<const uint16_t *>> & new_stacks) {
+
+    if (stack.empty()) {
+        new_stacks.push_back(stack);
+        return;
+    }
+
+    const uint16_t * pos = stack.back();
+
+    if (*pos == 1) {
+        // rule reference, apply rule to stack
+        const uint16_t * subpos = rules[pos[1]] + 1;
+        while (*subpos) {
+            // init new stack without the top (pos)
+            std::vector<const uint16_t *> new_stack(stack.begin(), stack.end() - 1);
+            if (pos[2]) {
+                // if the rule ref is followed by another element, add that to stack
+                new_stack.push_back(pos + 2);
+            }
+            if (subpos[1]) {
+                // if the referenced rule is nonempty, add that to the stack
+                new_stack.push_back(subpos + 1);
+            }
+            llama_grammar_advance_stack(rules, new_stack, new_stacks);
+            subpos += 1 + *subpos;
+        }
+    } else {
+        // rule element size > 1 -> character reference
+        LLAMA_ASSERT(*pos);
+        new_stacks.push_back(stack);
+    }
+}
+
+// takes a set of possible pushdown stacks on a grammar, which are required to
+// be positioned at a character range (see `llama_grammar_advance_stack`), and
+// produces the N possible stacks if the given char is accepted at those
+// positions
+static std::vector<std::vector<const uint16_t *>> llama_grammar_accept(
+        const std::vector<const uint16_t *> & rules,
+        const std::vector<std::vector<const uint16_t *>> & stacks,
+        const uint16_t chr) {

+    std::vector<std::vector<const uint16_t *>> new_stacks;
+
+    for (const auto & stack : stacks) {
+        if (stack.empty()) {
+            continue;
+        }
+
+        const uint16_t * pos = stack.back();
+        const uint16_t num_chars = *pos;
+        LLAMA_ASSERT(num_chars > 1);
+
+        pos++; // skip num chars indicator
+        bool found = false;
+        // loop over the inclusive char pairs to find a match on the given char
+        for (int i = 0; i < num_chars; i += 2) {
+            if (pos[i] <= chr && (i + 1 == num_chars || chr <= pos[i + 1])) {
+                found = true;
+                break;
+            }
+        }
+        if (!found) {
+            continue;
+        }
+
+        // advance past char range, updating top of stack to next element, if any
+        pos += num_chars;
+        std::vector<const uint16_t *> new_stack(stack.begin(), stack.end() - 1);
+        if (*pos) {
+            new_stack.push_back(pos);
+        }
+        llama_grammar_advance_stack(rules, new_stack, new_stacks);
+    }
+
+    return new_stacks;
+}
+
+// returns `true` if one of the pushdown stacks can accept the given char.
+static bool llama_grammar_peek(
+        const std::vector<std::vector<const uint16_t *>> & stacks,
+        const uint16_t chr) {
+
+    for (const auto & stack : stacks) {
+        if (stack.empty()) {
+            if (!chr) {
+                return true;
+            }
+        } else {
+            const uint16_t * pos = stack.back();
+            const uint16_t num_chars = *pos;
+            LLAMA_ASSERT(num_chars > 1);
+
+            pos++;
+            for (int i = 0; i < num_chars; i += 2) {
+                if (pos[i] <= chr && (i + 1 == num_chars || chr <= pos[i + 1])) {
+                    return true;
+                }
+            }
+        }
+    }
+    return false;
+}
+
+
+//
+// grammar - external
+//
+
+struct llama_grammar * llama_grammar_init(const uint16_t * src, uint16_t start_rule_id) {
+    const uint16_t * pos = src;
+    std::vector<const uint16_t *> rules;
+
+    // build `rules` as list of pointers to rules embedded in binary grammar `src`
+    while (*pos != 0xffff) {
+        uint16_t rule_id = *pos;
+        if (rules.size() <= rule_id) {
+            rules.resize(rule_id + 1);
+        }
+        rules[rule_id] = pos;
+        // skip rule id
+        pos++;
+        // skip rule alternates
+        while (*pos) {
+            pos += 1 + *pos;
+        }
+        // skip 0 denoting end of rule
+        pos++;
+    }
+
+    // TODO: handle if start rule has alternates
+    const uint16_t * start_rule = rules[start_rule_id];
+
+    // rule starts with rule id and 1st alternate's size; skip that so initial
+    // stack starts at 1st element in 1st alternate
+    LLAMA_ASSERT(start_rule[0] == start_rule_id && start_rule[1]);
+    const std::vector<const uint16_t *> stack = { start_rule + 2 };
+
+    std::vector<std::vector<const uint16_t *>> stacks;
+    llama_grammar_advance_stack(rules, stack, stacks);
+
+    return new llama_grammar{ rules, stacks };
+}
+
+void llama_grammar_free(struct llama_grammar * grammar) {
+    delete grammar;
+}
+
 //
 // sampling
 //
@@ -2097,6 +2259,30 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
     }
 }

+void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
+    assert(ctx);
+    const int64_t t_start_sample_us = ggml_time_us();
+    const llama_token eos = llama_token_eos();
+    // since many llama tokens are prefixed with a single space, special case a lookahead on ' '
+    const auto stacks_after_space = llama_grammar_accept(grammar->rules, grammar->stacks, ' ');
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        const llama_token id = candidates->data[i].id;
+        const char * str = llama_token_to_str(ctx, id);
+
+        // prune tokens based on first char only - in `llama_grammar_accept_token` we will find the
+        // full matching prefix of the selected token
+        const bool valid = str[0] == ' '
+            ? llama_grammar_peek(stacks_after_space, str[1])
+            : llama_grammar_peek(grammar->stacks, id == eos ? 0 : str[0]);
+
+        if (!valid) {
+            candidates->data[i].logit = -INFINITY;
+        }
+    }
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
+
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
     assert(ctx);
@@ -2223,6 +2409,60 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
     return result;
 }

+
+llama_token llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    if (token == llama_token_eos()) {
+        for (const auto & stack : grammar->stacks) {
+            if (stack.empty()) {
+                return token;
+            }
+            LLAMA_ASSERT(false);
+        }
+    }
+
+    const char * str = llama_token_to_str(ctx, token);
+    const char * suffix = str;
+
+    // Find prefix of selected token that matches grammar, expecting at least 1 char
+    auto new_stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *suffix);
+    LLAMA_ASSERT(!new_stacks.empty());
+    if (*suffix) {
+        ++suffix;
+        for ( ; *suffix; ++suffix) {
+            new_stacks = llama_grammar_accept(grammar->rules, new_stacks, *suffix);
+            if (new_stacks.empty()) {
+                break;
+            }
+        }
+    }
+
+    // if full token is matched, accept new stacks
+    if (!(*suffix)) {
+        grammar->stacks = new_stacks;
+        return token;
+    }
+
+    // otherwise, tokenize the string prefix that did match
+    llama_token tokens[32]; // TODO - determine actual max token size
+    const std::string prefix_str(str, suffix - str);
+    int n_tokens = llama_tokenize(ctx, prefix_str.c_str(), tokens, 32, false);
+    if (n_tokens < 1) {
+        return token; // REVIEW
+    }
+
+    // accept the first token of the matching prefix into the grammar
+    llama_token first_prefix_token = tokens[0];
+    const char * first_prefix_str = llama_token_to_str(ctx, first_prefix_token);
+    for ( ; *first_prefix_str; ++first_prefix_str) {
+        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *first_prefix_str);
+        LLAMA_ASSERT(!grammar->stacks.empty());
+    }
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    return first_prefix_token;
+}
+
 //
 // quantization
 //
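Aside: a condensed sketch of my own (not code from the diff) of how these three entry points are meant to compose inside a sampling loop, mirroring the examples/main changes shown earlier; ctx, candidates_p, and grammar are assumed to be set up as in main.cpp, and the surrounding generation loop is omitted.

    // grammar is non-NULL only when the user supplied --grammar
    if (grammar != NULL) {
        // mask out (logit = -INFINITY) candidates whose first character cannot
        // be accepted by any of the grammar's current pushdown stacks
        llama_sample_grammar(ctx, &candidates_p, grammar);
    }

    llama_token id = llama_sample_token(ctx, &candidates_p);

    if (grammar != NULL) {
        // advance the grammar by the sampled token; if only a prefix of the
        // token's text matches, a token for that matching prefix is returned instead
        id = llama_grammar_accept_token(ctx, grammar, id);
    }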
llama.h (32 changes)

@@ -55,6 +55,8 @@ extern "C" {

     struct llama_context;

+    struct llama_grammar;
+
     typedef int llama_token;

     typedef struct llama_token_data {
@@ -233,6 +235,30 @@ extern "C" {
     LLAMA_API llama_token llama_token_eos();
     LLAMA_API llama_token llama_token_nl();
+
+    // Grammar
+    //
+    // Accepts a binary encoding of a context-free grammar. The returned struct can be used to
+    // constrain sampled tokens (see below).
+    //
+    // The binary format represents one or more production rules, each with one or more alternate
+    // defininitions:
+    //
+    //     (<rule_id: u16> (<alt_size: u16> <alt_size * u16>)+ 0000)+ FFFF
+    //
+    // rule_ids should be assigned sequentially from zero but may appear out of order. Each
+    // rule alternate is a sequence of zero or more symbols, each prefixed with size:
+    //
+    //     (<sym_size: u16> <sym_size * u16>)* 0000
+    //
+    // A symbol of size 1 is interpreted as a rule reference (whose value is the single following
+    // u16). Symbols sized greater than 1 are interpreted as inclusive pairs of 16-bit chars to
+    // match. Note that symbol sizes greater than 7FFF are reserved for future use.
+    //
+    // The provided `src` must be kept valid for the lifetime of the `llama_grammar`.
+    //
+    LLAMA_API struct llama_grammar * llama_grammar_init(const uint16_t * src, uint16_t start_rule_id);
+    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);

     // Sampling functions

     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
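A worked example of my own (an aside, not part of llama.h): hand-encoding the single rule `root ::= [0-9]` under the format described above, following the layout that examples/grammar-parser.cpp emits, and passing it to llama_grammar_init. Linking against libllama built from this commit is assumed.

    #include <cstdint>
    #include "llama.h"

    int main() {
        // root ::= [0-9]
        static const uint16_t digits[] = {
            0,            // rule_id 0 ("root")
            4,            // only alternate: 4 u16 words follow
            2, '0', '9',  //   one symbol of size 2 -> inclusive char pair '0'..'9'
            0,            //   0000 terminating the alternate's symbol list
            0,            // 0000 terminating rule 0
            0xFFFF,       // FFFF terminating the grammar
        };

        struct llama_grammar * grammar = llama_grammar_init(digits, /*start_rule_id=*/0);
        llama_grammar_free(grammar);
        return 0;
    }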
@@ -257,6 +283,9 @@ extern "C" {
     LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
     LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+
+    /// @details Apply constraints from grammar
+    LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);

     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -278,6 +307,9 @@ extern "C" {
     /// @details Randomly selects a token from the candidates based on their probabilities.
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
+
+    /// @details Accepts the sampled token into the grammar, possibly transforming to a new token
+    LLAMA_API llama_token llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);

     // Performance information
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
     LLAMA_API void llama_reset_timings(struct llama_context * ctx);