llama : move grammar code into llama-grammar

ggml-ci

This commit is contained in:
parent 0ddc8e361c
commit 675f305f31

12 changed files with 742 additions and 672 deletions
Makefile (18 changes)

@@ -876,6 +876,8 @@ OBJ_GGML += \
 OBJ_LLAMA = \
 	src/llama.o \
+	src/llama-vocab.o \
+	src/llama-grammar.o \
 	src/llama-sampling.o \
 	src/unicode.o \
 	src/unicode-data.o

@@ -1066,6 +1068,20 @@ src/llama.o: \
 	ggml/include/ggml-backend.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+src/llama-vocab.o: \
+	src/llama-vocab.cpp \
+	src/llama-vocab.h \
+	src/llama-impl.h \
+	include/llama.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+src/llama-grammar.o: \
+	src/llama-grammar.cpp \
+	src/llama-grammar.h \
+	src/llama-impl.h \
+	include/llama.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 src/llama-sampling.o: \
 	src/llama-sampling.cpp \
 	src/llama-sampling.h \

@@ -1448,7 +1464,7 @@ run-benchmark-matmult: llama-benchmark-matmult
 .PHONY: run-benchmark-matmult swift

 tests/test-llama-grammar: tests/test-llama-grammar.cpp \
-	$(OBJ_GGML) $(OBJ_COMMON) src/unicode.o src/unicode-data.o
+	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -330,7 +330,7 @@ static llama_token llama_sampling_sample_impl(
         llama_token_data_array single_token_data_array = { &single_token_data, 1, false };

         // Apply grammar constraints to the single token
-        llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
+        llama_grammar_sample(ctx_main, &single_token_data_array, ctx_sampling->grammar);

         // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
         bool is_valid = single_token_data_array.data[0].logit != -INFINITY;

@@ -421,7 +421,7 @@ static llama_token_data_array llama_sampling_prepare_impl(

     // apply grammar checks before sampling logic
     if (apply_grammar && ctx_sampling->grammar != NULL) {
-        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
+        llama_grammar_sample(ctx_main, &cur_p, ctx_sampling->grammar);
     }

     return cur_p;
@@ -16,20 +16,23 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
     auto decoded = decode_utf8(input_str, {});
     const auto & code_points = decoded.first;

+    llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
+
     size_t pos = 0;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        auto prev_stacks = grammar->stacks;
-        llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
-        if (grammar->stacks.empty()) {
+        const llama_grammar_rules  & prev_rules  = llama_grammar_get_rules (grammar);
+        const llama_grammar_stacks   prev_stacks = llama_grammar_get_stacks(grammar); // copy
+        llama_grammar_accept(prev_rules, prev_stacks, *it, cur_stacks);
+        if (cur_stacks.empty()) {
             error_pos = pos;
             error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
-            grammar->stacks = prev_stacks;
+            cur_stacks = prev_stacks;
             return false;
         }
         ++pos;
     }

-    for (const auto & stack : grammar->stacks) {
+    for (const auto & stack : cur_stacks) {
         if (stack.empty()) {
             return true;
         }
@@ -1003,6 +1003,18 @@ extern "C" {

     LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);

+    /// @details Apply constraints from grammar
+    LLAMA_API void llama_grammar_sample(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+      const struct llama_grammar * grammar);
+
+    /// @details Accepts the sampled token into the grammar
+    LLAMA_API void llama_grammar_accept_token(
+            struct llama_context * ctx,
+            struct llama_grammar * grammar,
+                       llama_token token);
+
     //
     // Sampling functions
     //

@@ -1121,18 +1133,6 @@ extern "C" {
             struct llama_context * ctx,
           llama_token_data_array * candidates);

-    /// @details Apply constraints from grammar
-    LLAMA_API void llama_sample_grammar(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-      const struct llama_grammar * grammar);
-
-    /// @details Accepts the sampled token into the grammar
-    LLAMA_API void llama_grammar_accept_token(
-            struct llama_context * ctx,
-            struct llama_grammar * grammar,
-                       llama_token token);
-
     //
     // Model split
     //

@@ -1175,38 +1175,41 @@ extern "C" {

 struct ggml_tensor;

+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+    struct llama_context * ctx
+);
+
 struct llama_partial_utf8 {
     uint32_t value;    // bit value so far (unshifted)
     int      n_remain; // num bytes remaining; -1 indicates invalid sequence
 };

-struct llama_grammar {
-    const std::vector<std::vector<llama_grammar_element>>   rules;
-    std::vector<std::vector<const llama_grammar_element *>> stacks;
-
-    // buffer for partially generated UTF-8 sequence from accepted tokens
-    llama_partial_utf8 partial_utf8;
-};
-
 struct llama_grammar_candidate {
     size_t               index;
     const uint32_t     * code_points;
     llama_partial_utf8   partial_utf8;
 };

-const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
-    struct llama_context * ctx
-);
+using llama_grammar_rules  = std::vector<std::vector<llama_grammar_element>>;
+using llama_grammar_stacks = std::vector<std::vector<const llama_grammar_element *>>;
+
+const llama_grammar_rules  & llama_grammar_get_rules (const struct llama_grammar * grammar);
+      llama_grammar_stacks & llama_grammar_get_stacks(      struct llama_grammar * grammar);

 void llama_grammar_accept(
-        const std::vector<std::vector<llama_grammar_element>>         & rules,
-        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const llama_grammar_rules  & rules,
+        const llama_grammar_stacks & stacks,
         const uint32_t chr,
-        std::vector<std::vector<const llama_grammar_element *>>       & new_stacks);
+              llama_grammar_stacks & new_stacks);

 std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
         const std::vector<std::vector<llama_grammar_element>> & rules,
         const std::vector<const llama_grammar_element *>      & stack,
         const std::vector<llama_grammar_candidate>            & candidates);

 std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const std::string & src,
-        llama_partial_utf8   partial_start);
+        llama_partial_utf8 partial_start);

 // Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
 // This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
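For orientation, here is a minimal sketch of how a caller drives the relocated public grammar API. The parsed grammar, context, and candidates array are assumed to come from grammar-parser and the usual sampling setup (as in the tests further down), and llama_sample_token stands in for whichever sampler is actually used:

    // sketch: constrain sampling with a grammar via the renamed entry points
    std::vector<const llama_grammar_element *> rules(parsed_grammar.c_rules());
    struct llama_grammar * grammar = llama_grammar_init(
        rules.data(), rules.size(), parsed_grammar.symbol_ids.at("root"));

    // per step: mask candidates the grammar rejects, sample, then advance the grammar
    llama_grammar_sample(ctx, &candidates, grammar);     // rejected tokens get logit -INFINITY
    llama_token tok = llama_sample_token(ctx, &candidates);
    llama_grammar_accept_token(ctx, grammar, tok);

    llama_grammar_free(grammar);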
@@ -14,6 +14,8 @@ endif()
 add_library(llama
             ../include/llama.h
             llama.cpp
+            llama-vocab.cpp
+            llama-grammar.cpp
             llama-sampling.cpp
             unicode.h
             unicode.cpp
src/llama-grammar.cpp (new file, 544 lines)

@@ -0,0 +1,544 @@
#include "llama-grammar.h"

#include "llama-vocab.h"
#include "llama-sampling.h"

#include <algorithm>

// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
        const std::string & src,
        llama_partial_utf8 partial_start) {
    static const int      lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
    const char          * pos      = src.c_str();
    std::vector<uint32_t> code_points;

    // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
    code_points.reserve(src.size() + 1);
    uint32_t value    = partial_start.value;
    int      n_remain = partial_start.n_remain;

    // continue previous decode, if applicable
    while (*pos != 0 && n_remain > 0) {
        uint8_t next_byte = static_cast<uint8_t>(*pos);
        if ((next_byte >> 6) != 2) {
            // invalid sequence, abort
            code_points.push_back(0);
            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
        }
        value = (value << 6) + (next_byte & 0x3F);
        ++pos;
        --n_remain;
    }

    if (partial_start.n_remain > 0 && n_remain == 0) {
        code_points.push_back(value);
    }

    // decode any subsequent utf-8 sequences, which may end in an incomplete one
    while (*pos != 0) {
        uint8_t first_byte = static_cast<uint8_t>(*pos);
        uint8_t highbits   = first_byte >> 4;
        n_remain           = lookup[highbits] - 1;

        if (n_remain < 0) {
            // invalid sequence, abort
            code_points.clear();
            code_points.push_back(0);
            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
        }

        uint8_t mask = (1 << (7 - n_remain)) - 1;
        value        = first_byte & mask;

        ++pos;
        while (*pos != 0 && n_remain > 0) {
            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
            ++pos;
            --n_remain;
        }
        if (n_remain == 0) {
            code_points.push_back(value);
        }
    }
    code_points.push_back(0);

    return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
}

const llama_grammar_rules & llama_grammar_get_rules(const struct llama_grammar * grammar) {
    return grammar->rules;
}

llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar) {
    return grammar->stacks;
}

// returns true iff pos points to the end of one of the definitions of a rule
static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
    switch (pos->type) {
        case LLAMA_GRETYPE_END: return true; // NOLINT
        case LLAMA_GRETYPE_ALT: return true; // NOLINT
        default:                return false;
    }
}

// returns true iff chr satisfies the char range at pos (regular or inverse range)
// asserts that pos is pointing to a char range element
static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
        const llama_grammar_element * pos,
        const uint32_t                chr) {
    bool found            = false;
    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;

    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT

    do {
        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
            // inclusive range, e.g. [a-z]
            found = found || (pos->value <= chr && chr <= pos[1].value);
            pos += 2;
        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
            // Any character matches "."
            found = true;
            pos += 1;
        } else {
            // exact char match, e.g. [a] or "a"
            found = found || pos->value == chr;
            pos += 1;
        }
    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);

    return std::make_pair(found == is_positive_char, pos);
}

// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
// range at pos (regular or inverse range)
// asserts that pos is pointing to a char range element
static bool llama_grammar_match_partial_char(
        const llama_grammar_element * pos,
        const llama_partial_utf8      partial_utf8) {
    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);

    uint32_t partial_value = partial_utf8.value;
    int      n_remain      = partial_utf8.n_remain;

    // invalid sequence or 7-bit char split across 2 bytes (overlong)
    if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
        return false;
    }

    // range of possible code points this partial UTF-8 sequence could complete to
    uint32_t low  = partial_value << (n_remain * 6);
    uint32_t high = low | ((1 << (n_remain * 6)) - 1);

    if (low == 0) {
        if (n_remain == 2) {
            low = 1 << 11;
        } else if (n_remain == 3) {
            low = 1 << 16;
        }
    }

    do {
        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
            // inclusive range, e.g. [a-z]
            if (pos->value <= high && low <= pos[1].value) {
                return is_positive_char;
            }
            pos += 2;
        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
            // Any character matches "."
            return true;
        } else {
            // exact char match, e.g. [a] or "a"
            if (low <= pos->value && pos->value <= high) {
                return is_positive_char;
            }
            pos += 1;
        }
    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);

    return !is_positive_char;
}

// transforms a grammar pushdown stack into N possible stacks, all ending
// at a character range (terminal element)
static void llama_grammar_advance_stack(
        const std::vector<std::vector<llama_grammar_element>>   & rules,
        const std::vector<const llama_grammar_element *>        & stack,
        std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
    if (stack.empty()) {
        if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
            new_stacks.emplace_back(stack);
        }
        return;
    }

    const llama_grammar_element * pos = stack.back();

    switch (pos->type) {
        case LLAMA_GRETYPE_RULE_REF: {
            const size_t                  rule_id = static_cast<size_t>(pos->value);
            const llama_grammar_element * subpos  = rules[rule_id].data();
            do {
                // init new stack without the top (pos)
                std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
                if (!llama_grammar_is_end_of_sequence(pos + 1)) {
                    // if this rule ref is followed by another element, add that to stack
                    new_stack.push_back(pos + 1);
                }
                if (!llama_grammar_is_end_of_sequence(subpos)) {
                    // if alternate is nonempty, add to stack
                    new_stack.push_back(subpos);
                }
                llama_grammar_advance_stack(rules, new_stack, new_stacks);
                while (!llama_grammar_is_end_of_sequence(subpos)) {
                    // scan to end of alternate def
                    subpos++;
                }
                if (subpos->type == LLAMA_GRETYPE_ALT) {
                    // there's another alternate def of this rule to process
                    subpos++;
                } else {
                    break;
                }
            } while (true);
            break;
        }
        case LLAMA_GRETYPE_CHAR:
        case LLAMA_GRETYPE_CHAR_NOT:
        case LLAMA_GRETYPE_CHAR_ANY:
            if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
                // only add the stack if it's not a duplicate of one we already have
                new_stacks.emplace_back(stack);
            }
            break;
        default:
            // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
            // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
            // those
            GGML_ASSERT(false);
    }
}

// takes a set of possible pushdown stacks on a grammar, which are required to
// be positioned at a character range (see `llama_grammar_advance_stack`), and
// produces the N possible stacks if the given char is accepted at those
// positions
void llama_grammar_accept(
        const std::vector<std::vector<llama_grammar_element>>         & rules,
        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
        const uint32_t                                                  chr,
        std::vector<std::vector<const llama_grammar_element *>>       & new_stacks) {
    new_stacks.clear();

    for (const auto & stack : stacks) {
        if (stack.empty()) {
            continue;
        }

        auto match = llama_grammar_match_char(stack.back(), chr);
        if (match.first) {
            const llama_grammar_element * pos = match.second;

            // update top of stack to next element, if any
            std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
            if (!llama_grammar_is_end_of_sequence(pos)) {
                new_stack.push_back(pos);
            }
            llama_grammar_advance_stack(rules, new_stack, new_stacks);
        }
    }
}

static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
        const std::vector<std::vector<llama_grammar_element>>         & rules,
        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
        const std::vector<llama_grammar_candidate>                    & candidates);

std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
        const std::vector<std::vector<llama_grammar_element>> & rules,
        const std::vector<const llama_grammar_element *>      & stack,
        const std::vector<llama_grammar_candidate>            & candidates) {
    std::vector<llama_grammar_candidate> rejects;
    rejects.reserve(candidates.size());

    if (stack.empty()) {
        for (const auto & tok : candidates) {
            if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
                rejects.push_back(tok);
            }
        }
        return rejects;
    }

    const llama_grammar_element * stack_pos = stack.back();

    std::vector<llama_grammar_candidate> next_candidates;
    next_candidates.reserve(candidates.size());

    for (const auto & tok : candidates) {
        if (*tok.code_points == 0) {
            // reached end of full codepoints in token, reject iff it ended in a partial sequence
            // that cannot satisfy this position in grammar
            if (tok.partial_utf8.n_remain != 0 &&
                    !llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
                rejects.push_back(tok);
            }
        } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
        } else {
            rejects.push_back(tok);
        }
    }

    const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;

    // update top of stack to next element, if any
    std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
    if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
        stack_after.push_back(stack_pos_after);
    }
    std::vector<std::vector<const llama_grammar_element *>> next_stacks;
    llama_grammar_advance_stack(rules, stack_after, next_stacks);

    auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
    for (const auto & tok : next_rejects) {
        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
    }

    return rejects;
}

static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
        const std::vector<std::vector<llama_grammar_element>>         & rules,
        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
        const std::vector<llama_grammar_candidate>                    & candidates) {
    GGML_ASSERT(!stacks.empty()); // REVIEW

    if (candidates.empty()) {
        return std::vector<llama_grammar_candidate>();
    }

    auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);

    for (size_t i = 1, size = stacks.size(); i < size; ++i) {
        rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
    }
    return rejects;
}

static bool llama_grammar_detect_left_recursion(
        const std::vector<std::vector<llama_grammar_element>> & rules,
        size_t                                                  rule_index,
        std::vector<bool>                                     * rules_visited,
        std::vector<bool>                                     * rules_in_progress,
        std::vector<bool>                                     * rules_may_be_empty) {
    if ((*rules_in_progress)[rule_index]) {
        return true;
    }

    (*rules_in_progress)[rule_index] = true;

    const std::vector<llama_grammar_element> & rule = rules[rule_index];

    // First check if the rule might produce the empty string. This could be done combined with the second
    // step but it's more readable as two steps.
    bool at_rule_start = true;
    for (size_t i = 0; i < rule.size(); i++) {
        if (llama_grammar_is_end_of_sequence(&rule[i])) {
            if (at_rule_start) {
                (*rules_may_be_empty)[rule_index] = true;
                break;
            }
            at_rule_start = true;
        } else {
            at_rule_start = false;
        }
    }

    // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
    // be empty)
    bool recurse_into_nonterminal = true;
    for (size_t i = 0; i < rule.size(); i++) {
        if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
            if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
                return true;
            }
            if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
                recurse_into_nonterminal = false;
            }
        } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
            recurse_into_nonterminal = true;
        } else {
            recurse_into_nonterminal = false;
        }
    }

    (*rules_in_progress)[rule_index] = false;
    (*rules_visited)[rule_index]     = true;
    return false;
}

//
// grammar - external
//

struct llama_grammar * llama_grammar_init(
        const llama_grammar_element ** rules,
        size_t                         n_rules,
        size_t                         start_rule_index) {
    const llama_grammar_element * pos;

    // copy rule definitions into vectors
    std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
    for (size_t i = 0; i < n_rules; i++) {
        for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
            vec_rules[i].push_back(*pos);
        }
        vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
    }

    // Check for left recursion
    std::vector<bool> rules_visited(n_rules);
    std::vector<bool> rules_in_progress(n_rules);
    std::vector<bool> rules_may_be_empty(n_rules);
    for (size_t i = 0; i < n_rules; i++) {
        if (rules_visited[i]) {
            continue;
        }
        if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
            LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
            return nullptr;
        }
    }

    // loop over alternates of start rule to build initial stacks
    std::vector<std::vector<const llama_grammar_element *>> stacks;
    pos = vec_rules[start_rule_index].data();
    do {
        std::vector<const llama_grammar_element *> stack;
        if (!llama_grammar_is_end_of_sequence(pos)) {
            // if alternate is nonempty, add to stack
            stack.push_back(pos);
        }
        llama_grammar_advance_stack(vec_rules, stack, stacks);
        while (!llama_grammar_is_end_of_sequence(pos)) {
            // scan to end of alternate def
            pos++;
        }
        if (pos->type == LLAMA_GRETYPE_ALT) {
            // there's another alternate def of this rule to process
            pos++;
        } else {
            break;
        }
    } while (true);

    // Important: vec_rules has to be moved here, not copied, because stacks contains
    // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
    // then the pointers would be invalidated when the local vec_rules goes out of scope.
    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
}

void llama_grammar_free(struct llama_grammar * grammar) {
    delete grammar;
}

struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };

    // redirect elements in stacks to point to new rules
    for (size_t is = 0; is < result->stacks.size(); is++) {
        for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
            for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
                for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
                    if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
                        result->stacks[is][ie] = &result->rules[ir0][ir1];
                    }
                }
            }
        }
    }

    return result;
}

void llama_grammar_sample(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
    GGML_ASSERT(ctx);
    int64_t t_start_sample_us = ggml_time_us();

    bool allow_eog = false;
    for (const auto & stack : grammar->stacks) {
        if (stack.empty()) {
            allow_eog = true;
            break;
        }
    }

    std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
    candidates_decoded.reserve(candidates->size);

    std::vector<llama_grammar_candidate> candidates_grammar;
    candidates_grammar.reserve(candidates->size);

    for (size_t i = 0; i < candidates->size; ++i) {
        const llama_token   id    = candidates->data[i].id;
        const std::string & piece = llama_get_vocab(ctx)->cache_token_to_piece.at(id);

        if (llama_token_is_eog(llama_get_model(ctx), id)) {
            if (!allow_eog) {
                candidates->data[i].logit = -INFINITY;
            }
        } else if (piece.empty() || piece[0] == 0) {
            candidates->data[i].logit = -INFINITY;
        } else {
            candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
        }
    }

    const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
    for (const auto & reject : rejects) {
        candidates->data[reject.index].logit = -INFINITY;
    }

    llama_get_sampling(ctx)->t_sample_us += ggml_time_us() - t_start_sample_us;
}

void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
    const int64_t t_start_sample_us = ggml_time_us();

    if (llama_token_is_eog(llama_get_model(ctx), token)) {
        for (const auto & stack : grammar->stacks) {
            if (stack.empty()) {
                return;
            }
        }
        GGML_ASSERT(false);
    }

    const std::string & piece = llama_get_vocab(ctx)->cache_token_to_piece.at(token);

    // Note terminating 0 in decoded string
    const auto   decoded     = decode_utf8(piece, grammar->partial_utf8);
    const auto & code_points = decoded.first;
    std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
        llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
        grammar->stacks = tmp_new_stacks;
    }
    grammar->partial_utf8 = decoded.second;
    GGML_ASSERT(!grammar->stacks.empty());

    llama_get_sampling(ctx)->t_sample_us += ggml_time_us() - t_start_sample_us;
}
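To make the partial-UTF-8 carry in decode_utf8 concrete, here is a small hypothetical example (the byte split is illustrative, not part of the commit): "é" is 0xC3 0xA9 in UTF-8, and a tokenizer may split it across two pieces.

    auto first  = decode_utf8("\xC3", {});           // incomplete: only the terminating 0 is emitted
    // first.second == llama_partial_utf8{ 0x03, 1 } -- one continuation byte still expected
    auto second = decode_utf8("\xA9", first.second); // continuation completes U+00E9
    // second.first == { 0xE9, 0 }                   -- the code point plus the terminating 0

This carried state is what lets llama_grammar_accept_token handle tokens that end mid-character, with llama_grammar_match_partial_char deciding whether some completion of the pending bytes could still satisfy the grammar.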
src/llama-grammar.h (new file, 15 lines)

@@ -0,0 +1,15 @@
#pragma once

#include "llama-impl.h"

struct llama_vocab;

struct llama_grammar {
    const llama_grammar_rules  rules;
          llama_grammar_stacks stacks;

    // buffer for partially generated UTF-8 sequence from accepted tokens
    llama_partial_utf8 partial_utf8;
};

struct llama_grammar * llama_get_grammar(struct llama_context * ctx);
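Note the asymmetry in the struct: rules is const (the fixed shape of the grammar) while stacks mutates as characters are accepted. Code outside this translation unit now reads both through the accessors; as a sketch, the completion check used by the validator and tests could be wrapped like this (hypothetical helper, not part of the commit):

    // a grammar has matched a complete input iff any live stack is empty
    static bool grammar_is_complete(struct llama_grammar * grammar) {
        for (const auto & stack : llama_grammar_get_stacks(grammar)) {
            if (stack.empty()) {
                return true;
            }
        }
        return false;
    }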
src/llama-vocab.cpp (new file, 15 lines)

@@ -0,0 +1,15 @@
#include "llama-vocab.h"

int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
    GGML_ASSERT(token_left.find(' ') == std::string::npos);
    GGML_ASSERT(token_left.find('\n') == std::string::npos);
    GGML_ASSERT(token_right.find(' ') == std::string::npos);
    GGML_ASSERT(token_right.find('\n') == std::string::npos);

    auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
    if (it == bpe_ranks.end()) {
        return -1;
    }

    return it->second;
}
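A toy illustration of find_bpe_rank (hypothetical values): bpe_ranks maps a BPE merge pair to its priority, and the asserts above require tokens that contain no spaces or newlines.

    llama_vocab vocab;
    vocab.bpe_ranks[{"t", "he"}] = 0;        // lower rank = higher-priority merge

    int r1 = vocab.find_bpe_rank("t", "he"); // 0
    int r2 = vocab.find_bpe_rank("a", "b");  // -1: pair not in the merge table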
src/llama-vocab.h (new file, 64 lines)

@@ -0,0 +1,64 @@
#pragma once

#include "llama-impl.h"

#include <string>
#include <vector>
#include <unordered_map>
#include <map>

struct llama_vocab {
    using id    = llama_token;
    using token = std::string;
    using tattr = llama_token_attr;

    struct token_data {
        token text;
        float score;
        tattr attr;
    };

    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

    int max_token_len = 0; // used for optimizing longest token search

    std::unordered_map<token, id> token_to_id;
    std::vector<token_data>       id_to_token;

    std::vector<id>    cache_special_tokens;
    std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);

    std::map<std::pair<std::string, std::string>, int> bpe_ranks;

    // default LLaMA special tokens
    id special_bos_id  = 1;
    id special_eos_id  = 2;
    id special_unk_id  = 0;
    id special_sep_id  = -1;
    id special_pad_id  = -1;
    id special_cls_id  = -1;
    id special_mask_id = -1;

    id linefeed_id       = 13;
    id special_prefix_id = -1;
    id special_suffix_id = -1;
    id special_middle_id = -1;
    id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token

    // tokenizer flags
    bool tokenizer_add_space_prefix           = false;
    bool tokenizer_add_bos                    = false;
    bool tokenizer_add_eos                    = false;
    bool tokenizer_ignore_merges              = false;
    bool tokenizer_clean_spaces               = false; // clean_up_tokenization_spaces
    bool tokenizer_remove_extra_whitespaces   = false;
    bool tokenizer_escape_whitespaces         = true;
    bool tokenizer_treat_whitespace_as_suffix = false;

    std::vector<char> precompiled_charsmap;

    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
};

const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx);
src/llama.cpp (639 changes)

@@ -1,4 +1,6 @@
 #include "llama-impl.h"
+#include "llama-vocab.h"
+#include "llama-grammar.h"
 #include "llama-sampling.h"

 #include "unicode.h"

@@ -2542,72 +2544,6 @@ struct llama_control_vector {
     }
 };

-struct llama_vocab {
-    using id    = int32_t;
-    using token = std::string;
-    using tattr = llama_token_attr;
-
-    struct token_data {
-        token text;
-        float score;
-        tattr attr;
-    };
-
-    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
-    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-
-    int max_token_len = 0; // used for optimizing longest token search
-
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_data>       id_to_token;
-
-    std::vector<id>    cache_special_tokens;
-    std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
-
-    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
-
-    // default LLaMA special tokens
-    id special_bos_id  = 1;
-    id special_eos_id  = 2;
-    id special_unk_id  = 0;
-    id special_sep_id  = -1;
-    id special_pad_id  = -1;
-    id special_cls_id  = -1;
-    id special_mask_id = -1;
-
-    id linefeed_id       = 13;
-    id special_prefix_id = -1;
-    id special_suffix_id = -1;
-    id special_middle_id = -1;
-    id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
-
-    // tokenizer flags
-    bool tokenizer_add_space_prefix           = false;
-    bool tokenizer_add_bos                    = false;
-    bool tokenizer_add_eos                    = false;
-    bool tokenizer_ignore_merges              = false;
-    bool tokenizer_clean_spaces               = false; // clean_up_tokenization_spaces
-    bool tokenizer_remove_extra_whitespaces   = false;
-    bool tokenizer_escape_whitespaces         = true;
-    bool tokenizer_treat_whitespace_as_suffix = false;
-
-    std::vector<char> precompiled_charsmap;
-
-    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
-        GGML_ASSERT(token_left.find(' ') == std::string::npos);
-        GGML_ASSERT(token_left.find('\n') == std::string::npos);
-        GGML_ASSERT(token_right.find(' ') == std::string::npos);
-        GGML_ASSERT(token_right.find('\n') == std::string::npos);
-
-        auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
-        if (it == bpe_ranks.end()) {
-            return -1;
-        }
-
-        return it->second;
-    }
-};
-
 struct llama_model {
     e_model     type = MODEL_UNKNOWN;
     llm_arch    arch = LLM_ARCH_UNKNOWN;

@@ -2696,7 +2632,12 @@ struct llama_model {
 };

 struct llama_context {
-    llama_context(const llama_model & model) : model(model), sampling(llama_n_vocab(&model)), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
+    llama_context(const llama_model & model)
+        : model(model)
+        , sampling(llama_n_vocab(&model))
+        , grammar()
+        , t_start_us(model.t_start_us)
+        , t_load_us(model.t_load_us) {}

     ~llama_context() {
         ggml_backend_sched_free(sched);

@@ -2710,9 +2651,10 @@ struct llama_context {

     const struct llama_model & model;

-    struct llama_cparams cparams;
-    struct llama_sampling sampling;
-    struct llama_kv_cache kv_self;
+    struct llama_cparams        cparams;
+    struct llama_sampling       sampling;
+    struct llama_grammar        grammar;
+    struct llama_kv_cache       kv_self;
     struct llama_control_vector cvec;

     std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;

@@ -16535,555 +16477,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
     return output;
 }

-//
-// grammar - internal
-//
-
-// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
-// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
-std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
-        const std::string & src,
-        llama_partial_utf8 partial_start) {
-    static const int      lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
-    const char          * pos      = src.c_str();
-    std::vector<uint32_t> code_points;
-    // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
-    code_points.reserve(src.size() + 1);
-    uint32_t value    = partial_start.value;
-    int      n_remain = partial_start.n_remain;
-
-    // continue previous decode, if applicable
-    while (*pos != 0 && n_remain > 0) {
-        uint8_t next_byte = static_cast<uint8_t>(*pos);
-        if ((next_byte >> 6) != 2) {
-            // invalid sequence, abort
-            code_points.push_back(0);
-            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
-        }
-        value = (value << 6) + (next_byte & 0x3F);
-        ++pos;
-        --n_remain;
-    }
-
-    if (partial_start.n_remain > 0 && n_remain == 0) {
-        code_points.push_back(value);
-    }
-
-    // decode any subsequent utf-8 sequences, which may end in an incomplete one
-    while (*pos != 0) {
-        uint8_t first_byte = static_cast<uint8_t>(*pos);
-        uint8_t highbits   = first_byte >> 4;
-        n_remain           = lookup[highbits] - 1;
-
-        if (n_remain < 0) {
-            // invalid sequence, abort
-            code_points.clear();
-            code_points.push_back(0);
-            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
-        }
-
-        uint8_t mask = (1 << (7 - n_remain)) - 1;
-        value        = first_byte & mask;
-        ++pos;
-        while (*pos != 0 && n_remain > 0) {
-            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
-            ++pos;
-            --n_remain;
-        }
-        if (n_remain == 0) {
-            code_points.push_back(value);
-        }
-    }
-    code_points.push_back(0);
-
-    return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
-}
-
-// returns true iff pos points to the end of one of the definitions of a rule
-static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
-    switch (pos->type) {
-        case LLAMA_GRETYPE_END: return true; // NOLINT
-        case LLAMA_GRETYPE_ALT: return true; // NOLINT
-        default:                return false;
-    }
-}
-
-// returns true iff chr satisfies the char range at pos (regular or inverse range)
-// asserts that pos is pointing to a char range element
-static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
-        const llama_grammar_element * pos,
-        const uint32_t                chr) {
-    bool found            = false;
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
-
-    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
-
-    do {
-        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
-            // inclusive range, e.g. [a-z]
-            found = found || (pos->value <= chr && chr <= pos[1].value);
-            pos += 2;
-        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
-            // Any character matches "."
-            found = true;
-            pos += 1;
-        } else {
-            // exact char match, e.g. [a] or "a"
-            found = found || pos->value == chr;
-            pos += 1;
-        }
-    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
-
-    return std::make_pair(found == is_positive_char, pos);
-}
-
-// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
-// range at pos (regular or inverse range)
-// asserts that pos is pointing to a char range element
-static bool llama_grammar_match_partial_char(
-        const llama_grammar_element * pos,
-        const llama_partial_utf8      partial_utf8) {
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
-    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
-
-    uint32_t partial_value = partial_utf8.value;
-    int      n_remain      = partial_utf8.n_remain;
-
-    // invalid sequence or 7-bit char split across 2 bytes (overlong)
-    if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
-        return false;
-    }
-
-    // range of possible code points this partial UTF-8 sequence could complete to
-    uint32_t low  = partial_value << (n_remain * 6);
-    uint32_t high = low | ((1 << (n_remain * 6)) - 1);
-
-    if (low == 0) {
-        if (n_remain == 2) {
-            low = 1 << 11;
-        } else if (n_remain == 3) {
-            low = 1 << 16;
-        }
-    }
-
-    do {
-        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
-            // inclusive range, e.g. [a-z]
-            if (pos->value <= high && low <= pos[1].value) {
-                return is_positive_char;
-            }
-            pos += 2;
-        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
-            // Any character matches "."
-            return true;
-        } else {
-            // exact char match, e.g. [a] or "a"
-            if (low <= pos->value && pos->value <= high) {
-                return is_positive_char;
-            }
-            pos += 1;
-        }
-    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
-
-    return !is_positive_char;
-}
-
-// transforms a grammar pushdown stack into N possible stacks, all ending
-// at a character range (terminal element)
-static void llama_grammar_advance_stack(
-        const std::vector<std::vector<llama_grammar_element>>   & rules,
-        const std::vector<const llama_grammar_element *>        & stack,
-        std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
-    if (stack.empty()) {
-        if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
-            new_stacks.emplace_back(stack);
-        }
-        return;
-    }
-
-    const llama_grammar_element * pos = stack.back();
-
-    switch (pos->type) {
-        case LLAMA_GRETYPE_RULE_REF: {
-            const size_t                  rule_id = static_cast<size_t>(pos->value);
-            const llama_grammar_element * subpos  = rules[rule_id].data();
-            do {
-                // init new stack without the top (pos)
-                std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
-                if (!llama_grammar_is_end_of_sequence(pos + 1)) {
-                    // if this rule ref is followed by another element, add that to stack
-                    new_stack.push_back(pos + 1);
-                }
-                if (!llama_grammar_is_end_of_sequence(subpos)) {
-                    // if alternate is nonempty, add to stack
-                    new_stack.push_back(subpos);
-                }
-                llama_grammar_advance_stack(rules, new_stack, new_stacks);
-                while (!llama_grammar_is_end_of_sequence(subpos)) {
-                    // scan to end of alternate def
-                    subpos++;
-                }
-                if (subpos->type == LLAMA_GRETYPE_ALT) {
-                    // there's another alternate def of this rule to process
-                    subpos++;
-                } else {
-                    break;
-                }
-            } while (true);
-            break;
-        }
-        case LLAMA_GRETYPE_CHAR:
-        case LLAMA_GRETYPE_CHAR_NOT:
-        case LLAMA_GRETYPE_CHAR_ANY:
-            if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
-                // only add the stack if it's not a duplicate of one we already have
-                new_stacks.emplace_back(stack);
-            }
-            break;
-        default:
-            // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
-            // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
-            // those
-            GGML_ASSERT(false);
-    }
-}
-
-// takes a set of possible pushdown stacks on a grammar, which are required to
-// be positioned at a character range (see `llama_grammar_advance_stack`), and
-// produces the N possible stacks if the given char is accepted at those
-// positions
-void llama_grammar_accept(
-        const std::vector<std::vector<llama_grammar_element>>         & rules,
-        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const uint32_t                                                  chr,
-        std::vector<std::vector<const llama_grammar_element *>>       & new_stacks) {
-    new_stacks.clear();
-
-    for (const auto & stack : stacks) {
-        if (stack.empty()) {
-            continue;
-        }
-
-        auto match = llama_grammar_match_char(stack.back(), chr);
-        if (match.first) {
-            const llama_grammar_element * pos = match.second;
-
-            // update top of stack to next element, if any
-            std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
-            if (!llama_grammar_is_end_of_sequence(pos)) {
-                new_stack.push_back(pos);
-            }
-            llama_grammar_advance_stack(rules, new_stack, new_stacks);
-        }
-    }
-}
-
-static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
-        const std::vector<std::vector<llama_grammar_element>>         & rules,
-        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const std::vector<llama_grammar_candidate>                    & candidates);
-
-static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
-        const std::vector<std::vector<llama_grammar_element>> & rules,
-        const std::vector<const llama_grammar_element *>      & stack,
-        const std::vector<llama_grammar_candidate>            & candidates) {
-    std::vector<llama_grammar_candidate> rejects;
-    rejects.reserve(candidates.size());
-
-    if (stack.empty()) {
-        for (const auto & tok : candidates) {
-            if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
-                rejects.push_back(tok);
-            }
-        }
-        return rejects;
-    }
-
-    const llama_grammar_element * stack_pos = stack.back();
-
-    std::vector<llama_grammar_candidate> next_candidates;
-    next_candidates.reserve(candidates.size());
-
-    for (const auto & tok : candidates) {
-        if (*tok.code_points == 0) {
-            // reached end of full codepoints in token, reject iff it ended in a partial sequence
-            // that cannot satisfy this position in grammar
-            if (tok.partial_utf8.n_remain != 0 &&
-                    !llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
-                rejects.push_back(tok);
-            }
-        } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
-            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
-        } else {
-            rejects.push_back(tok);
-        }
-    }
-
-    const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
-
-    // update top of stack to next element, if any
-    std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
-    if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
-        stack_after.push_back(stack_pos_after);
-    }
-    std::vector<std::vector<const llama_grammar_element *>> next_stacks;
-    llama_grammar_advance_stack(rules, stack_after, next_stacks);
-
-    auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
-    for (const auto & tok : next_rejects) {
-        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
-    }
-
-    return rejects;
-}
-
-static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
-        const std::vector<std::vector<llama_grammar_element>>         & rules,
-        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const std::vector<llama_grammar_candidate>                    & candidates) {
-    GGML_ASSERT(!stacks.empty()); // REVIEW
-
-    if (candidates.empty()) {
-        return std::vector<llama_grammar_candidate>();
-    }
-
-    auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
-
-    for (size_t i = 1, size = stacks.size(); i < size; ++i) {
-        rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
-    }
-    return rejects;
-}
-
-static bool llama_grammar_detect_left_recursion(
-        const std::vector<std::vector<llama_grammar_element>> & rules,
-        size_t                                                  rule_index,
-        std::vector<bool>                                     * rules_visited,
-        std::vector<bool>                                     * rules_in_progress,
-        std::vector<bool>                                     * rules_may_be_empty) {
-    if ((*rules_in_progress)[rule_index]) {
-        return true;
-    }
-
-    (*rules_in_progress)[rule_index] = true;
-
-    const std::vector<llama_grammar_element> & rule = rules[rule_index];
-
-    // First check if the rule might produce the empty string. This could be done combined with the second
-    // step but it's more readable as two steps.
-    bool at_rule_start = true;
-    for (size_t i = 0; i < rule.size(); i++) {
-        if (llama_grammar_is_end_of_sequence(&rule[i])) {
-            if (at_rule_start) {
-                (*rules_may_be_empty)[rule_index] = true;
-                break;
-            }
-            at_rule_start = true;
-        } else {
-            at_rule_start = false;
-        }
-    }
-
-    // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
-    // be empty)
-    bool recurse_into_nonterminal = true;
-    for (size_t i = 0; i < rule.size(); i++) {
-        if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
-            if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
-                return true;
-            }
-            if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
-                recurse_into_nonterminal = false;
-            }
-        } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
-            recurse_into_nonterminal = true;
-        } else {
-            recurse_into_nonterminal = false;
-        }
-    }
-
-    (*rules_in_progress)[rule_index] = false;
-    (*rules_visited)[rule_index]     = true;
-    return false;
-}
-
-//
-// grammar - external
-//
-
-struct llama_grammar * llama_grammar_init(
-        const llama_grammar_element ** rules,
-        size_t                         n_rules,
-        size_t                         start_rule_index) {
-    const llama_grammar_element * pos;
-
-    // copy rule definitions into vectors
-    std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
-    for (size_t i = 0; i < n_rules; i++) {
-        for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
-            vec_rules[i].push_back(*pos);
-        }
-        vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
-    }
-
-    // Check for left recursion
-    std::vector<bool> rules_visited(n_rules);
-    std::vector<bool> rules_in_progress(n_rules);
-    std::vector<bool> rules_may_be_empty(n_rules);
-    for (size_t i = 0; i < n_rules; i++) {
-        if (rules_visited[i]) {
-            continue;
-        }
-        if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
-            LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
-            return nullptr;
-        }
-    }
-
-    // loop over alternates of start rule to build initial stacks
-    std::vector<std::vector<const llama_grammar_element *>> stacks;
-    pos = vec_rules[start_rule_index].data();
-    do {
-        std::vector<const llama_grammar_element *> stack;
-        if (!llama_grammar_is_end_of_sequence(pos)) {
-            // if alternate is nonempty, add to stack
-            stack.push_back(pos);
-        }
-        llama_grammar_advance_stack(vec_rules, stack, stacks);
-        while (!llama_grammar_is_end_of_sequence(pos)) {
-            // scan to end of alternate def
-            pos++;
-        }
-        if (pos->type == LLAMA_GRETYPE_ALT) {
-            // there's another alternate def of this rule to process
-            pos++;
-        } else {
-            break;
-        }
-    } while (true);
-
-    // Important: vec_rules has to be moved here, not copied, because stacks contains
-    // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
-    // then the pointers would be invalidated when the local vec_rules goes out of scope.
-    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
-}
-
-void llama_grammar_free(struct llama_grammar * grammar) {
-    delete grammar;
-}
-
-struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
-    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
-
-    // redirect elements in stacks to point to new rules
-    for (size_t is = 0; is < result->stacks.size(); is++) {
-        for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
-            for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
-                for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
-                    if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
-                        result->stacks[is][ie] = &result->rules[ir0][ir1];
-                    }
-                }
-            }
-        }
-    }
-
-    return result;
-}
-
-// TODO: rename to llama_grammar_...
-void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
-    GGML_ASSERT(ctx);
-    int64_t t_start_sample_us = ggml_time_us();
-
-    bool allow_eog = false;
-    for (const auto & stack : grammar->stacks) {
-        if (stack.empty()) {
-            allow_eog = true;
-            break;
-        }
-    }
-
-    std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
-    candidates_decoded.reserve(candidates->size);
-
-    std::vector<llama_grammar_candidate> candidates_grammar;
-    candidates_grammar.reserve(candidates->size);
-
-    for (size_t i = 0; i < candidates->size; ++i) {
-        const llama_token   id    = candidates->data[i].id;
-        const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(id);
-
-        if (llama_token_is_eog(&ctx->model, id)) {
-            if (!allow_eog) {
-                candidates->data[i].logit = -INFINITY;
-            }
-        } else if (piece.empty() || piece[0] == 0) {
-            candidates->data[i].logit = -INFINITY;
-        } else {
-            candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
-            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
-        }
-    }
-
-    const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
-    for (const auto & reject : rejects) {
-        candidates->data[reject.index].logit = -INFINITY;
-    }
-
-    // TODO: change to t_grammar_us
-    ctx->sampling.t_sample_us += ggml_time_us() - t_start_sample_us;
-}
-
-static void llama_log_softmax(float * array, size_t size) {
-    float max_l = *std::max_element(array, array + size);
-    float sum = 0.f;
-    for (size_t i = 0; i < size; ++i) {
-        float p = expf(array[i] - max_l);
-        sum += p;
-        array[i] = p;
-    }
-
-    for (size_t i = 0; i < size; ++i) {
-        array[i] = logf(array[i] / sum);
-    }
-}
-
-void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    if (llama_token_is_eog(&ctx->model, token)) {
-        for (const auto & stack : grammar->stacks) {
-            if (stack.empty()) {
-                return;
-            }
-        }
-        GGML_ASSERT(false);
-    }
-
-    const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(token);
-
-    // Note terminating 0 in decoded string
-    const auto   decoded     = decode_utf8(piece, grammar->partial_utf8);
-    const auto & code_points = decoded.first;
-    std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
-    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
-        grammar->stacks = tmp_new_stacks;
-    }
-    grammar->partial_utf8 = decoded.second;
-    GGML_ASSERT(!grammar->stacks.empty());
-
-    ctx->sampling.t_sample_us += ggml_time_us() - t_start_sample_us;
-}
-
 //
 // quantization
 //

@@ -18750,10 +18143,18 @@ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
     return &ctx->model;
 }

+const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) {
+    return &ctx->model.vocab;
+}
+
 struct llama_sampling * llama_get_sampling(struct llama_context * ctx) {
     return &ctx->sampling;
 }

+struct llama_grammar * llama_get_grammar(struct llama_context * ctx) {
+    return &ctx->grammar;
+}
+
 uint32_t llama_n_ctx(const struct llama_context * ctx) {
     return ctx->cparams.n_ctx;
 }
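The new llama_get_vocab and llama_get_grammar accessors (alongside the existing llama_get_sampling) exist because struct llama_context is still defined inside llama.cpp, so the split-out translation units cannot dereference it directly. A sketch of the pattern as used in llama-grammar.cpp above:

    // inside llama-grammar.cpp: context internals are reached via accessors,
    // never via direct member access on the opaque llama_context
    const std::string & piece = llama_get_vocab(ctx)->cache_token_to_piece.at(id);
    llama_get_sampling(ctx)->t_sample_us += ggml_time_us() - t_start_sample_us;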
@@ -44,21 +44,24 @@ static bool test_build_grammar_fails(const std::string & grammar_str) {
     return grammar_fails;
 }

-static bool match_string(const std::string & input, llama_grammar* grammar) {
+static bool match_string(const std::string & input, llama_grammar * grammar) {
     auto decoded = decode_utf8(input, {});

     const auto & code_points = decoded.first;

+    llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
+
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        auto prev_stacks = grammar->stacks;
-        llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
-        if (grammar->stacks.empty()) {
+        const llama_grammar_rules  & prev_rules  = llama_grammar_get_rules (grammar);
+        const llama_grammar_stacks   prev_stacks = llama_grammar_get_stacks(grammar); // copy
+        llama_grammar_accept(prev_rules, prev_stacks, *it, cur_stacks);
+        if (cur_stacks.empty()) {
             // no stacks means that the grammar failed to match at this point
             return false;
         }
     }

-    for (const auto & stack : grammar->stacks) {
+    for (const auto & stack : cur_stacks) {
         if (stack.empty()) {
             // An empty stack means that the grammar has been completed
             return true;

@@ -75,7 +78,9 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
     auto grammar = build_grammar(grammar_str);

     // Save the original grammar stacks so that we can reset after every new string we want to test
-    auto original_stacks = grammar->stacks;
+    const llama_grammar_stacks original_stacks = llama_grammar_get_stacks(grammar);
+
+    llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);

     fprintf(stderr, "  🔵 Valid strings:\n");

@@ -112,7 +117,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
         assert(matched);

         // Reset the grammar stacks
-        grammar->stacks = original_stacks;
+        cur_stacks = original_stacks;
     }

     fprintf(stderr, "  🟠 Invalid strings:\n");

@@ -132,7 +137,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
         assert(!matched);

         // Reset the grammar stacks
-        grammar->stacks = original_stacks;
+        cur_stacks = original_stacks;
     }

     // Clean up allocated memory
@@ -2,10 +2,12 @@
 #undef NDEBUG
 #endif

-#include "llama.cpp" // TODO: not great
+#define LLAMA_API_INTERNAL
+#include "llama.h"
 #include "grammar-parser.h"

 #include <cassert>
+#include <stdexcept>

 int main()
 {

@@ -112,10 +114,10 @@ int main()
         }
     }

-    llama_grammar *grammar = NULL;
+    llama_grammar * grammar = NULL;
     std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-    grammar = llama_grammar_init(
-        grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+
+    grammar = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
     if (grammar == nullptr)
     {
         throw std::runtime_error("Failed to initialize llama_grammar");

@@ -172,7 +174,7 @@ int main()
     }};

     auto index = 0;
-    for (auto stack : grammar->stacks)
+    for (auto stack : llama_grammar_get_stacks(grammar))
     {
         // compare stack to expected_stack
         for (uint32_t i = 0; i < stack.size(); i++)

@@ -374,13 +376,13 @@ int main()
         },
     };

-    std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[0], next_candidates);
+    std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[0], next_candidates);

     std::vector<std::vector<llama_grammar_candidate>> all_rejects;

-    for (std::size_t count = 0; count < grammar->stacks.size(); ++count)
+    for (std::size_t count = 0; count < llama_grammar_get_stacks(grammar).size(); ++count)
     {
-        rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[count], next_candidates);
+        rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[count], next_candidates);
         all_rejects.push_back(rejects);
     }

@@ -401,6 +403,6 @@ int main()
         delete[] candidate.code_points;
         candidate.code_points = nullptr;
     }
-    delete grammar;
+    llama_grammar_free(grammar);
     return 0;
 }