grammars: move token caches to llama_context
commit d41f314740
parent 9f13623149
2 changed files with 16 additions and 16 deletions
llama.cpp | 27 +++++++++++++++++-----------

@@ -2325,6 +2325,11 @@ struct llama_context {
     // control vectors
     struct llama_control_vector cvec;
 
+    // caching token pieces & their decoded codepoints.
+    std::vector<std::string> token_pieces;
+    std::vector<std::pair<std::vector<uint32_t>,
+                          llama_partial_utf8>> token_codepoints;
+
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
@@ -13051,7 +13056,7 @@ struct llama_grammar * llama_grammar_init(
         }
     } while (true);
 
-    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {}, {}, {} };
+    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
 }
 
 void llama_grammar_free(struct llama_grammar * grammar) {
@@ -13059,7 +13064,7 @@ void llama_grammar_free(struct llama_grammar * grammar) {
 }
 
 struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
-    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8, grammar->token_pieces, grammar->token_codepoints };
+    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
 
     // redirect elements in stacks to point to new rules
     for (size_t is = 0; is < result->stacks.size(); is++) {
@@ -13552,14 +13557,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
        }
    }
 
-    if (grammar->token_codepoints.empty()) {
+    if (ctx->token_codepoints.empty()) {
         auto n_vocab = llama_n_vocab(llama_get_model(ctx));
-        grammar->token_codepoints.resize(n_vocab);
-        grammar->token_pieces.resize(n_vocab);
+        ctx->token_codepoints.resize(n_vocab);
+        ctx->token_pieces.resize(n_vocab);
         for (llama_token id = 0; id < n_vocab; ++id) {
             const std::string piece = llama_token_to_piece(ctx, id, false);
-            grammar->token_pieces[id] = piece;
-            grammar->token_codepoints[id] = decode_utf8(piece, {0, 0});
+            ctx->token_pieces[id] = piece;
+            ctx->token_codepoints[id] = decode_utf8(piece, {0, 0});
         }
     }
 
@@ -13572,7 +13577,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const auto & piece = grammar->token_pieces[id];
+        const auto & piece = ctx->token_pieces[id];
         if (llama_token_is_eog(&ctx->model, id)) {
             if (!allow_eog) {
                 candidates->data[i].logit = -INFINITY;
@@ -13580,7 +13585,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else if (grammar->partial_utf8.n_remain == 0){
-            const auto & decoded = grammar->token_codepoints.at(id);
+            const auto & decoded = ctx->token_codepoints.at(id);
             candidates_grammar.push_back({ i, decoded.first.data(), decoded.second });
         } else {
             candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
@@ -13778,11 +13783,11 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const auto & piece = grammar->token_pieces.at(token);
+    const auto & piece = ctx->token_pieces.at(token);
 
     // Note terminating 0 in decoded string
     const auto decoded = grammar->partial_utf8.n_remain == 0
-        ? grammar->token_codepoints[token]
+        ? ctx->token_codepoints[token]
         : decode_utf8(piece, grammar->partial_utf8);
     const auto & code_points = decoded.first;
     std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
llama.h | 5 -----

@@ -1099,11 +1099,6 @@ struct llama_grammar {
 
     // buffer for partially generated UTF-8 sequence from accepted tokens
     llama_partial_utf8 partial_utf8;
-
-    // caching the token pieces & their decoded codepoints.
-    std::vector<std::string> token_pieces;
-    std::vector<std::pair<std::vector<uint32_t>,
-                          llama_partial_utf8>> token_codepoints;
 };
 
 struct llama_grammar_candidate {
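For context, below is a minimal standalone sketch of the caching pattern this commit applies: the token pieces and their decoded codepoints are built lazily, once per context, and then shared by every grammar created against that context, instead of being rebuilt inside each llama_grammar. This is an illustration only, not llama.cpp code; `context`, `partial_utf8_t`, `token_to_piece`, and `decode_utf8_sketch` are simplified stand-ins for the real types and helpers.

    #include <cstdint>
    #include <string>
    #include <utility>
    #include <vector>

    struct partial_utf8_t {            // stand-in for llama_partial_utf8
        uint32_t value    = 0;         // bits of an unfinished UTF-8 sequence
        int      n_remain = 0;         // continuation bytes still expected
    };

    using codepoints_t = std::pair<std::vector<uint32_t>, partial_utf8_t>;

    struct context {                   // stand-in for llama_context
        std::vector<std::string>  token_pieces;     // empty until first use
        std::vector<codepoints_t> token_codepoints; // likewise
    };

    // stand-in for llama_token_to_piece(ctx, id, false)
    static std::string token_to_piece(int id) {
        return "piece_" + std::to_string(id);
    }

    // stand-in for decode_utf8(): widens bytes to uint32_t and appends the
    // terminating 0 that the real decoder also emits
    static codepoints_t decode_utf8_sketch(const std::string & piece, partial_utf8_t st) {
        std::vector<uint32_t> cps(piece.begin(), piece.end());
        cps.push_back(0);
        return { std::move(cps), st };
    }

    // mirrors the `if (ctx->token_codepoints.empty()) { ... }` block in
    // llama_sample_grammar: both caches are filled once per context
    static void ensure_token_caches(context & ctx, int n_vocab) {
        if (!ctx.token_codepoints.empty()) {
            return; // already built; later grammars take this fast path
        }
        ctx.token_codepoints.resize(n_vocab);
        ctx.token_pieces.resize(n_vocab);
        for (int id = 0; id < n_vocab; ++id) {
            const std::string piece = token_to_piece(id);
            ctx.token_pieces[id]     = piece;
            ctx.token_codepoints[id] = decode_utf8_sketch(piece, {});
        }
    }

    int main() {
        context ctx;
        ensure_token_caches(ctx, 32000); // first grammar use pays the decode cost
        ensure_token_caches(ctx, 32000); // subsequent uses are no-ops
        return 0;
    }

A side effect visible in the diff: the initializer lists in llama_grammar_init and llama_grammar_copy shrink from five elements to three, since llama_grammar no longer carries the caches. Copying a grammar therefore no longer duplicates per-vocab data, and grammars sharing a context cannot hold diverging copies of it.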