llama : remove llama_constraint

ggml-ci

This commit is contained in:
parent: a2d8b27a4b
commit: 0b6dfcebb2

19 changed files with 1020 additions and 1055 deletions
@@ -841,15 +841,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.defrag_thold = std::stof(argv[i]);
         return true;
     }
-    if (arg == "--samplers" || arg == "--constraints") {
+    if (arg == "--samplers") {
         CHECK_ARG
-        const auto constraint_names = string_split(argv[i], ';');
-        sparams.constraints = gpt_constraint_types_from_names(constraint_names, true);
+        const auto sampler_names = string_split(argv[i], ';');
+        sparams.samplers = gpt_sampler_types_from_names(sampler_names, true);
         return true;
     }
     if (arg == "--sampling-seq") {
         CHECK_ARG
-        sparams.constraints = gpt_constraint_types_from_chars(argv[i]);
+        sparams.samplers = gpt_sampler_types_from_chars(argv[i]);
         return true;
     }
     if (arg == "--top-p") {

@@ -1706,13 +1706,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     const auto & sparams = params.sparams;

-    std::string constraint_type_chars;
-    std::string constraint_type_names;
-    for (const auto & constraint : sparams.constraints) {
-        constraint_type_chars += gpt_constraint_type_to_chr(constraint);
-        constraint_type_names += gpt_constraint_type_to_str(constraint) + ";";
+    std::string sampler_type_chars;
+    std::string sampler_type_names;
+    for (const auto & sampler : sparams.samplers) {
+        sampler_type_chars += gpt_sampler_type_to_chr(sampler);
+        sampler_type_names += gpt_sampler_type_to_str(sampler) + ";";
     }
-    constraint_type_names.pop_back();
+    sampler_type_names.pop_back();

     struct option_info {
         LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5)

@@ -1826,9 +1826,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "sampling" });
     options.push_back({ "*", "-s,    --seed SEED",             "RNG seed (default: %d, use random seed for < 0)", sparams.seed });
     options.push_back({ "*", "       --samplers SAMPLERS",     "samplers that will be used for generation in the order, separated by \';\'\n"
-                                                               "(default: %s)", constraint_type_names.c_str() });
+                                                               "(default: %s)", sampler_type_names.c_str() });
     options.push_back({ "*", "       --sampling-seq SEQUENCE",
-                                                               "simplified sequence for samplers that will be used (default: %s)", constraint_type_chars.c_str() });
+                                                               "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() });
     options.push_back({ "*", "       --ignore-eos",            "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" });
     options.push_back({ "*", "       --penalize-nl",           "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" });
     options.push_back({ "*", "       --temp T",                "temperature (default: %.1f)", (double)sparams.temp });

@@ -2,14 +2,127 @@

 #include "common.h"

+// the ring buffer works similarly to std::deque, but with a fixed capacity
+// TODO: deduplicate with llama-impl.h
+template<typename T>
+struct ring_buffer {
+    ring_buffer(size_t cap) : capacity(cap), data(cap) {}
+
+    T & front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    const T & front() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    T & back() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[pos];
+    }
+
+    const T & back() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[pos];
+    }
+
+    void push_back(const T & value) {
+        if (sz == capacity) {
+            // advance the start when buffer is full
+            first = (first + 1) % capacity;
+        } else {
+            sz++;
+        }
+        data[pos] = value;
+        pos = (pos + 1) % capacity;
+    }
+
+    T pop_front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        T value = data[first];
+        first = (first + 1) % capacity;
+        sz--;
+        return value;
+    }
+
+    const T & rat(size_t i) const {
+        if (i >= sz) {
+            throw std::runtime_error("ring buffer: index out of bounds");
+        }
+        return data[(first + sz - i - 1) % capacity];
+    }
+
+    std::vector<T> to_vector() const {
+        std::vector<T> result;
+        result.reserve(sz);
+        for (size_t i = 0; i < sz; i++) {
+            result.push_back(data[(first + i) % capacity]);
+        }
+        return result;
+    }
+
+    void clear() {
+        // here only reset the status of the buffer
+        sz = 0;
+        first = 0;
+        pos = 0;
+    }
+
+    bool empty() const {
+        return sz == 0;
+    }
+
+    size_t size() const {
+        return sz;
+    }
+
+    size_t capacity = 0;
+    size_t sz = 0;
+    size_t first = 0;
+    size_t pos = 0;
+    std::vector<T> data;
+};
+
 struct gpt_sampler {
     gpt_sampler_params params;

-    struct llama_constraint * bias;
-    struct llama_constraint * pnlt;
-    struct llama_constraint * grmr;
+    struct llama_sampler * bias;
+    struct llama_sampler * pnlt;
+    struct llama_sampler * grmr;

-    struct llama_sampler * smpl;
+    struct llama_sampler * chain;

+    ring_buffer<llama_token> prev;
+
+    std::vector<llama_token_data> cur;
+
+    llama_token_data_array cur_p;
+
+    void set_logits(struct llama_context * ctx, int idx) {
+        const auto * logits = llama_get_logits_ith(ctx, idx);
+
+        const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+        cur.resize(n_vocab);
+
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+        }
+
+        cur_p = { cur.data(), cur.size(), LLAMA_TOKEN_NULL, false };
+    }
 };

 std::string gpt_sampler_params::print() const {
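The ring_buffer added above backs the sampler's token history. A minimal usage sketch (the values are illustrative and not from this commit; llama_token is just an integer token id):

    // keep the last 4 accepted tokens; once full, push_back() overwrites the oldest
    ring_buffer<llama_token> prev(4);

    for (llama_token id : { 10, 11, 12, 13, 14 }) {
        prev.push_back(id); // the first token (10) is evicted by the fifth push
    }

    // rat(i) is a reverse index: rat(0) is the newest element
    llama_token newest = prev.rat(0); // 14
    llama_token oldest = prev.rat(3); // 11

    // to_vector() returns { 11, 12, 13, 14 } in insertion order
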
@@ -29,28 +142,26 @@ std::string gpt_sampler_params::print() const {
 std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
     std::string result = "\tlogits";

-    for (int i = 0; i < llama_sampler_n_constraints(gsmpl->smpl); i++) {
-        const auto * cnstr = llama_sampler_constraint_get(gsmpl->smpl, i);
-        result += std::string(" -> ") + llama_constraint_name(cnstr) + " ";
+    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
+        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
+        result += std::string(" -> ") + llama_sampler_name(smpl) + " ";
     }

     return result;
 }

 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
-    llama_sampler_params lparams = llama_sampler_default_params();
+    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

-    lparams.seed   = params.seed;
-    lparams.n_prev = params.n_prev;
-    lparams.type   = params.temp <= 0.0f ? LLAMA_SAMPLER_TYPE_GREEDY : LLAMA_SAMPLER_TYPE_DIST;
+    lparams.no_timing = false;

     auto * result = new gpt_sampler {
         /* .params = */ params,
-        /* .bias = */ llama_constraint_init_logit_bias(
+        /* .bias = */ llama_sampler_init_logit_bias(
             model,
             params.logit_bias.size(),
             params.logit_bias.data()),
-        /* .pnlt = */ llama_constraint_init_penalties(
+        /* .pnlt = */ llama_sampler_init_penalties(
             model,
             params.penalty_last_n,
             params.penalty_repeat,

@@ -58,45 +169,53 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
             params.penalty_present,
             params.penalize_nl,
             params.ignore_eos),
-        /* .grmr = */ llama_constraint_init_grammar(model, params.grammar.c_str(), "root"),
-        /* .smpl = */ llama_sampler_init(model, lparams)
+        /* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
+        /* .chain = */ llama_sampler_chain_init(lparams),
+        /* .prev = */ ring_buffer<llama_token>(params.n_prev),
+        /* .cur = */ {},
+        /* .cur_p = */ {},
     };

     if (params.temp > 0.0f) {
         if (params.mirostat == 0) {
-            for (const auto & cnstr : params.constraints) {
+            for (const auto & cnstr : params.samplers) {
                 switch (cnstr) {
-                    case GPT_CONSTRAINT_TYPE_TOP_K:
-                        llama_sampler_constraint_add(result->smpl, llama_constraint_init_top_k    (params.top_k));
+                    case GPT_SAMPLER_TYPE_TOP_K:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
                         break;
-                    case GPT_CONSTRAINT_TYPE_TOP_P:
-                        llama_sampler_constraint_add(result->smpl, llama_constraint_init_top_p    (params.top_p, params.min_keep));
+                    case GPT_SAMPLER_TYPE_TOP_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
                         break;
-                    case GPT_CONSTRAINT_TYPE_MIN_P:
-                        llama_sampler_constraint_add(result->smpl, llama_constraint_init_min_p    (params.min_p, params.min_keep));
+                    case GPT_SAMPLER_TYPE_MIN_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
                         break;
-                    case GPT_CONSTRAINT_TYPE_TFS_Z:
-                        llama_sampler_constraint_add(result->smpl, llama_constraint_init_tail_free(params.tfs_z, params.min_keep));
+                    case GPT_SAMPLER_TYPE_TFS_Z:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
                         break;
-                    case GPT_CONSTRAINT_TYPE_TYPICAL_P:
-                        llama_sampler_constraint_add(result->smpl, llama_constraint_init_typical  (params.typ_p, params.min_keep));
+                    case GPT_SAMPLER_TYPE_TYPICAL_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
                         break;
-                    case GPT_CONSTRAINT_TYPE_TEMPERATURE:
-                        llama_sampler_constraint_add(result->smpl, llama_constraint_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    case GPT_SAMPLER_TYPE_TEMPERATURE:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                         break;
                     default:
-                        GGML_ASSERT(false && "unknown constraint type");
+                        GGML_ASSERT(false && "unknown sampler type");
                 }
             }
         } else if (params.mirostat == 1) {
-            llama_sampler_constraint_add(result->smpl, llama_constraint_init_temp(params.temp));
-            llama_sampler_constraint_add(result->smpl, llama_constraint_init_mirostat(model, params.mirostat_tau, params.mirostat_eta));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(model, params.mirostat_tau, params.mirostat_eta));
         } else if (params.mirostat == 2) {
-            llama_sampler_constraint_add(result->smpl, llama_constraint_init_temp(params.temp));
-            llama_sampler_constraint_add(result->smpl, llama_constraint_init_mirostat_v2(params.mirostat_tau, params.mirostat_eta));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.mirostat_tau, params.mirostat_eta));
         } else {
             GGML_ASSERT(false && "unknown mirostat version");
         }
+        llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
+        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
     } else {
+        llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
+        llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
     }

     return result;
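For reference, the construction above boils down to the following pattern when the new llama_sampler API is used directly. This is a sketch with illustrative parameter values, not code from the commit:

    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

    llama_sampler * chain = llama_sampler_chain_init(lparams);

    // order matters: each sampler transforms the candidate array left by the previous one
    llama_sampler_chain_add(chain, llama_sampler_init_top_k  (40));
    llama_sampler_chain_add(chain, llama_sampler_init_top_p  (0.95f, 1));
    llama_sampler_chain_add(chain, llama_sampler_init_temp   (0.80f));
    llama_sampler_chain_add(chain, llama_sampler_init_softmax());
    llama_sampler_chain_add(chain, llama_sampler_init_dist   (1234)); // final, seeded token pick

    // the chain takes ownership of the added samplers, so freeing it frees them as well
    llama_sampler_free(chain);
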
@@ -104,11 +223,11 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st

 void gpt_sampler_free(struct gpt_sampler * gsmpl) {
     if (gsmpl) {
-        llama_constraint_free(gsmpl->bias);
-        llama_constraint_free(gsmpl->pnlt);
-        llama_constraint_free(gsmpl->grmr);
+        llama_sampler_free(gsmpl->bias);
+        llama_sampler_free(gsmpl->pnlt);
+        llama_sampler_free(gsmpl->grmr);

-        llama_sampler_free(gsmpl->smpl);
+        llama_sampler_free(gsmpl->chain);

         delete gsmpl;
     }
@@ -117,69 +236,66 @@ void gpt_sampler_free(struct gpt_sampler * gsmpl) {
 struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
     return new gpt_sampler {
         /* .params = */ gsmpl->params,
-        /* .bias = */ llama_constraint_clone(gsmpl->bias),
-        /* .pnlt = */ llama_constraint_clone(gsmpl->pnlt),
-        /* .grmr = */ llama_constraint_clone(gsmpl->grmr),
-        /* .smpl = */ llama_sampler_clone   (gsmpl->smpl)
+        /* .bias = */ llama_sampler_clone(gsmpl->bias),
+        /* .pnlt = */ llama_sampler_clone(gsmpl->pnlt),
+        /* .grmr = */ llama_sampler_clone(gsmpl->grmr),
+        /* .chain = */ llama_sampler_clone(gsmpl->chain),
+        /* .prev = */ gsmpl->prev,
+        /* .cur = */ gsmpl->cur,
+        /* .cur_p = */ gsmpl->cur_p,
     };
 }

 void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool apply_grammar) {
     if (apply_grammar) {
-        llama_constraint_accept(gsmpl->grmr, token);
+        llama_sampler_accept(gsmpl->grmr, token);
     }

-    llama_sampler_accept(gsmpl->smpl, token);
+    llama_sampler_accept(gsmpl->chain, token);
+
+    gsmpl->prev.push_back(token);
 }

 void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
-    llama_constraint_reset(gsmpl->grmr);
+    llama_sampler_reset(gsmpl->grmr);

-    llama_sampler_reset(gsmpl->smpl);
-}
-
-void gpt_sampler_set_logits(struct gpt_sampler * gsmpl, const float * logits) {
-    llama_sampler_set_logits(gsmpl->smpl, logits);
+    llama_sampler_reset(gsmpl->chain);
 }

 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
-    return llama_sampler_get_candidates(gsmpl->smpl);
+    return &gsmpl->cur_p;
 }

 llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
-    return llama_sampler_last(gsmpl->smpl);
+    return gsmpl->prev.rat(0);
 }

-void gpt_print_timings(struct llama_context * ctx, struct gpt_sampler * gsmpl) {
-    llama_print_timings(ctx, gsmpl ? gsmpl->smpl : nullptr);
-}
-
-llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_token_data_array * cur_p) {
-    return llama_sampler_sample(gsmpl->smpl, cur_p);
+void gpt_print_timings(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
+    llama_print_timings(ctx, gsmpl ? gsmpl->chain : nullptr);
 }

 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
-    auto & bias = gsmpl->bias;
-    auto & pnlt = gsmpl->pnlt;
-    auto & grmr = gsmpl->grmr;
-    auto & smpl = gsmpl->smpl;
+    auto & bias  = gsmpl->bias;
+    auto & pnlt  = gsmpl->pnlt;
+    auto & grmr  = gsmpl->grmr;
+    auto & chain = gsmpl->chain;

-    const auto * logits = llama_get_logits_ith(ctx, idx);
+    gsmpl->set_logits(ctx, idx);

-    llama_sampler_set_logits(smpl, logits);
+    auto & cur_p = gsmpl->cur_p;

-    auto * cur_p = llama_sampler_get_candidates(smpl);
-
-    llama_constraint_apply(bias, cur_p);
-    llama_constraint_apply(pnlt, cur_p);
+    llama_sampler_apply(bias, &cur_p);
+    llama_sampler_apply(pnlt, &cur_p);

     if (grammar_first) {
-        llama_constraint_apply(grmr, cur_p);
+        llama_sampler_apply(grmr, &cur_p);
     }

-    llama_sampler_apply(smpl, cur_p);
+    llama_sampler_apply(chain, &cur_p);

-    const llama_token id = llama_sampler_sample(smpl, cur_p);
+    const llama_token id = cur_p.data[cur_p.selected].id;
+
+    GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - check your sampling configuration");

     if (grammar_first) {
         return id;
@@ -188,9 +304,9 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
     // check if it the sampled token fits the grammar
     {
         llama_token_data single_token_data = { id, 1.0f, 0.0f };
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
+        llama_token_data_array single_token_data_array = { &single_token_data, 1, LLAMA_TOKEN_NULL, false };

-        llama_constraint_apply(grmr, &single_token_data_array);
+        llama_sampler_apply(grmr, &single_token_data_array);

         // check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
         const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
@@ -199,28 +315,22 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
         }
     }

-    // if the token is not valid, sample again, first apply the grammar constraints and then sample
-    llama_sampler_set_logits(smpl, logits);
+    // if the token is not valid, sample again, first apply the grammar samplers and then sample
+    gsmpl->set_logits(ctx, idx);

-    llama_constraint_apply(bias, cur_p);
-    llama_constraint_apply(pnlt, cur_p);
-    llama_constraint_apply(grmr, cur_p);
+    llama_sampler_apply(bias, &cur_p);
+    llama_sampler_apply(pnlt, &cur_p);
+    llama_sampler_apply(grmr, &cur_p);

-    llama_sampler_apply(smpl, cur_p);
+    llama_sampler_apply(chain, &cur_p);

-    return llama_sampler_sample(smpl, cur_p);
-}
+    GGML_ASSERT(cur_p.data[cur_p.selected].id != LLAMA_TOKEN_NULL && "null token in the sampling history - check your sampling configuration");

-void gpt_sampler_apply_grammar(struct gpt_sampler * gsmpl, llama_token_data_array * cur_p) {
-    GGML_ASSERT(cur_p != nullptr);
-
-    llama_constraint_apply(gsmpl->grmr, cur_p);
+    return cur_p.data[cur_p.selected].id;
 }

 std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
-    auto & smpl = gsmpl->smpl;
-
-    n = std::min(n, llama_sampler_n_prev(smpl));
+    n = std::min(n, (int) gsmpl->prev.size());

     if (n <= 0) {
         return "";
@@ -230,7 +340,7 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
     result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab

     for (int i = n - 1; i >= 0; i--) {
-        const llama_token id = llama_sampler_prev(smpl, i);
+        const llama_token id = gsmpl->prev.rat(i);

         GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");

@@ -240,95 +350,95 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
     return result;
 }

-char gpt_constraint_type_to_chr(enum gpt_constraint_type cnstr) {
+char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
     switch (cnstr) {
-        case GPT_CONSTRAINT_TYPE_TOP_K:       return 'k';
-        case GPT_CONSTRAINT_TYPE_TFS_Z:       return 'f';
-        case GPT_CONSTRAINT_TYPE_TYPICAL_P:   return 'y';
-        case GPT_CONSTRAINT_TYPE_TOP_P:       return 'p';
-        case GPT_CONSTRAINT_TYPE_MIN_P:       return 'm';
-        case GPT_CONSTRAINT_TYPE_TEMPERATURE: return 't';
+        case GPT_SAMPLER_TYPE_TOP_K:       return 'k';
+        case GPT_SAMPLER_TYPE_TFS_Z:       return 'f';
+        case GPT_SAMPLER_TYPE_TYPICAL_P:   return 'y';
+        case GPT_SAMPLER_TYPE_TOP_P:       return 'p';
+        case GPT_SAMPLER_TYPE_MIN_P:       return 'm';
+        case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
         default : return '?';
     }
 }

-std::string gpt_constraint_type_to_str(enum gpt_constraint_type cnstr) {
+std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
     switch (cnstr) {
-        case GPT_CONSTRAINT_TYPE_TOP_K:       return "top_k";
-        case GPT_CONSTRAINT_TYPE_TFS_Z:       return "tfs_z";
-        case GPT_CONSTRAINT_TYPE_TYPICAL_P:   return "typ_p";
-        case GPT_CONSTRAINT_TYPE_TOP_P:       return "top_p";
-        case GPT_CONSTRAINT_TYPE_MIN_P:       return "min_p";
-        case GPT_CONSTRAINT_TYPE_TEMPERATURE: return "temperature";
+        case GPT_SAMPLER_TYPE_TOP_K:       return "top_k";
+        case GPT_SAMPLER_TYPE_TFS_Z:       return "tfs_z";
+        case GPT_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
+        case GPT_SAMPLER_TYPE_TOP_P:       return "top_p";
+        case GPT_SAMPLER_TYPE_MIN_P:       return "min_p";
+        case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
         default : return "";
     }
 }

-std::vector<gpt_constraint_type> gpt_constraint_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
-    std::unordered_map<std::string, gpt_constraint_type> constraint_canonical_name_map {
-        { "top_k",       GPT_CONSTRAINT_TYPE_TOP_K },
-        { "top_p",       GPT_CONSTRAINT_TYPE_TOP_P },
-        { "typ_p",       GPT_CONSTRAINT_TYPE_TYPICAL_P },
-        { "min_p",       GPT_CONSTRAINT_TYPE_MIN_P },
-        { "tfs_z",       GPT_CONSTRAINT_TYPE_TFS_Z },
-        { "temperature", GPT_CONSTRAINT_TYPE_TEMPERATURE },
+std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+    std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
+        { "top_k",       GPT_SAMPLER_TYPE_TOP_K },
+        { "top_p",       GPT_SAMPLER_TYPE_TOP_P },
+        { "typ_p",       GPT_SAMPLER_TYPE_TYPICAL_P },
+        { "min_p",       GPT_SAMPLER_TYPE_MIN_P },
+        { "tfs_z",       GPT_SAMPLER_TYPE_TFS_Z },
+        { "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
     };

-    // since constraints names are written multiple ways
+    // since samplers names are written multiple ways
     // make it ready for both system names and input names
-    std::unordered_map<std::string, gpt_constraint_type> constraint_alt_name_map {
-        { "top-k",     GPT_CONSTRAINT_TYPE_TOP_K },
-        { "top-p",     GPT_CONSTRAINT_TYPE_TOP_P },
-        { "nucleus",   GPT_CONSTRAINT_TYPE_TOP_P },
-        { "typical-p", GPT_CONSTRAINT_TYPE_TYPICAL_P },
-        { "typical",   GPT_CONSTRAINT_TYPE_TYPICAL_P },
-        { "typ-p",     GPT_CONSTRAINT_TYPE_TYPICAL_P },
-        { "typ",       GPT_CONSTRAINT_TYPE_TYPICAL_P },
-        { "min-p",     GPT_CONSTRAINT_TYPE_MIN_P },
-        { "tfs-z",     GPT_CONSTRAINT_TYPE_TFS_Z },
-        { "tfs",       GPT_CONSTRAINT_TYPE_TFS_Z },
-        { "temp",      GPT_CONSTRAINT_TYPE_TEMPERATURE },
+    std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
+        { "top-k",     GPT_SAMPLER_TYPE_TOP_K },
+        { "top-p",     GPT_SAMPLER_TYPE_TOP_P },
+        { "nucleus",   GPT_SAMPLER_TYPE_TOP_P },
+        { "typical-p", GPT_SAMPLER_TYPE_TYPICAL_P },
+        { "typical",   GPT_SAMPLER_TYPE_TYPICAL_P },
+        { "typ-p",     GPT_SAMPLER_TYPE_TYPICAL_P },
+        { "typ",       GPT_SAMPLER_TYPE_TYPICAL_P },
+        { "min-p",     GPT_SAMPLER_TYPE_MIN_P },
+        { "tfs-z",     GPT_SAMPLER_TYPE_TFS_Z },
+        { "tfs",       GPT_SAMPLER_TYPE_TFS_Z },
+        { "temp",      GPT_SAMPLER_TYPE_TEMPERATURE },
     };

-    std::vector<gpt_constraint_type> constraints;
-    constraints.reserve(names.size());
+    std::vector<gpt_sampler_type> samplers;
+    samplers.reserve(names.size());

     for (const auto & name : names) {
-        auto constraint = constraint_canonical_name_map.find(name);
-        if (constraint != constraint_canonical_name_map.end()) {
-            constraints.push_back(constraint->second);
+        auto sampler = sampler_canonical_name_map.find(name);
+        if (sampler != sampler_canonical_name_map.end()) {
+            samplers.push_back(sampler->second);
         } else {
             if (allow_alt_names) {
-                constraint = constraint_alt_name_map.find(name);
-                if (constraint != constraint_alt_name_map.end()) {
-                    constraints.push_back(constraint->second);
+                sampler = sampler_alt_name_map.find(name);
+                if (sampler != sampler_alt_name_map.end()) {
+                    samplers.push_back(sampler->second);
                 }
             }
         }
     }

-    return constraints;
+    return samplers;
 }

-std::vector<gpt_constraint_type> gpt_constraint_types_from_chars(const std::string & chars) {
-    std::unordered_map<char, gpt_constraint_type> constraint_name_map {
-        { gpt_constraint_type_to_chr(GPT_CONSTRAINT_TYPE_TOP_K),       GPT_CONSTRAINT_TYPE_TOP_K },
-        { gpt_constraint_type_to_chr(GPT_CONSTRAINT_TYPE_TFS_Z),       GPT_CONSTRAINT_TYPE_TFS_Z },
-        { gpt_constraint_type_to_chr(GPT_CONSTRAINT_TYPE_TYPICAL_P),   GPT_CONSTRAINT_TYPE_TYPICAL_P },
-        { gpt_constraint_type_to_chr(GPT_CONSTRAINT_TYPE_TOP_P),       GPT_CONSTRAINT_TYPE_TOP_P },
-        { gpt_constraint_type_to_chr(GPT_CONSTRAINT_TYPE_MIN_P),       GPT_CONSTRAINT_TYPE_MIN_P },
-        { gpt_constraint_type_to_chr(GPT_CONSTRAINT_TYPE_TEMPERATURE), GPT_CONSTRAINT_TYPE_TEMPERATURE }
+std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
+    std::unordered_map<char, gpt_sampler_type> sampler_name_map {
+        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K),       GPT_SAMPLER_TYPE_TOP_K },
+        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z),       GPT_SAMPLER_TYPE_TFS_Z },
+        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P),   GPT_SAMPLER_TYPE_TYPICAL_P },
+        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P),       GPT_SAMPLER_TYPE_TOP_P },
+        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P),       GPT_SAMPLER_TYPE_MIN_P },
+        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
     };

-    std::vector<gpt_constraint_type> constraints;
-    constraints.reserve(chars.size());
+    std::vector<gpt_sampler_type> samplers;
+    samplers.reserve(chars.size());

     for (const auto & c : chars) {
-        const auto constraint = constraint_name_map.find(c);
-        if (constraint != constraint_name_map.end()) {
-            constraints.push_back(constraint->second);
+        const auto sampler = sampler_name_map.find(c);
+        if (sampler != sampler_name_map.end()) {
+            samplers.push_back(sampler->second);
         }
     }

-    return constraints;
+    return samplers;
 }

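These helpers back the --samplers and --sampling-seq flags parsed in the first hunks above. A sketch of the equivalence between the long-name and single-character forms (illustrative, using only names present in the maps above):

    // "--samplers top_k;temperature" and "--sampling-seq kt" select the same sequence:
    const auto a = gpt_sampler_types_from_names({"top_k", "temperature"}, /*allow_alt_names =*/ true);
    const auto b = gpt_sampler_types_from_chars("kt");
    // both yield { GPT_SAMPLER_TYPE_TOP_K, GPT_SAMPLER_TYPE_TEMPERATURE }
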
@@ -5,14 +5,14 @@
 #include <string>
 #include <vector>

-enum gpt_constraint_type {
-    GPT_CONSTRAINT_TYPE_NONE        = 0,
-    GPT_CONSTRAINT_TYPE_TOP_K       = 1,
-    GPT_CONSTRAINT_TYPE_TOP_P       = 2,
-    GPT_CONSTRAINT_TYPE_MIN_P       = 3,
-    GPT_CONSTRAINT_TYPE_TFS_Z       = 4,
-    GPT_CONSTRAINT_TYPE_TYPICAL_P   = 5,
-    GPT_CONSTRAINT_TYPE_TEMPERATURE = 6,
+enum gpt_sampler_type {
+    GPT_SAMPLER_TYPE_NONE        = 0,
+    GPT_SAMPLER_TYPE_TOP_K       = 1,
+    GPT_SAMPLER_TYPE_TOP_P       = 2,
+    GPT_SAMPLER_TYPE_MIN_P       = 3,
+    GPT_SAMPLER_TYPE_TFS_Z       = 4,
+    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
+    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
 };

 // sampling parameters
@@ -21,7 +21,7 @@ struct gpt_sampler_params {

     int32_t n_prev   = 64;    // number of previous tokens to remember
     int32_t n_probs  = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep = 0;     // 0 = disabled, otherwise constraints should return at least min_keep tokens
+    int32_t min_keep = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
     int32_t top_k    = 40;    // <= 0 to use vocab size
     float   top_p    = 0.95f; // 1.0 = disabled
     float   min_p    = 0.05f; // 0.0 = disabled
@@ -40,13 +40,13 @@ struct gpt_sampler_params {
     bool penalize_nl = false; // consider newlines as a repeatable token
     bool ignore_eos  = false;

-    std::vector<enum gpt_constraint_type> constraints = {
-        GPT_CONSTRAINT_TYPE_TOP_K,
-        GPT_CONSTRAINT_TYPE_TFS_Z,
-        GPT_CONSTRAINT_TYPE_TYPICAL_P,
-        GPT_CONSTRAINT_TYPE_TOP_P,
-        GPT_CONSTRAINT_TYPE_MIN_P,
-        GPT_CONSTRAINT_TYPE_TEMPERATURE
+    std::vector<enum gpt_sampler_type> samplers = {
+        GPT_SAMPLER_TYPE_TOP_K,
+        GPT_SAMPLER_TYPE_TFS_Z,
+        GPT_SAMPLER_TYPE_TYPICAL_P,
+        GPT_SAMPLER_TYPE_TOP_P,
+        GPT_SAMPLER_TYPE_MIN_P,
+        GPT_SAMPLER_TYPE_TEMPERATURE
     };

     std::string grammar; // optional BNF-like grammar to constrain sampling
@@ -73,40 +73,36 @@ struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl);
 void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool apply_grammar);
 void gpt_sampler_reset (struct gpt_sampler * gsmpl);

-void gpt_sampler_apply_grammar(struct gpt_sampler * gsmpl, llama_token_data_array * cur_p);
-
-void gpt_sampler_set_logits(struct gpt_sampler * gsmpl, const float * logits);
-
 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);

-llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_token_data_array * cur_p);
+//llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_token_data_array * cur_p);

 llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);

-void gpt_print_timings(struct llama_context * ctx, struct gpt_sampler * gsmpl);
+void gpt_print_timings(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);

 // extended sampling implementation:
 //
 // - set logits
-// - apply the configured sampling constraints
+// - apply the configured sampler chain
 // - check if the token fits the grammar (if any)
 // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
 //
-// if grammar_first is true, the grammar is applied before the constraints (slower)
+// if grammar_first is true, the grammar is applied before the samplers (slower)
 // useful in cases where all the resulting candidates must fit the grammar
 //
 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);

 // helpers

-// print the constraints into a string
+// print the sampler chain into a string
 std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);

 // get a string representation of the last accepted tokens
 std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);

-char        gpt_constraint_type_to_chr(enum gpt_constraint_type cnstr);
-std::string gpt_constraint_type_to_str(enum gpt_constraint_type cnstr);
+char        gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
+std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);

-std::vector<enum gpt_constraint_type> gpt_constraint_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<enum gpt_constraint_type> gpt_constraint_types_from_chars(const std::string & chars);
+std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);

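Putting the header together, a minimal sketch of the intended call pattern in a generation loop (assuming model, ctx, and params are already set up; idx = -1 samples from the logits of the last decoded token, as the examples below do):

    struct gpt_sampler * gsmpl = gpt_sampler_init(model, params.sparams);

    while (true) {
        const llama_token id = gpt_sampler_sample(gsmpl, ctx, /*idx =*/ -1);

        // record the token so the penalties and the grammar see it
        gpt_sampler_accept(gsmpl, id, /*apply_grammar =*/ true);

        if (llama_token_is_eog(model, id)) {
            break;
        }

        // ... append id to the batch and llama_decode() the next step ...
    }

    gpt_sampler_free(gsmpl);
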
@@ -50,9 +50,9 @@ defer {
     llama_free(context)
 }

-var sparams = llama_sampler_params()
+var sparams = llama_sampler_chain_default_params()

-let smpl = llama_sampler_init(model, sparams)
+let smpl = llama_sampler_chain_init(sparams)
 guard smpl != nil else {
     print("Failed to initialize sampling")
     exit(1)

@@ -61,9 +61,9 @@ defer {
     llama_sampler_free(smpl)
 }

-llama_sampler_constraint_add(smpl, llama_constraint_init_top_k(40));
-llama_sampler_constraint_add(smpl, llama_constraint_init_top_p(0.9, 1));
-llama_sampler_constraint_add(smpl, llama_constraint_init_temp (0.4));
+llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40));
+llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
+llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.4));

 let n_ctx = llama_n_ctx(context)

@@ -137,11 +137,9 @@ while n_cur <= n_len {
         continue
     }

-    var logits = llama_get_logits_ith(context, i_batch[i])
-
-    llama_sampler_set_logits(smpl, logits)
-
-    let new_token_id = llama_sampler_sample(smpl, nil)
+    let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])

     llama_sampler_accept(smpl, new_token_id)

     // is it an end of stream? -> mark the stream as finished
     if llama_token_is_eog(model, new_token_id) || n_cur == n_len {

@@ -64,15 +64,13 @@ int main(int argc, char ** argv) {

     llama_context * ctx = llama_new_context_with_model(model, ctx_params);

-    auto sparams = llama_sampler_default_params();
+    auto sparams = llama_sampler_chain_default_params();

-    sparams.seed = params.sparams.seed;
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);

-    llama_sampler * smpl = llama_sampler_init(model, sparams);
-
-    llama_sampler_constraint_add(smpl, llama_constraint_init_top_k(params.sparams.top_k));
-    llama_sampler_constraint_add(smpl, llama_constraint_init_top_p(params.sparams.top_p, params.sparams.min_keep));
-    llama_sampler_constraint_add(smpl, llama_constraint_init_temp (params.sparams.temp));
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));

     if (ctx == NULL) {
         fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);

@@ -173,11 +171,9 @@ int main(int argc, char ** argv) {
             continue;
         }

-        const auto * logits = llama_get_logits_ith(ctx, i_batch[i]);
-
-        llama_sampler_set_logits(smpl, logits);
-
-        const llama_token new_token_id = llama_sampler_sample(smpl, nullptr);
+        const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);

         llama_sampler_accept(smpl, new_token_id);

         // is it an end of generation? -> mark the stream as finished
         if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {

@@ -120,11 +120,9 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std

         llama_decode(ctx, bat);

-        const auto * logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
-
-        llama_sampler_set_logits(smpl, logits);
-
-        llama_token token = llama_sampler_sample(smpl, nullptr);
+        llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
+        llama_sampler_accept(smpl, token);

         if (token == eos_token) {
             break;
         }

@@ -171,11 +169,9 @@ int main(int argc, char * argv[]) {
     // create generation context
     llama_context * ctx = llama_new_context_with_model(model, cparams);

-    auto sparams = llama_sampler_default_params();
+    auto sparams = llama_sampler_chain_default_params();

-    sparams.type = LLAMA_SAMPLER_TYPE_GREEDY;
-
-    llama_sampler * smpl = llama_sampler_init(model, sparams);
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);

     // ### Embedding/Representation ###
     // samples taken from: https://github.com/ContextualAI/gritlm#basic

@@ -394,12 +394,10 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
     if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
     if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");

-    const auto * logits = llama_get_logits_ith(context, batch->n_tokens - 1);
-
-    llama_sampler_set_logits(sampling, logits);
-
     // sample the most likely token
-    const auto new_token_id = llama_sampler_sample(sampling, nullptr);
+    const auto new_token_id = llama_sampler_sample(sampling, context, batch->n_tokens - 1);

     llama_sampler_accept(sampling, new_token_id);

     const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
     if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {

@@ -43,9 +43,8 @@ actor LlamaContext {
         self.tokens_list = []
         self.batch = llama_batch_init(512, 0, 1)
         self.temporary_invalid_cchars = []
-        var sparams = llama_sampler_default_params()
-        sparams.type = LLAMA_SAMPLER_TYPE_GREEDY
-        self.sampling = llama_sampler_init(context, sparams)
+        var sparams = llama_sampler_chain_default_params()
+        self.sampling = llama_sampler_chain_init(sparams)
     }

     deinit {

@@ -148,12 +147,9 @@ actor LlamaContext {
     func completion_loop() -> String {
         var new_token_id: llama_token = 0

-        let n_vocab = llama_n_vocab(model)
-        let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
-
-        llama_sampler_set_logits(sampling, logits);
-
-        new_token_id = llama_sampler_sample(sampling, nil)
+        new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)

         llama_sampler_accept(sampling, new_token_id)

         if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
             print("\n")

@@ -83,11 +83,11 @@ int main(int argc, char ** argv) {
         return 1;
     }

-    auto sparams = llama_sampler_default_params();
+    auto sparams = llama_sampler_chain_default_params();

-    sparams.type = LLAMA_SAMPLER_TYPE_GREEDY;
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);

-    llama_sampler * smpl = llama_sampler_init(model, sparams);
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

     // tokenize the prompt
     std::vector<llama_token> tokens_list;

@@ -220,12 +220,9 @@ int main(int argc, char ** argv) {
     while (n_cur <= n_len) {
         // sample the next token
         {
-            const auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
-
-            llama_sampler_set_logits(smpl, logits);
-
-            // sample the most likely token
-            const llama_token new_token_id = llama_sampler_sample(smpl, nullptr);
+            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);

             llama_sampler_accept(smpl, new_token_id);

             // is it an end of generation?
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {

@@ -38,10 +38,12 @@ int main(int argc, char ** argv) {
         return 1;
     }

-    llama_sampler_params sparams = llama_sampler_default_params();
-    sparams.seed = params.sparams.seed;
+    auto sparams = llama_sampler_chain_default_params();

-    llama_sampler * smpl = llama_sampler_init(model, sparams);
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl, llama_sampler_init_softmax());
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));

     // tokenize prompt
     auto tokens = llama_tokenize(ctx, params.prompt, true);

@@ -69,13 +71,11 @@ int main(int argc, char ** argv) {
     printf("\nfirst run: %s", params.prompt.c_str());

     for (auto i = 0; i < params.n_predict; i++) {
-        const auto * logits = llama_get_logits(ctx);
-
-        llama_sampler_set_logits(smpl, logits);
-
-        auto next_token = llama_sampler_sample(smpl, nullptr);
+        auto next_token = llama_sampler_sample(smpl, ctx, -1);
         auto next_token_str = llama_token_to_piece(ctx, next_token);

         llama_sampler_accept(smpl, next_token);

         printf("%s", next_token_str.c_str());
         result0 += next_token_str;

@@ -96,7 +96,10 @@ int main(int argc, char ** argv) {
     // make new context
     auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));

-    llama_sampler * smpl2 = llama_sampler_init(model, sparams);
+    llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl2, llama_sampler_init_softmax());
+    llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));

     printf("\nsecond run: %s", params.prompt.c_str());

@@ -126,13 +129,11 @@ int main(int argc, char ** argv) {

     // second run
     for (auto i = 0; i < params.n_predict; i++) {
-        const auto * logits = llama_get_logits(ctx2);
-
-        llama_sampler_set_logits(smpl2, logits);
-
-        auto next_token = llama_sampler_sample(smpl2, nullptr);
+        auto next_token = llama_sampler_sample(smpl2, ctx2, -1);
         auto next_token_str = llama_token_to_piece(ctx2, next_token);

         llama_sampler_accept(smpl2, next_token);

         printf("%s", next_token_str.c_str());
         result1 += next_token_str;

@@ -157,7 +158,10 @@ int main(int argc, char ** argv) {
     // make new context
     auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));

-    llama_sampler * smpl3 = llama_sampler_init(model, sparams);
+    llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl3, llama_sampler_init_softmax());
+    llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));

     printf("\nsingle seq run: %s", params.prompt.c_str());

@@ -215,13 +219,11 @@ int main(int argc, char ** argv) {

     // third run with seq 1 instead of 0
     for (auto i = 0; i < params.n_predict; i++) {
-        const auto * logits = llama_get_logits(ctx3);
-
-        llama_sampler_set_logits(smpl3, logits);
-
-        auto next_token = llama_sampler_sample(smpl3, nullptr);
+        auto next_token = llama_sampler_sample(smpl3, ctx3, -1);
         auto next_token_str = llama_token_to_piece(ctx3, next_token);

         llama_sampler_accept(smpl3, next_token);

         printf("%s", next_token_str.c_str());
         result2 += next_token_str;

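The softmax + seeded dist pair is rebuilt identically for all three runs above, which is what makes the saved/restored outputs comparable: the same seed yields the same token choices for identical logits. A condensed sketch of that repeated pattern (the helper name is hypothetical, not from the commit):

    // hypothetical helper mirroring the three call sites above
    static llama_sampler * make_seeded_chain(uint32_t seed) {
        llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());

        llama_sampler_chain_add(smpl, llama_sampler_init_softmax());
        llama_sampler_chain_add(smpl, llama_sampler_init_dist(seed));

        return smpl;
    }
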
@@ -1027,17 +1027,17 @@ struct server_context {
         }

         {
-            const auto & constraints = data.find("samplers");
-            if (constraints != data.end() && constraints->is_array()) {
-                std::vector<std::string> constraint_names;
-                for (const auto & name : *constraints) {
+            const auto & samplers = data.find("samplers");
+            if (samplers != data.end() && samplers->is_array()) {
+                std::vector<std::string> sampler_names;
+                for (const auto & name : *samplers) {
                     if (name.is_string()) {
-                        constraint_names.emplace_back(name);
+                        sampler_names.emplace_back(name);
                     }
                 }
-                slot.sparams.constraints = gpt_constraint_types_from_names(constraint_names, false);
+                slot.sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
             } else {
-                slot.sparams.constraints = default_sparams.constraints;
+                slot.sparams.samplers = default_sparams.samplers;
             }
         }

@@ -1253,10 +1253,10 @@ struct server_context {
     }

     json get_formated_generation(const server_slot & slot) const {
-        std::vector<std::string> constraints;
-        constraints.reserve(slot.sparams.constraints.size());
-        for (const auto & constraint : slot.sparams.constraints) {
-            constraints.emplace_back(gpt_constraint_type_to_str(constraint));
+        std::vector<std::string> samplers;
+        samplers.reserve(slot.sparams.samplers.size());
+        for (const auto & sampler : slot.sparams.samplers) {
+            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
         }

         return json {

@@ -1290,7 +1290,7 @@ struct server_context {
             {"n_probs",  slot.sparams.n_probs},
             {"min_keep", slot.sparams.min_keep},
             {"grammar",  slot.sparams.grammar},
-            {"samplers", constraints},
+            {"samplers", samplers},
         };
     }

@@ -55,11 +55,9 @@ int main(int argc, char ** argv) {
         return 1;
     }

-    auto sparams = llama_sampler_default_params();
+    auto sparams = llama_sampler_chain_default_params();

-    sparams.type = LLAMA_SAMPLER_TYPE_GREEDY;
-
-    llama_sampler * smpl = llama_sampler_init(model, sparams);
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);

     // tokenize the prompt

@@ -116,12 +114,9 @@ int main(int argc, char ** argv) {
     while (n_cur <= n_predict) {
         // sample the next token
         {
-            const auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
-
-            llama_sampler_set_logits(smpl, logits);
-
-            // sample the most likely token
-            const llama_token new_token_id = llama_sampler_sample(smpl, nullptr);
+            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);

             llama_sampler_accept(smpl, new_token_id);

             // is it an end of generation?
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {

@@ -179,7 +179,7 @@ int main(int argc, char ** argv) {
     // target model sampling context (reuse the llama_context's sampling instance)
     struct gpt_sampler * smpl = gpt_sampler_init(model_tgt, params.sparams);

-    struct llama_constraint * softmax = llama_constraint_init_softmax();
+    struct llama_sampler * softmax = llama_sampler_init_softmax();

     // draft sequence data
     std::vector<seq_draft> drafts(n_seq_dft);

@@ -255,7 +255,7 @@ int main(int argc, char ** argv) {

                 LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
                 float r = u_dist(rng);
-                llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), true };
+                llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };

                 //GGML_ASSERT(dist_tgt.size <= dist_dft.size);

@@ -625,7 +625,7 @@ int main(int argc, char ** argv) {
         gpt_sampler_free(drafts[s].smpl);
     }

-    llama_constraint_free(softmax);
+    llama_sampler_free(softmax);
     llama_batch_free(batch_dft);

     llama_free(ctx_tgt);

include/llama.h (162 lines changed)
@@ -216,6 +216,7 @@ extern "C" {
         // TODO: consider SoA
         llama_token_data * data;
         size_t size;
+        int64_t selected;
         bool sorted;
     } llama_token_data_array;

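The new selected field is how a chain reports its final pick: set_logits() in common/sampling.cpp above initializes it to LLAMA_TOKEN_NULL, and a chain ending in a dist or greedy sampler fills it in. A sketch of the consumer side, mirroring what gpt_sampler_sample does:

    llama_sampler_apply(chain, &cur_p);

    // read the selected candidate, as the commit's own consumer does
    const llama_token id = cur_p.data[cur_p.selected].id;

    GGML_ASSERT(id != LLAMA_TOKEN_NULL && "no token was selected");
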
@@ -369,21 +370,9 @@ extern "C" {
         float bias;
     } llama_logit_bias;

-    enum llama_sampler_type {
-        LLAMA_SAMPLER_TYPE_GREEDY = 0,
-        LLAMA_SAMPLER_TYPE_DIST   = 1,
-    };
-
-    typedef struct llama_sampler_params {
-        uint32_t seed; // the seed used to initialize the rng of the sampler
-
-        int32_t n_prev; // size of ring buffer to keep previous accepted tokens (needed for llama_sampler_prev_ API)
-
-        // TODO: will be used by the llama_decode_with_sampler() API in the future
-        enum llama_sampler_type type;
-
+    typedef struct llama_sampler_chain_params {
         bool no_timing; // whether to measure performance timings
-    } llama_sampler_params;
+    } llama_sampler_chain_params;

     // performance timing information
     struct llama_timings {
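With seed, n_prev, and type gone, the chain params carry only the timing switch; a minimal sketch:

    llama_sampler_chain_params cparams = llama_sampler_chain_default_params();
    cparams.no_timing = true; // skip performance-timing measurements for this chain

    llama_sampler * chain = llama_sampler_chain_init(cparams);

Seeding now lives with the sampler that needs it (llama_sampler_init_dist), and the previous-token history moved into gpt_sampler's ring_buffer.
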
@@ -412,7 +401,7 @@ extern "C" {
     // TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
     LLAMA_API struct llama_model_params          llama_model_default_params(void);
     LLAMA_API struct llama_context_params        llama_context_default_params(void);
-    LLAMA_API struct llama_sampler_params        llama_sampler_default_params(void);
+    LLAMA_API struct llama_sampler_chain_params  llama_sampler_chain_default_params(void);
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);

     // Initialize the llama + ggml backend

@@ -1003,70 +992,73 @@ extern "C" {
     //
     // Sampling API
     //
-    // - Constraints
-    //   The llama_constraint object works on a set of candidate tokens (llama_token_data_array), by modifying their
-    //   logits and probabilities inplace. The interface is abstracted so that users can implement custom constraints.
-    //
-    // - Samplers
-    //   The llama_sampler samples a token based on the candidate token probabilities. Before the actual sampling, the
-    //   sampler can apply a sequence of constraints in order to modify the probabilities of the candidates.
-    //
-    //   The llama_sampler object contains the entire sampling information:
-    //
-    //   - RNG state (seed and generator)
-    //   - Custom set of constraints (see llama_sampler_constraint_add)
-    //   - Sampling method (greedy, dist)
-    //   - Previous tokens
-    //
     // In the future, it will be utilized offload the sampling to the backends (e.g. GPU).
     //
     // TODO: in the future, the entire API should be changed to accept llama_vocab, instead of llama_model

-    // constraints
+    typedef void * llama_sampler_context_t;

-    struct llama_constraint;
-
-    typedef void * llama_constraint_context_t;
-
-    // user code can implement the interface below in order to create custom llama_constraint
-    struct llama_constraint_i {
-        const char *              (*name)  (const struct llama_constraint * cnstr);                                 // can be NULL
-        void                      (*accept)(      struct llama_constraint * cnstr, llama_token token);              // can be NULL
-        void                      (*apply) (      struct llama_constraint * cnstr, llama_token_data_array * cur_p); // required
-        void                      (*reset) (      struct llama_constraint * cnstr);                                 // can be NULL
-        struct llama_constraint * (*clone) (const struct llama_constraint * cnstr);                                 // can be NULL if ctx is NULL
-        void                      (*free)  (      struct llama_constraint * cnstr);                                 // can be NULL if ctx is NULL
+    // user code can implement the interface below in order to create custom llama_sampler
+    struct llama_sampler_i {
+        const char *           (*name)  (const struct llama_sampler * smpl);                                 // can be NULL
+        void                   (*accept)(      struct llama_sampler * smpl, llama_token token);              // can be NULL
+        void                   (*apply) (      struct llama_sampler * smpl, llama_token_data_array * cur_p); // required
+        void                   (*reset) (      struct llama_sampler * smpl);                                 // can be NULL
+        struct llama_sampler * (*clone) (const struct llama_sampler * smpl);                                 // can be NULL if ctx is NULL
+        void                   (*free)  (      struct llama_sampler * smpl);                                 // can be NULL if ctx is NULL

         // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
-        //void (*apply_ggml) (struct llama_constraint * cnstr, ...);
+        //void (*apply_ggml) (struct llama_sampler * smpl, ...);
     };

-    struct llama_constraint {
-        struct llama_constraint_i * iface;
-        llama_constraint_context_t  ctx;
+    struct llama_sampler {
+        struct llama_sampler_i * iface;
+        llama_sampler_context_t  ctx;
     };

+    LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
+    LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
+    LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
+    LLAMA_API void                   llama_sampler_reset (      struct llama_sampler * smpl);
+    LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl);
+    // important: do not free if the sampler has been added to a llama_sampler_chain (via llama_sampler_chain_add)
+    LLAMA_API void                   llama_sampler_free  (      struct llama_sampler * smpl);
+
+    // llama_sampler_chain is a type of llama_sampler that can contain multiple llama_samplers
+
+    LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params);
+
+    // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
+    LLAMA_API void                   llama_sampler_chain_add(      struct llama_sampler * chain, struct llama_sampler * smpl);
+    LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
+    LLAMA_API int                    llama_sampler_chain_n  (const struct llama_sampler * chain);
+
+    // available samplers:
+
+    LLAMA_API struct llama_sampler * llama_sampler_init_greedy   (void);
+    LLAMA_API struct llama_sampler * llama_sampler_init_dist     (uint32_t seed);

     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    LLAMA_API struct llama_constraint * llama_constraint_init_softmax (void);
+    LLAMA_API struct llama_sampler * llama_sampler_init_softmax  (void);

     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API struct llama_constraint * llama_constraint_init_top_k (int32_t k);
+    LLAMA_API struct llama_sampler * llama_sampler_init_top_k    (int32_t k);

     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API struct llama_constraint * llama_constraint_init_top_p (float p, int32_t min_keep);
+    LLAMA_API struct llama_sampler * llama_sampler_init_top_p    (float p, int32_t min_keep);

     /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
-    LLAMA_API struct llama_constraint * llama_constraint_init_min_p (float p, int32_t min_keep);
+    LLAMA_API struct llama_sampler * llama_sampler_init_min_p    (float p, int32_t min_keep);

     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API struct llama_constraint * llama_constraint_init_tail_free (float z, int32_t min_keep);
+    LLAMA_API struct llama_sampler * llama_sampler_init_tail_free(float z, int32_t min_keep);

     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API struct llama_constraint * llama_constraint_init_typical (float p, int32_t min_keep);
-    LLAMA_API struct llama_constraint * llama_constraint_init_temp (float t);
+    LLAMA_API struct llama_sampler * llama_sampler_init_typical  (float p, int32_t min_keep);
+    LLAMA_API struct llama_sampler * llama_sampler_init_temp     (float t);

     /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
-    LLAMA_API struct llama_constraint * llama_constraint_init_temp_ext (float t, float delta, float exponent);
+    LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);

     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
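Because llama_sampler_i is public, user code can slot custom samplers into a chain. A hypothetical sketch (the name and the banned-token logic are illustrative, not part of this commit; it assumes the chain releases owned samplers via llama_sampler_free, per the comments above):

    #include <cmath> // for INFINITY

    // a stateless custom sampler that masks out token id 42
    static const char * ban42_name(const struct llama_sampler * /*smpl*/) {
        return "ban-42";
    }

    static void ban42_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
        for (size_t i = 0; i < cur_p->size; ++i) {
            if (cur_p->data[i].id == 42) {
                cur_p->data[i].logit = -INFINITY; // same convention the grammar sampler uses
            }
        }
    }

    static struct llama_sampler_i ban42_i = {
        /* .name   = */ ban42_name,
        /* .accept = */ nullptr,     // can be NULL
        /* .apply  = */ ban42_apply, // required
        /* .reset  = */ nullptr,
        /* .clone  = */ nullptr,     // can be NULL because ctx is NULL
        /* .free   = */ nullptr,
    };

    // the chain takes ownership, so allocate rather than passing a stack object
    llama_sampler_chain_add(chain, new llama_sampler { &ban42_i, /* ctx = */ nullptr });
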
@@ -1074,7 +1066,7 @@ extern "C" {
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API struct llama_constraint * llama_constraint_init_mirostat(
LLAMA_API struct llama_sampler * llama_sampler_init_mirostat(
        const struct llama_model * model,
        float tau,
        float eta);
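Taken together, the parameters above translate into a call like the following (a sketch; the tau/eta values here are illustrative, not mandated by this API):

// Mirostat 1.0: target surprise tau, learning rate eta;
// m is fixed to 100 internally by this implementation
struct llama_sampler * mirostat = llama_sampler_init_mirostat(model, /*tau =*/ 5.0f, /*eta =*/ 0.1f);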
@@ -1084,16 +1076,16 @@ extern "C" {
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API struct llama_constraint * llama_constraint_init_mirostat_v2(
LLAMA_API struct llama_sampler * llama_sampler_init_mirostat_v2(
        float tau,
        float eta);

LLAMA_API struct llama_constraint * llama_constraint_init_grammar(
LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
        const struct llama_model * model,
        const char * grammar_str,
        const char * grammar_root);
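As a usage sketch (the GBNF string below is a toy example, not taken from this change; "root" names the start rule of the grammar):

// constrain generation to tokens that can continue the grammar
const char * gbnf = "root ::= \"yes\" | \"no\"";
struct llama_sampler * grmr = llama_sampler_init_grammar(model, gbnf, "root");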

LLAMA_API struct llama_constraint * llama_constraint_init_penalties(
LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
        const struct llama_model * model,
        int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
        float penalty_repeat, // 1.0 = disabled
@@ -1102,57 +1094,14 @@ extern "C" {
        bool penalize_nl, // consider newlines as a repeatable token
        bool ignore_eos); // ignore the end-of-sequence token
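For reference, a plausible invocation (the values are illustrative, not prescribed by this diff):

// repeat penalty over the last 64 tokens; frequency/presence penalties disabled
struct llama_sampler * pen = llama_sampler_init_penalties(
        model,
        /*penalty_last_n =*/ 64,
        /*penalty_repeat =*/ 1.1f,
        /*penalty_freq =*/ 0.0f,
        /*penalty_present =*/ 0.0f,
        /*penalize_nl =*/ false,
        /*ignore_eos =*/ false);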

LLAMA_API struct llama_constraint * llama_constraint_init_logit_bias(
LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
        const struct llama_model * model,
        int32_t n_logit_bias,
        const llama_logit_bias * logit_bias);
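A sketch of the intended call pattern (the token ids are placeholders; llama_logit_bias pairs a token id with an additive bias):

// suppress token 42 entirely and nudge token 7 upward
llama_logit_bias biases[] = {
    { /*token =*/ 42, /*bias =*/ -INFINITY }, // requires <math.h> / <cmath>
    { /*token =*/ 7, /*bias =*/ 2.0f },
};
struct llama_sampler * lb = llama_sampler_init_logit_bias(model, 2, biases);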

LLAMA_API struct llama_constraint * llama_constraint_clone(const struct llama_constraint * cnstr);

// important: do not call if the constraint has been added to a llama_sampler (via llama_sampler_constraint_add)
LLAMA_API void llama_constraint_free(struct llama_constraint * cnstr);

LLAMA_API const char * llama_constraint_name (const struct llama_constraint * cnstr);
LLAMA_API void llama_constraint_accept( struct llama_constraint * cnstr, llama_token token);
LLAMA_API void llama_constraint_apply ( struct llama_constraint * cnstr, llama_token_data_array * cur_p);
LLAMA_API void llama_constraint_reset ( struct llama_constraint * cnstr);

// samplers

LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_model * model, struct llama_sampler_params params);
LLAMA_API void llama_sampler_free ( struct llama_sampler * smpl);
LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl);
LLAMA_API void llama_sampler_reset ( struct llama_sampler * smpl);
LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);

LLAMA_API void llama_sampler_set_logits(struct llama_sampler * smpl, const float * logits);

LLAMA_API llama_token_data_array * llama_sampler_get_candidates(struct llama_sampler * smpl);

// important: takes ownership of the constraint object and will free it in llama_sampler_free
LLAMA_API void llama_sampler_constraint_add( struct llama_sampler * smpl, struct llama_constraint * cnstr);
LLAMA_API int llama_sampler_n_constraints (const struct llama_sampler * smpl);
LLAMA_API struct llama_constraint * llama_sampler_constraint_get(const struct llama_sampler * smpl, int32_t i);

LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, llama_token_data_array * cur_p);

/// @details Get the number of accepted tokens so far (max of n_prev)
LLAMA_API int llama_sampler_n_prev(const struct llama_sampler * smpl);

/// @details Get the ith accepted token
/// @param ith [0, n_prev), ith == 0 is the last accepted token.
/// returns LLAMA_TOKEN_NULL if ith is out of bounds
LLAMA_API llama_token llama_sampler_prev(const struct llama_sampler * smpl, int32_t ith);

/// @details Get the last accepted token
/// Same as llama_sampler_prev(smpl, 0)
/// returns LLAMA_TOKEN_NULL if there are no accepted tokens
LLAMA_API llama_token llama_sampler_last(const struct llama_sampler * smpl);
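A short sketch of how the history accessors compose (assuming tokens have already been accepted into smpl):

// walk the accepted-token history, newest first
const int n = llama_sampler_n_prev(smpl);
for (int i = 0; i < n; ++i) {
    const llama_token t = llama_sampler_prev(smpl, i); // i == 0 is llama_sampler_last(smpl)
    // ... inspect t ...
}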

LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);

// TODO: extend in the future
//LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t i);
//LLAMA_API void llama_decode_with_sampler(struct llama_context * ctx, struct llama_sampler * smpl, struct llama_batch batch, ...);

//
@@ -1172,8 +1121,9 @@ extern "C" {
// Performance information
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);

LLAMA_API void llama_print_timings(struct llama_context * ctx, struct llama_sampler * smpl);
LLAMA_API void llama_reset_timings(struct llama_context * ctx, struct llama_sampler * smpl);
// note: requires llama_sampler_chain. how to prevent misuse?
LLAMA_API void llama_print_timings(const struct llama_context * ctx, const struct llama_sampler * chain);
LLAMA_API void llama_reset_timings( struct llama_context * ctx, struct llama_sampler * chain);

// Print system information
LLAMA_API const char * llama_print_system_info(void);
@@ -32,6 +32,20 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
// helpers
//

struct time_meas {
    time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}

    ~time_meas() {
        if (t_start_us >= 0) {
            t_acc += ggml_time_us() - t_start_us;
        }
    }

    const int64_t t_start_us;

    int64_t & t_acc;
};
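time_meas is a small RAII helper: it samples ggml_time_us() on construction and adds the elapsed time to a caller-owned accumulator on destruction. A usage sketch (t_sample_us stands in for the accumulator fields used later in this diff):

int64_t t_sample_us = 0;
{
    time_meas tm(t_sample_us); // starts the clock (pass disable = true to make it a no-op)
    // ... timed work ...
}   // destructor runs here and adds the elapsed microseconds to t_sample_us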

static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) {
        return;
File diff suppressed because it is too large
@@ -2,89 +2,26 @@

#include "llama-grammar.h"

#include <random>
#include <unordered_map>

struct llama_vocab;
struct llama_grammar;

using llama_token_cnt = std::unordered_map<llama_token, int>;

// TODO: tmp exposed until test-sampling is fixed
void llama_constraint_penalties_impl(
        llama_token_data_array * cur_p,
        const llama_token_cnt & token_count,
        float penalty_repeat,
        float penalty_freq,
        float penalty_present);

// constraints

struct llama_constraint * llama_constraint_init_softmax_impl ();
struct llama_constraint * llama_constraint_init_top_k_impl (int32_t k);
struct llama_constraint * llama_constraint_init_top_p_impl (float p, size_t min_keep);
struct llama_constraint * llama_constraint_init_min_p_impl (float p, size_t min_keep);
struct llama_constraint * llama_constraint_init_tail_free_impl (float z, size_t min_keep);
struct llama_constraint * llama_constraint_init_typical_impl (float p, size_t min_keep);
struct llama_constraint * llama_constraint_init_temp_impl (float t);
struct llama_constraint * llama_constraint_init_temp_ext_impl (float t, float delta, float exponent);

struct llama_constraint * llama_constraint_init_mirostat_impl(
        const struct llama_vocab & vocab,
        float tau,
        float eta,
        int32_t m);

struct llama_constraint * llama_constraint_init_mirostat_v2_impl(
        float tau,
        float eta);

struct llama_constraint * llama_constraint_init_grammar_impl(
        const struct llama_vocab & vocab,
        const char * grammar_str,
        const char * grammar_root);

struct llama_constraint * llama_constraint_init_penalties_impl(
        const struct llama_vocab & vocab,
        int32_t penalty_last_n,
        float penalty_repeat,
        float penalty_freq,
        float penalty_present,
        bool penalize_nl,
        bool ignore_eos);

LLAMA_API struct llama_constraint * llama_constraint_init_logit_bias_impl(
        const struct llama_vocab & vocab,
        int32_t n_logit_bias,
        const llama_logit_bias * logit_bias);

struct llama_constraint * llama_constraint_clone_impl(const struct llama_constraint & cnstr);

void llama_constraint_free_impl(struct llama_constraint * cnstr);

const char * llama_constraint_name_impl (const struct llama_constraint & cnstr);
void llama_constraint_accept_impl( struct llama_constraint & cnstr, llama_token token);
void llama_constraint_apply_impl ( struct llama_constraint & cnstr, struct llama_token_data_array * cur_p);
void llama_constraint_reset_impl ( struct llama_constraint & cnstr);

// samplers

struct llama_sampler {
    llama_sampler_params params;
const char * llama_sampler_name_impl (const struct llama_sampler & smpl);
void llama_sampler_accept_impl( struct llama_sampler & smpl, llama_token token);
void llama_sampler_apply_impl ( struct llama_sampler & smpl, struct llama_token_data_array * cur_p);
void llama_sampler_reset_impl ( struct llama_sampler & smpl);
struct llama_sampler * llama_sampler_clone_impl (const struct llama_sampler & smpl);
void llama_sampler_free_impl ( struct llama_sampler * smpl);

    const struct llama_vocab * vocab;
// sampler chain

    // state
struct llama_sampler_chain {
    llama_sampler_chain_params params;

    std::mt19937 rng;

    ring_buffer<llama_token> prev;

    std::vector<llama_constraint *> constraints;

    std::vector<llama_token_data> cur;

    llama_token_data_array cur_p;
    std::vector<struct llama_sampler *> samplers;

    // timing
@@ -93,18 +30,57 @@ struct llama_sampler {
    mutable int32_t n_sample;
};

struct llama_sampler * llama_sampler_init_impl (const struct llama_vocab & vocab, struct llama_sampler_params params);
void llama_sampler_free_impl ( struct llama_sampler * smpl);
struct llama_sampler * llama_sampler_clone_impl (const struct llama_sampler & smpl);
void llama_sampler_reset_impl ( struct llama_sampler & smpl);
void llama_sampler_accept_impl( struct llama_sampler & smpl, llama_token token);
void llama_sampler_apply_impl ( struct llama_sampler & smpl, struct llama_token_data_array * cur_p);
struct llama_sampler * llama_sampler_chain_init_impl( struct llama_sampler_chain_params params);
void llama_sampler_chain_add_impl ( struct llama_sampler_chain & chain, struct llama_sampler * smpl);
struct llama_sampler * llama_sampler_chain_get_impl (const struct llama_sampler_chain & chain, int32_t i);
int llama_sampler_chain_n_impl (const struct llama_sampler_chain & chain);

void llama_sampler_constraint_add_impl( struct llama_sampler & smpl, struct llama_constraint * cnstr);
int llama_sampler_n_constraints_impl (const struct llama_sampler & smpl);
struct llama_constraint * llama_sampler_constraint_get_impl(const struct llama_sampler & smpl, int ith);
using llama_token_cnt = std::unordered_map<llama_token, int>;

llama_token llama_sampler_sample_impl(struct llama_token_data_array * cur_p, std::mt19937 & rng, enum llama_sampler_type type);
// TODO: tmp exposed until test-sampling is fixed
void llama_sampler_penalties_impl(
        llama_token_data_array * cur_p,
        const llama_token_cnt & token_count,
        float penalty_repeat,
        float penalty_freq,
        float penalty_present);

llama_token llama_sampler_prev_impl (const struct llama_sampler & smpl, int ith);
int llama_sampler_n_prev_impl(const struct llama_sampler & smpl);
struct llama_sampler * llama_sampler_init_greedy_impl ();
struct llama_sampler * llama_sampler_init_dist_impl (uint32_t seed);
struct llama_sampler * llama_sampler_init_softmax_impl ();
struct llama_sampler * llama_sampler_init_top_k_impl (int32_t k);
struct llama_sampler * llama_sampler_init_top_p_impl (float p, size_t min_keep);
struct llama_sampler * llama_sampler_init_min_p_impl (float p, size_t min_keep);
struct llama_sampler * llama_sampler_init_tail_free_impl(float z, size_t min_keep);
struct llama_sampler * llama_sampler_init_typical_impl (float p, size_t min_keep);
struct llama_sampler * llama_sampler_init_temp_impl (float t);
struct llama_sampler * llama_sampler_init_temp_ext_impl (float t, float delta, float exponent);

struct llama_sampler * llama_sampler_init_mirostat_impl(
        const struct llama_vocab & vocab,
        float tau,
        float eta,
        int32_t m);

struct llama_sampler * llama_sampler_init_mirostat_v2_impl(
        float tau,
        float eta);

struct llama_sampler * llama_sampler_init_grammar_impl(
        const struct llama_vocab & vocab,
        const char * grammar_str,
        const char * grammar_root);

struct llama_sampler * llama_sampler_init_penalties_impl(
        const struct llama_vocab & vocab,
        int32_t penalty_last_n,
        float penalty_repeat,
        float penalty_freq,
        float penalty_present,
        bool penalize_nl,
        bool ignore_eos);

LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias_impl(
        const struct llama_vocab & vocab,
        int32_t n_logit_bias,
        const llama_logit_bias * logit_bias);
src/llama.cpp
@@ -147,21 +147,6 @@ static void zeros(std::ofstream & file, size_t n) {
    }
}

struct time_meas {
    time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}

    ~time_meas() {
        if (t_start_us >= 0) {
            t_acc += ggml_time_us() - t_start_us;
        }
    }

    const int64_t t_start_us;

    int64_t & t_acc;
};


LLAMA_ATTRIBUTE_FORMAT(1, 2)
static std::string format(const char * fmt, ...) {
    va_list ap;
@@ -17937,11 +17922,8 @@ struct llama_context_params llama_context_default_params() {
    return result;
}

struct llama_sampler_params llama_sampler_default_params() {
    struct llama_sampler_params result = {
        /*.seed =*/ LLAMA_DEFAULT_SEED,
        /*.n_prev =*/ 256,
        /*.type =*/ LLAMA_SAMPLER_TYPE_DIST,
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
        /*.no_timing =*/ false, // TODO: change to true and set explicitly in examples
    };

@@ -20610,98 +20592,24 @@ int32_t llama_chat_apply_template(
// sampling
//

struct llama_constraint * llama_constraint_init_softmax(void) {
    return llama_constraint_init_softmax_impl();
const char * llama_sampler_name(const struct llama_sampler * smpl) {
    return llama_sampler_name_impl(*smpl);
}

struct llama_constraint * llama_constraint_init_top_k(int32_t k) {
    return llama_constraint_init_top_k_impl(k);
void llama_sampler_accept(struct llama_sampler * smpl, llama_token token) {
    llama_sampler_accept_impl(*smpl, token);
}

struct llama_constraint * llama_constraint_init_top_p(float p, int32_t min_keep) {
    return llama_constraint_init_top_p_impl(p, min_keep);
void llama_sampler_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    llama_sampler_apply_impl(*smpl, cur_p);
}

struct llama_constraint * llama_constraint_init_min_p(float p, int32_t min_keep) {
    return llama_constraint_init_min_p_impl(p, min_keep);
void llama_sampler_reset(struct llama_sampler * smpl) {
    llama_sampler_reset_impl(*smpl);
}

struct llama_constraint * llama_constraint_init_tail_free(float z, int32_t min_keep) {
    return llama_constraint_init_tail_free_impl(z, min_keep);
}

struct llama_constraint * llama_constraint_init_typical(float p, int32_t min_keep) {
    return llama_constraint_init_typical_impl(p, min_keep);
}

struct llama_constraint * llama_constraint_init_temp(float temp) {
    return llama_constraint_init_temp_impl(temp);
}

struct llama_constraint * llama_constraint_init_temp_ext(float temp, float delta, float exponent) {
    return llama_constraint_init_temp_ext_impl(temp, delta, exponent);
}

struct llama_constraint * llama_constraint_init_mirostat(const struct llama_model * model, float tau, float eta) {
    return llama_constraint_init_mirostat_impl(model->vocab, tau, eta, 100);
}

struct llama_constraint * llama_constraint_init_mirostat_v2(float tau, float eta) {
    return llama_constraint_init_mirostat_v2_impl(tau, eta);
}

struct llama_constraint * llama_constraint_init_grammar(const struct llama_model * model, const char * grammar_str, const char * grammar_root) {
    return llama_constraint_init_grammar_impl(model->vocab, grammar_str, grammar_root);
}

struct llama_constraint * llama_constraint_init_penalties(
        const struct llama_model * model,
        int32_t penalty_last_n,
        float penalty_repeat,
        float penalty_freq,
        float penalty_present,
        bool penalize_nl,
        bool ignore_eos) {
    return llama_constraint_init_penalties_impl(model->vocab, penalty_last_n, penalty_repeat, penalty_freq, penalty_present, penalize_nl, ignore_eos);
}

LLAMA_API struct llama_constraint * llama_constraint_init_logit_bias(
        const struct llama_model * model,
        int32_t n_logit_bias,
        const llama_logit_bias * logit_bias) {
    return llama_constraint_init_logit_bias_impl(model->vocab, n_logit_bias, logit_bias);
}

struct llama_constraint * llama_constraint_clone(const struct llama_constraint * cnstr) {
    return llama_constraint_clone_impl(*cnstr);
}

void llama_constraint_free(struct llama_constraint * cnstr) {
    if (cnstr == nullptr) {
        return;
    }

    llama_constraint_free_impl(cnstr);
}

const char * llama_constraint_name(const struct llama_constraint * cnstr) {
    return llama_constraint_name_impl(*cnstr);
}

void llama_constraint_accept(struct llama_constraint * cnstr, llama_token token) {
    llama_constraint_accept_impl(*cnstr, token);
}

void llama_constraint_apply(struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
    llama_constraint_apply_impl(*cnstr, cur_p);
}

void llama_constraint_reset(struct llama_constraint * cnstr) {
    llama_constraint_reset_impl(*cnstr);
}

struct llama_sampler * llama_sampler_init(const struct llama_model * model, struct llama_sampler_params params) {
    return llama_sampler_init_impl(model->vocab, params);
struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
    return llama_sampler_clone_impl(*smpl);
}

void llama_sampler_free(struct llama_sampler * smpl) {
@@ -20712,86 +20620,110 @@ void llama_sampler_free(struct llama_sampler * smpl) {
    llama_sampler_free_impl(smpl);
}

struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
    return llama_sampler_clone_impl(*smpl);
struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) {
    return llama_sampler_chain_init_impl(params);
}

void llama_sampler_reset(struct llama_sampler * smpl) {
    llama_sampler_reset_impl(*smpl);
void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
    llama_sampler_chain_add_impl(*(struct llama_sampler_chain *) chain->ctx, smpl);
}

void llama_sampler_accept(struct llama_sampler * smpl, llama_token token) {
    llama_sampler_accept_impl(*smpl, token);
struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) {
    return llama_sampler_chain_get_impl(*(const struct llama_sampler_chain *) chain->ctx, i);
}

void llama_sampler_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    time_meas tm(smpl->t_sample_us, smpl->params.no_timing);

    if (cur_p == nullptr) {
        cur_p = &smpl->cur_p;
    }

    llama_sampler_apply_impl(*smpl, cur_p);
int llama_sampler_chain_n(const struct llama_sampler * chain) {
    return llama_sampler_chain_n_impl(*(const struct llama_sampler_chain *) chain->ctx);
}

void llama_sampler_set_logits(struct llama_sampler * smpl, const float * logits) {
    const int n_vocab = smpl->vocab->n_vocab;
struct llama_sampler * llama_sampler_init_greedy(void) {
    return llama_sampler_init_greedy_impl();
}

    smpl->cur.resize(n_vocab);
struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
    return llama_sampler_init_dist_impl(seed);
}

struct llama_sampler * llama_sampler_init_softmax(void) {
    return llama_sampler_init_softmax_impl();
}

struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
    return llama_sampler_init_top_k_impl(k);
}

struct llama_sampler * llama_sampler_init_top_p(float p, int32_t min_keep) {
    return llama_sampler_init_top_p_impl(p, min_keep);
}

struct llama_sampler * llama_sampler_init_min_p(float p, int32_t min_keep) {
    return llama_sampler_init_min_p_impl(p, min_keep);
}

struct llama_sampler * llama_sampler_init_tail_free(float z, int32_t min_keep) {
    return llama_sampler_init_tail_free_impl(z, min_keep);
}

struct llama_sampler * llama_sampler_init_typical(float p, int32_t min_keep) {
    return llama_sampler_init_typical_impl(p, min_keep);
}

struct llama_sampler * llama_sampler_init_temp(float temp) {
    return llama_sampler_init_temp_impl(temp);
}

struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
    return llama_sampler_init_temp_ext_impl(temp, delta, exponent);
}

struct llama_sampler * llama_sampler_init_mirostat(const struct llama_model * model, float tau, float eta) {
    return llama_sampler_init_mirostat_impl(model->vocab, tau, eta, 100);
}

struct llama_sampler * llama_sampler_init_mirostat_v2(float tau, float eta) {
    return llama_sampler_init_mirostat_v2_impl(tau, eta);
}

struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * model, const char * grammar_str, const char * grammar_root) {
    return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
}

struct llama_sampler * llama_sampler_init_penalties(
        const struct llama_model * model,
        int32_t penalty_last_n,
        float penalty_repeat,
        float penalty_freq,
        float penalty_present,
        bool penalize_nl,
        bool ignore_eos) {
    return llama_sampler_init_penalties_impl(model->vocab, penalty_last_n, penalty_repeat, penalty_freq, penalty_present, penalize_nl, ignore_eos);
}

LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
        const struct llama_model * model,
        int32_t n_logit_bias,
        const llama_logit_bias * logit_bias) {
    return llama_sampler_init_logit_bias_impl(model->vocab, n_logit_bias, logit_bias);
}

llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
    const auto * logits = llama_get_logits_ith(ctx, idx);

    const int n_vocab = llama_n_vocab(llama_get_model(ctx));

    // TODO: do not allocate each time
    std::vector<llama_token_data> cur(n_vocab);
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        smpl->cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
    }

    smpl->cur_p = { smpl->cur.data(), smpl->cur.size(), false };
    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };

    llama_sampler_apply(smpl, &cur_p);

    return cur_p.data[cur_p.selected].id;
}
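The new entry point pulls logits straight from the context, so callers no longer set logits on the sampler by hand; a typical decode-loop sketch (idx selects which output's logits to sample, matching llama_get_logits_ith):

// after llama_decode(ctx, batch):
const llama_token tok = llama_sampler_sample(smpl, ctx, idx);
llama_sampler_accept(smpl, tok);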

llama_token_data_array * llama_sampler_get_candidates(struct llama_sampler * smpl) {
    return &smpl->cur_p;
}

void llama_sampler_constraint_add(struct llama_sampler * smpl, struct llama_constraint * cnstr) {
    llama_sampler_constraint_add_impl(*smpl, cnstr);
}

int llama_sampler_n_constraints (const struct llama_sampler * smpl) {
    return llama_sampler_n_constraints_impl(*smpl);
}

struct llama_constraint * llama_sampler_constraint_get(const struct llama_sampler * smpl, int32_t i) {
    return llama_sampler_constraint_get_impl(*smpl, i);
}

llama_token llama_sampler_sample(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    time_meas tm(smpl->t_sample_us, smpl->params.no_timing);

    if (cur_p == nullptr) {
        cur_p = &smpl->cur_p;
    }

    auto res = llama_sampler_sample_impl(cur_p, smpl->rng, smpl->params.type);

    smpl->n_sample++;

    return res;
}

int llama_sampler_n_prev(const struct llama_sampler * smpl) {
    return llama_sampler_n_prev_impl(*smpl);
}

llama_token llama_sampler_prev(const struct llama_sampler * smpl, int32_t ith) {
    return llama_sampler_prev_impl(*smpl, ith);
}

llama_token llama_sampler_last(const struct llama_sampler * smpl) {
    return llama_sampler_prev_impl(*smpl, 0);
}

//llama_token llama_sampler_sample(struct llama_sampler * smpl, const struct llama_context * ctx, int32_t i) {
//    GGML_ABORT("not implemented");
//}

//
// model split
//
@@ -20820,7 +20752,9 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
    return 0;
}

void llama_print_timings(struct llama_context * ctx, struct llama_sampler * smpl) {
void llama_print_timings(const struct llama_context * ctx, const struct llama_sampler * chain) {
    auto * smpl = chain ? (const struct llama_sampler_chain *) chain->ctx : nullptr;

    const llama_timings timings = {
        /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
        /*.t_end_ms =*/ 1.00 * ggml_time_ms(),
@@ -20845,13 +20779,15 @@ void llama_print_timings(struct llama_context * ctx, struct llama_sampler * smpl
    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
}

void llama_reset_timings(struct llama_context * ctx, struct llama_sampler * smpl) {
void llama_reset_timings(struct llama_context * ctx, struct llama_sampler * chain) {
    ctx->t_start_us = ggml_time_us();
    ctx->t_eval_us = ctx->n_eval = 0;
    ctx->t_p_eval_us = ctx->n_p_eval = 0;

    if (smpl) {
        smpl->t_sample_us = smpl->n_sample = 0;
    if (chain) {
        auto * smpl = (struct llama_sampler_chain *) chain->ctx;

        smpl->t_sample_us = smpl->n_sample = 0;
    }
}
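Since the timing entry points now read the llama_sampler_chain behind chain->ctx, a caller sketch looks like:

// chain must have been created with llama_sampler_chain_init (or be NULL,
// in which case only the context timings are reported/reset)
llama_print_timings(ctx, chain);
llama_reset_timings(ctx, chain);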
@@ -21,8 +21,8 @@ static void dump(const llama_token_data_array * cur_p) {

#define APPLY(__cnstr, __cur_p) do { \
    auto * cnstr = (__cnstr); \
    llama_constraint_apply(cnstr, (__cur_p)); \
    llama_constraint_free(cnstr); \
    llama_sampler_apply(cnstr, (__cur_p)); \
    llama_sampler_free(cnstr); \
} while(0)
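The macro keeps each test terse: it constructs a throwaway sampler, applies it to the candidate array, and frees it in one statement, e.g.:

// apply softmax then keep the top 3 candidates
APPLY(llama_sampler_init_softmax(), &cur_p);
APPLY(llama_sampler_init_top_k(3), &cur_p);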

static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
@@ -35,10 +35,10 @@ static void test_top_k(const std::vector<float> & probs, const std::vector<float
        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
    }

    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
    APPLY(llama_constraint_init_softmax(), &cur_p);
    llama_token_data_array cur_p = { cur.data(), cur.size(), LLAMA_TOKEN_NULL, false };
    APPLY(llama_sampler_init_softmax(), &cur_p);
    DUMP(&cur_p);
    APPLY(llama_constraint_init_top_k(k), &cur_p);
    APPLY(llama_sampler_init_top_k(k), &cur_p);
    DUMP(&cur_p);

    GGML_ASSERT(cur_p.size == expected_probs.size());
@@ -57,10 +57,10 @@ static void test_top_p(const std::vector<float> & probs, const std::vector<float
        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
    }

    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
    APPLY(llama_constraint_init_softmax(), &cur_p);
    llama_token_data_array cur_p = { cur.data(), cur.size(), LLAMA_TOKEN_NULL, false };
    APPLY(llama_sampler_init_softmax(), &cur_p);
    DUMP(&cur_p);
    APPLY(llama_constraint_init_top_p(p, 1), &cur_p);
    APPLY(llama_sampler_init_top_p(p, 1), &cur_p);
    DUMP(&cur_p);

    GGML_ASSERT(cur_p.size == expected_probs.size());
@@ -79,9 +79,9 @@ static void test_tfs(const std::vector<float> & probs, const std::vector<float>
        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
    }

    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
    llama_token_data_array cur_p = { cur.data(), cur.size(), LLAMA_TOKEN_NULL, false };
    DUMP(&cur_p);
    APPLY(llama_constraint_init_tail_free(z, 1), &cur_p);
    APPLY(llama_sampler_init_tail_free(z, 1), &cur_p);
    DUMP(&cur_p);

    GGML_ASSERT(cur_p.size == expected_probs.size());
@@ -100,11 +100,11 @@ static void test_min_p(const std::vector<float> & probs, const std::vector<float
        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
    }

    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
    llama_token_data_array cur_p = { cur.data(), cur.size(), LLAMA_TOKEN_NULL, false };
    DUMP(&cur_p);
    APPLY(llama_constraint_init_min_p(p, 1), &cur_p);
    APPLY(llama_sampler_init_min_p(p, 1), &cur_p);
    DUMP(&cur_p);
    APPLY(llama_constraint_init_softmax(), &cur_p);
    APPLY(llama_sampler_init_softmax(), &cur_p);

    GGML_ASSERT(cur_p.size == expected_probs.size());
    for (size_t i = 0; i < cur_p.size; i++) {
@@ -122,9 +122,9 @@ static void test_typical(const std::vector<float> & probs, const std::vector<flo
        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
    }

    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
    llama_token_data_array cur_p = { cur.data(), cur.size(), LLAMA_TOKEN_NULL, false };
    DUMP(&cur_p);
    APPLY(llama_constraint_init_typical(p, 1), &cur_p);
    APPLY(llama_sampler_init_typical(p, 1), &cur_p);
    DUMP(&cur_p);

    GGML_ASSERT(cur_p.size == expected_probs.size());
@@ -153,11 +153,11 @@ static void test_penalties(
        token_count[last_tokens[i]]++;
    }

    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
    APPLY(llama_constraint_init_softmax(), &cur_p);
    llama_token_data_array cur_p = { cur.data(), cur.size(), LLAMA_TOKEN_NULL, false };
    APPLY(llama_sampler_init_softmax(), &cur_p);
    DUMP(&cur_p);
    llama_constraint_penalties_impl(&cur_p, token_count, repeat_penalty, alpha_frequency, alpha_presence); // TODO: avoid
    APPLY(llama_constraint_init_softmax(), &cur_p);
    llama_sampler_penalties_impl(&cur_p, token_count, repeat_penalty, alpha_frequency, alpha_presence); // TODO: avoid
    APPLY(llama_sampler_init_softmax(), &cur_p);
    DUMP(&cur_p);

    GGML_ASSERT(cur_p.size == expected_probs.size());
@@ -175,23 +175,23 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
    }

    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
    llama_token_data_array cur_p = { cur.data(), cur.size(), LLAMA_TOKEN_NULL, false };

    llama_token min_token_id = 0;
    const llama_token max_token_id = n_vocab-1;

    for (auto s : samplers_sequence) {
        switch (s){
            case 'k': APPLY(llama_constraint_init_top_k(top_k), &cur_p); break;
            case 'k': APPLY(llama_sampler_init_top_k(top_k), &cur_p); break;
            case 'f': GGML_ABORT("tail_free test not implemented");
            case 'y': GGML_ABORT("typical test not implemented");
            case 'p': APPLY(llama_constraint_init_top_p(top_p, 1), &cur_p); break;
            case 'm': APPLY(llama_constraint_init_min_p(min_p, 1), &cur_p); break;
            case 'p': APPLY(llama_sampler_init_top_p(top_p, 1), &cur_p); break;
            case 'm': APPLY(llama_sampler_init_min_p(min_p, 1), &cur_p); break;
            case 't': GGML_ABORT("temperature test not implemented");
            default : GGML_ABORT("Unknown sampler");
        }

        APPLY(llama_constraint_init_softmax(), &cur_p); // make sure tokens are sorted for tests
        APPLY(llama_sampler_init_softmax(), &cur_p); // make sure tokens are sorted for tests

        const int size = cur_p.size;