constraint : clean-up and simplify

parent ca5d21c17a
commit c024fe45b0

9 changed files with 357 additions and 357 deletions
@@ -232,18 +232,18 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
     return gpt_sampler_sample(smpl, cur_p, params.temp, params.mirostat, params.n_probs);
 }

-void gpt_sampler_apply_grammar(struct gpt_sampler * gsmpl, llama_token_data_array * candidates) {
-    GGML_ASSERT(candidates != nullptr);
+void gpt_sampler_apply_grammar(struct gpt_sampler * gsmpl, llama_token_data_array * cur_p) {
+    GGML_ASSERT(cur_p != nullptr);

-    llama_constraint_apply(gsmpl->grmr, candidates);
+    llama_constraint_apply(gsmpl->grmr, cur_p);
 }

-llama_token gpt_sampler_sample_dist(struct gpt_sampler * gsmpl, llama_token_data_array * candidates) {
-    return llama_sampler_sample_dist(gsmpl->smpl, candidates);
+llama_token gpt_sampler_sample_dist(struct gpt_sampler * gsmpl, llama_token_data_array * cur_p) {
+    return llama_sampler_sample_dist(gsmpl->smpl, cur_p);
 }

-llama_token gpt_sampler_sample_greedy(struct gpt_sampler * gsmpl, llama_token_data_array * candidates, bool probs) {
-    return llama_sampler_sample_greedy(gsmpl->smpl, candidates, probs);
+llama_token gpt_sampler_sample_greedy(struct gpt_sampler * gsmpl, llama_token_data_array * cur_p, bool probs) {
+    return llama_sampler_sample_greedy(gsmpl->smpl, cur_p, probs);
 }

 std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
@@ -93,10 +93,10 @@ void gpt_print_timings(struct llama_context * ctx, struct gpt_sampler * gsmpl);
 //
 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx);

-void gpt_sampler_apply_grammar(struct gpt_sampler * gsmpl, llama_token_data_array * candidates);
+void gpt_sampler_apply_grammar(struct gpt_sampler * gsmpl, llama_token_data_array * cur_p);

-llama_token gpt_sampler_sample_dist (struct gpt_sampler * gsmpl, llama_token_data_array * candidates);
-llama_token gpt_sampler_sample_greedy(struct gpt_sampler * gsmpl, llama_token_data_array * candidates, bool probs);
+llama_token gpt_sampler_sample_dist (struct gpt_sampler * gsmpl, llama_token_data_array * cur_p);
+llama_token gpt_sampler_sample_greedy(struct gpt_sampler * gsmpl, llama_token_data_array * cur_p, bool probs);

 // helpers
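For orientation, a minimal call-site sketch using the declarations in the hunk above; the wrapper function and the way cur_p is produced are assumptions for illustration, not part of this commit:

```cpp
// Hypothetical call site; assumes only the gpt_sampler_* declarations shown above.
// cur_p is a llama_token_data_array prepared elsewhere (one entry per candidate token).
static llama_token pick_token(struct gpt_sampler * gsmpl, llama_token_data_array * cur_p) {
    // mask out candidates that the attached grammar rejects
    gpt_sampler_apply_grammar(gsmpl, cur_p);

    // then take the best surviving candidate (probs controls whether probabilities are also computed)
    return gpt_sampler_sample_greedy(gsmpl, cur_p, /*probs =*/ false);
}
```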
@@ -1027,11 +1027,11 @@ extern "C" {
     struct llama_constraint_i {
         // TODO: add name API

         void (*accept)( struct llama_constraint * cnstr, llama_token token); // can be NULL
-        void (*apply) ( struct llama_constraint * cnstr, llama_token_data_array * candidates); // required
+        void (*apply) ( struct llama_constraint * cnstr, llama_token_data_array * cur_p); // required
         void (*reset) ( struct llama_constraint * cnstr); // can be NULL
         struct llama_constraint * (*copy) (const struct llama_constraint * cnstr); // can be NULL if ctx is NULL
-        void (*free) ( struct llama_constraint * cnstr); // can be NULL
+        void (*free) ( struct llama_constraint * cnstr); // can be NULL if ctx is NULL

         // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
         //void (*apply_ggml) (struct llama_constraint * cnstr, ...);
@@ -1044,6 +1044,7 @@ extern "C" {
         llama_constraint_context_t ctx;
     };

+    LLAMA_API struct llama_constraint * llama_constraint_init_softmax ();
     LLAMA_API struct llama_constraint * llama_constraint_init_top_k (int32_t k, int32_t min_keep);
     LLAMA_API struct llama_constraint * llama_constraint_init_top_p (float p, int32_t min_keep);
     LLAMA_API struct llama_constraint * llama_constraint_init_min_p (float p, int32_t min_keep);
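As a sketch of what the callback table above allows, here is a hypothetical user-defined constraint written against llama_constraint_i; the struct name, the ctx type, and the rescaling behaviour are invented for illustration and are not part of this commit:

```cpp
// Hypothetical custom constraint built on the llama_constraint_i interface above.
// Assumes the llama_constraint { iface, ctx } layout declared in this header.
struct my_scale_ctx {
    float scale;
};

static struct llama_constraint_i my_scale_i = {
    /* .accept = */ nullptr,   // no per-token state to track
    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
        const auto * ctx = (const my_scale_ctx *) cnstr->ctx;
        for (size_t i = 0; i < cur_p->size; ++i) {
            cur_p->data[i].logit *= ctx->scale;   // simple logit rescale
        }
    },
    /* .reset  = */ nullptr,
    /* .copy   = */ nullptr,
    /* .free   = */ [](struct llama_constraint * cnstr) {
        // after this commit the callback releases only its ctx;
        // llama_constraint_free_impl() deletes the constraint object itself
        delete (my_scale_ctx *) cnstr->ctx;
    },
};
```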
@@ -1077,7 +1078,7 @@ extern "C" {
     LLAMA_API void llama_constraint_free(struct llama_constraint * cnstr);

     LLAMA_API void llama_constraint_accept(struct llama_constraint * cnstr, llama_token token);
-    LLAMA_API void llama_constraint_apply (struct llama_constraint * cnstr, llama_token_data_array * candidates);
+    LLAMA_API void llama_constraint_apply (struct llama_constraint * cnstr, llama_token_data_array * cur_p);
     LLAMA_API void llama_constraint_reset (struct llama_constraint * cnstr);

     // samplers
@@ -1095,11 +1096,11 @@ extern "C" {
     LLAMA_API void llama_sampler_add_constraint(struct llama_sampler * smpl, struct llama_constraint * cnstr);

     LLAMA_API void llama_sampler_accept(struct llama_sampler * smpl, llama_token token);
-    LLAMA_API void llama_sampler_apply (struct llama_sampler * smpl, llama_token_data_array * candidates);
+    LLAMA_API void llama_sampler_apply (struct llama_sampler * smpl, llama_token_data_array * cur_p);

-    LLAMA_API llama_token llama_sampler_sample_dist (struct llama_sampler * smpl, llama_token_data_array * candidates);
-    LLAMA_API llama_token llama_sampler_sample_greedy (struct llama_sampler * smpl, llama_token_data_array * candidates, bool probs);
-    LLAMA_API llama_token llama_sampler_sample_mirostat(struct llama_sampler * smpl, llama_token_data_array * candidates);
+    LLAMA_API llama_token llama_sampler_sample_dist (struct llama_sampler * smpl, llama_token_data_array * cur_p);
+    LLAMA_API llama_token llama_sampler_sample_greedy (struct llama_sampler * smpl, llama_token_data_array * cur_p, bool probs);
+    LLAMA_API llama_token llama_sampler_sample_mirostat(struct llama_sampler * smpl, llama_token_data_array * cur_p);

     /// @details Get the number of accepted tokens so far (max of n_prev)
     LLAMA_API int llama_sampler_n_prev(const struct llama_sampler * smpl);
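A hedged end-to-end sketch of the sampler API declared above; the constraint parameters (40, 0.9f, 1) and the helper name are illustrative only:

```cpp
// Hypothetical usage of the llama_sampler / llama_constraint API declared above.
// smpl is assumed to come from the library's sampler init; cur_p holds the candidates.
static llama_token sample_with_constraints(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    // one-time setup (normally done once, not per token):
    // llama_sampler_add_constraint(smpl, llama_constraint_init_top_k(40, 1));
    // llama_sampler_add_constraint(smpl, llama_constraint_init_top_p(0.9f, 1));

    llama_sampler_apply(smpl, cur_p);                               // run every attached constraint on cur_p

    const llama_token id = llama_sampler_sample_dist(smpl, cur_p);  // sample from what is left

    llama_sampler_accept(smpl, id);                                 // record the accepted token (n_prev, penalties, grammar)
    return id;
}
```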
@@ -1069,7 +1069,7 @@ struct llama_grammar * llama_grammar_cp_impl(const struct llama_grammar & gramma
     return result;
 }

-void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * candidates) {
+void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
     GGML_ASSERT(grammar.vocab != nullptr);

     bool allow_eog = false;
@@ -1081,21 +1081,21 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
     }

     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
-    candidates_decoded.reserve(candidates->size);
+    candidates_decoded.reserve(cur_p->size);

     llama_grammar_candidates candidates_grammar;
-    candidates_grammar.reserve(candidates->size);
+    candidates_grammar.reserve(cur_p->size);

-    for (size_t i = 0; i < candidates->size; ++i) {
-        const llama_token id = candidates->data[i].id;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        const llama_token id = cur_p->data[i].id;
         const std::string & piece = grammar.vocab->cache_token_to_piece.at(id);

         if (llama_token_is_eog_impl(*grammar.vocab, id)) {
             if (!allow_eog) {
-                candidates->data[i].logit = -INFINITY;
+                cur_p->data[i].logit = -INFINITY;
             }
         } else if (piece.empty() || piece[0] == 0) {
-            candidates->data[i].logit = -INFINITY;
+            cur_p->data[i].logit = -INFINITY;
         } else {
             candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
@@ -1104,7 +1104,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_

     const auto rejects = llama_grammar_reject_candidates(grammar.rules, grammar.stacks, candidates_grammar);
     for (const auto & reject : rejects) {
-        candidates->data[reject.index].logit = -INFINITY;
+        cur_p->data[reject.index].logit = -INFINITY;
     }
 }

@@ -136,7 +136,7 @@ struct llama_grammar * llama_grammar_cp_impl(const struct llama_grammar & gramma
 // TODO: move the API below as member functions of llama_grammar
 void llama_grammar_apply_impl(
         const struct llama_grammar & grammar,
-        llama_token_data_array * candidates);
+        llama_token_data_array * cur_p);

 void llama_grammar_accept_impl(
         struct llama_grammar & grammar,
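llama_grammar_apply_impl above rejects a candidate by forcing its logit to -INFINITY, which becomes probability zero once a softmax runs; a tiny standalone check of that masking effect (not library code):

```cpp
// Standalone check that a -INFINITY logit drops out after softmax,
// which is how llama_grammar_apply_impl above removes rejected tokens.
#include <cmath>
#include <cstdio>

int main() {
    float logits[3] = { 1.0f, 0.5f, -INFINITY }; // third token "rejected"

    float max_l = logits[0];
    for (float l : logits) {
        if (l > max_l) max_l = l;
    }

    float p[3];
    float sum = 0.0f;
    for (int i = 0; i < 3; ++i) {
        p[i] = std::exp(logits[i] - max_l); // exp(-inf) == 0
        sum += p[i];
    }
    for (int i = 0; i < 3; ++i) {
        std::printf("p[%d] = %.3f\n", i, p[i] / sum); // p[2] prints 0.000
    }
    return 0;
}
```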
@@ -24,51 +24,51 @@ static void llama_log_softmax(float * array, size_t size) {
     }
 }

-void llama_constraint_softmax_impl(llama_token_data_array * candidates) {
-    GGML_ASSERT(candidates->size > 0);
+static void llama_constraint_softmax_impl(llama_token_data_array * cur_p) {
+    GGML_ASSERT(cur_p->size > 0);

     // Sort the logits in descending order
-    if (!candidates->sorted) {
-        std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+    if (!cur_p->sorted) {
+        std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
             return a.logit > b.logit;
         });
-        candidates->sorted = true;
+        cur_p->sorted = true;
     }

-    float max_l = candidates->data[0].logit;
+    float max_l = cur_p->data[0].logit;
     float cum_sum = 0.0f;

-    for (size_t i = 0; i < candidates->size; ++i) {
-        float p = expf(candidates->data[i].logit - max_l);
-        candidates->data[i].p = p;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        float p = expf(cur_p->data[i].logit - max_l);
+        cur_p->data[i].p = p;
         cum_sum += p;
     }

-    for (size_t i = 0; i < candidates->size; ++i) {
-        candidates->data[i].p /= cum_sum;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].p /= cum_sum;
     }
 }

-void llama_constraint_top_k_impl(llama_token_data_array * candidates, int32_t k, size_t min_keep) {
+static void llama_constraint_top_k_impl(llama_token_data_array * cur_p, int32_t k, size_t min_keep) {
     // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
-    // if (k >= (int32_t)candidates->size) {
+    // if (k >= (int32_t)cur_p->size) {
     //     return;
     // }

     if (k <= 0) {
-        k = candidates->size;
+        k = cur_p->size;
     }

     k = std::max(k, (int) min_keep);
-    k = std::min(k, (int) candidates->size);
+    k = std::min(k, (int) cur_p->size);

     // Sort scores in descending order
-    if (!candidates->sorted) {
+    if (!cur_p->sorted) {
         auto comp = [](const llama_token_data & a, const llama_token_data & b) {
             return a.logit > b.logit;
         };
         if (k <= 128) {
-            std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
+            std::partial_sort(cur_p->data, cur_p->data + k, cur_p->data + cur_p->size, comp);
         } else {
             constexpr int nbuckets = 128;
             constexpr float bucket_low = -10.0f;
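The softmax helper above shifts every logit by the maximum before exponentiating and then normalizes the sum to 1; a self-contained sketch of the same computation on a plain vector, independent of the llama_token_data types (not library code):

```cpp
// Standalone illustration of the normalization performed by
// llama_constraint_softmax_impl above; assumes a non-empty input,
// mirroring the GGML_ASSERT in the hunk.
#include <algorithm>
#include <cmath>
#include <vector>

std::vector<float> softmax(std::vector<float> logits) {
    const float max_l = *std::max_element(logits.begin(), logits.end());

    float cum_sum = 0.0f;
    for (float & v : logits) {
        v = std::exp(v - max_l); // subtract the max for numerical stability
        cum_sum += v;
    }
    for (float & v : logits) {
        v /= cum_sum;            // probabilities now sum to 1
    }
    return logits;
}
```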
@@ -76,11 +76,11 @@ void llama_constraint_top_k_impl(llama_token_data_array * candidates, int32_t k,
             constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
             constexpr float bucket_inter = -bucket_low * bucket_scale;

-            std::vector<int> bucket_idx(candidates->size);
+            std::vector<int> bucket_idx(cur_p->size);
             std::vector<int> histo(nbuckets, 0);

-            for (int i = 0; i < (int)candidates->size; ++i) {
-                const float val = candidates->data[i].logit;
+            for (int i = 0; i < (int)cur_p->size; ++i) {
+                const float val = cur_p->data[i].logit;
                 int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
                 ib = std::max(0, std::min(nbuckets-1, ib));
                 bucket_idx[i] = ib;
@@ -102,10 +102,10 @@ void llama_constraint_top_k_impl(llama_token_data_array * candidates, int32_t k,
                 bucket_ptrs.push_back(ptr);
                 ptr += histo[j];
             }
-            for (int i = 0; i < (int)candidates->size; ++i) {
+            for (int i = 0; i < (int)cur_p->size; ++i) {
                 int j = bucket_idx[i];
                 if (j >= ib) {
-                    *bucket_ptrs[nbuckets-1-j]++ = candidates->data[i];
+                    *bucket_ptrs[nbuckets-1-j]++ = cur_p->data[i];
                 }
             }

@@ -118,27 +118,27 @@ void llama_constraint_top_k_impl(llama_token_data_array * candidates, int32_t k,
             }
             std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);

-            std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data));
+            std::memcpy(cur_p->data, tmp_tokens.data(), k*sizeof(llama_token_data));

         }
-        candidates->sorted = true;
+        cur_p->sorted = true;
     }
-    candidates->size = k;
+    cur_p->size = k;
 }

-void llama_constraint_top_p_impl(llama_token_data_array * candidates, float p, size_t min_keep) {
+static void llama_constraint_top_p_impl(llama_token_data_array * cur_p, float p, size_t min_keep) {
     if (p >= 1.0f) {
         return;
     }

-    llama_constraint_softmax_impl(candidates);
+    llama_constraint_softmax_impl(cur_p);

     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
-    size_t last_idx = candidates->size;
+    size_t last_idx = cur_p->size;

-    for (size_t i = 0; i < candidates->size; ++i) {
-        cum_sum += candidates->data[i].p;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cum_sum += cur_p->data[i].p;

         // Check if the running sum is at least p or if we have kept at least min_keep tokens
         // we set the last index to i+1 to indicate that the current iterate should be included in the set
@@ -149,77 +149,77 @@ void llama_constraint_top_p_impl(llama_token_data_array * candidates, float p, s
     }

     // Resize the output vector to keep only the top-p tokens
-    candidates->size = last_idx;
+    cur_p->size = last_idx;
 }

-void llama_constraint_min_p_impl(llama_token_data_array * candidates, float p, size_t min_keep) {
-    if (p <= 0.0f || !candidates->size) {
+static void llama_constraint_min_p_impl(llama_token_data_array * cur_p, float p, size_t min_keep) {
+    if (p <= 0.0f || !cur_p->size) {
         return;
     }

     bool min_p_applied = false;

-    // if the candidates aren't sorted, try the unsorted implementation first
-    if (!candidates->sorted) {
+    // if the cur_p aren't sorted, try the unsorted implementation first
+    if (!cur_p->sorted) {
         std::vector<llama_token_data> filtered_tokens;

         float max_logit = -FLT_MAX;
-        for (size_t i = 0; i < candidates->size; ++i) {
-            max_logit = std::max(max_logit, candidates->data[i].logit);
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            max_logit = std::max(max_logit, cur_p->data[i].logit);
         }
         const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max

-        for (size_t i = 0; i < candidates->size; ++i) {
-            if (candidates->data[i].logit >= min_logit) {
-                filtered_tokens.push_back(candidates->data[i]);
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            if (cur_p->data[i].logit >= min_logit) {
+                filtered_tokens.push_back(cur_p->data[i]);
             }
         }

         // if we have enough values the operation was a success
         if (filtered_tokens.size() >= min_keep) {
-            memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
-            candidates->size = filtered_tokens.size();
+            memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
+            cur_p->size = filtered_tokens.size();
             min_p_applied = true;
         }
     }

-    // if the candidates are sorted or the unsorted implementation failed, use this implementation
+    // if the cur_p are sorted or the unsorted implementation failed, use this implementation
     if (!min_p_applied) {
         // Sort the logits in descending order
-        if (!candidates->sorted) {
-            std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+        if (!cur_p->sorted) {
+            std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
                 return a.logit > b.logit;
             });
-            candidates->sorted = true;
+            cur_p->sorted = true;
         }

-        const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
+        const float min_logit = cur_p->data[0].logit + logf(p); // min logit for p_i >= p * p_max
         size_t i = 1; // first token always matches

-        for (; i < candidates->size; ++i) {
-            if (candidates->data[i].logit < min_logit && i >= min_keep) {
+        for (; i < cur_p->size; ++i) {
+            if (cur_p->data[i].logit < min_logit && i >= min_keep) {
                 break; // prob too small
             }
         }

         // Resize the output vector to keep only the matching tokens
-        candidates->size = i;
+        cur_p->size = i;
     }
 }

-void llama_constraint_tail_free_impl(llama_token_data_array * candidates, float z, size_t min_keep) {
-    if (z >= 1.0f || candidates->size <= 2) {
+static void llama_constraint_tail_free_impl(llama_token_data_array * cur_p, float z, size_t min_keep) {
+    if (z >= 1.0f || cur_p->size <= 2) {
         return;
     }

-    llama_constraint_softmax_impl(candidates);
+    llama_constraint_softmax_impl(cur_p);

     // Compute the first and second derivatives
-    std::vector<float> first_derivatives(candidates->size - 1);
-    std::vector<float> second_derivatives(candidates->size - 2);
+    std::vector<float> first_derivatives(cur_p->size - 1);
+    std::vector<float> second_derivatives(cur_p->size - 2);

     for (size_t i = 0; i < first_derivatives.size(); ++i) {
-        first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
+        first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p;
     }
     for (size_t i = 0; i < second_derivatives.size(); ++i) {
         second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
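Both branches of the min-p filter above use the same cutoff: keep a token when its logit is at least max_logit + logf(p), which is equivalent to p_i >= p * p_max. A small standalone demonstration with made-up numbers (the min_keep handling is omitted here):

```cpp
// Standalone illustration of the min-p cutoff used above
// (logit >= max_logit + logf(p)  <=>  p_i >= p * p_max); not library code.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<float> logits = { 2.0f, 1.0f, -1.0f, -3.0f }; // illustrative values
    const float p = 0.1f;                                           // keep p_i >= 0.1 * p_max

    const float max_logit = *std::max_element(logits.begin(), logits.end());
    const float min_logit = max_logit + std::log(p); // same threshold as logf(p) above, ~= -0.30 here

    for (float l : logits) {
        std::printf("logit %+.1f -> %s\n", l, l >= min_logit ? "keep" : "drop");
    }
    return 0;
}
```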
@@ -246,7 +246,7 @@ void llama_constraint_tail_free_impl(llama_token_data_array * candidates, float
     }

     float cum_sum = 0.0f;
-    size_t last_idx = candidates->size;
+    size_t last_idx = cur_p->size;
     for (size_t i = 0; i < second_derivatives.size(); ++i) {
         cum_sum += second_derivatives[i];

@@ -258,10 +258,10 @@ void llama_constraint_tail_free_impl(llama_token_data_array * candidates, float
     }

     // Resize the output vector to keep only the tokens above the tail location
-    candidates->size = last_idx;
+    cur_p->size = last_idx;
 }

-void llama_constraint_typical_impl(llama_token_data_array * candidates, float p, size_t min_keep) {
+static void llama_constraint_typical_impl(llama_token_data_array * cur_p, float p, size_t min_keep) {
     // Reference implementation:
     // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
     if (p >= 1.0f) {
@@ -269,22 +269,22 @@ void llama_constraint_typical_impl(llama_token_data_array * candidates, float p,
     }

     // Compute the softmax of logits and calculate entropy
-    llama_constraint_softmax_impl(candidates);
+    llama_constraint_softmax_impl(cur_p);

     float entropy = 0.0f;
-    for (size_t i = 0; i < candidates->size; ++i) {
-        entropy += -candidates->data[i].p * logf(candidates->data[i].p);
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        entropy += -cur_p->data[i].p * logf(cur_p->data[i].p);
     }

     // Compute the absolute difference between negative log probability and entropy for each candidate
     std::vector<float> shifted_scores;
-    for (size_t i = 0; i < candidates->size; ++i) {
-        float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        float shifted_score = fabsf(-logf(cur_p->data[i].p) - entropy);
         shifted_scores.push_back(shifted_score);
     }

     // Sort tokens based on the shifted_scores and their corresponding indices
-    std::vector<size_t> indices(candidates->size);
+    std::vector<size_t> indices(cur_p->size);
     std::iota(indices.begin(), indices.end(), 0);

     std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
@@ -297,7 +297,7 @@ void llama_constraint_typical_impl(llama_token_data_array * candidates, float p,

     for (size_t i = 0; i < indices.size(); ++i) {
         size_t idx = indices[i];
-        cum_sum += candidates->data[idx].p;
+        cum_sum += cur_p->data[idx].p;

         // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
         if (cum_sum > p && i >= min_keep - 1) {
@@ -307,39 +307,39 @@ void llama_constraint_typical_impl(llama_token_data_array * candidates, float p,
     }

     // Resize the output vector to keep only the locally typical tokens
-    std::vector<llama_token_data> new_candidates;
+    std::vector<llama_token_data> cur_p_new;
     for (size_t i = 0; i < last_idx; ++i) {
         size_t idx = indices[i];
-        new_candidates.push_back(candidates->data[idx]);
+        cur_p_new.push_back(cur_p->data[idx]);
     }

-    // Replace the data in candidates with the new_candidates data
-    std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
-    candidates->size = new_candidates.size();
-    candidates->sorted = false;
+    // Replace the data in cur_p with the cur_p_new data
+    std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data);
+    cur_p->size = cur_p_new.size();
+    cur_p->sorted = false;
 }

-void llama_constraint_entropy_impl(llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val) {
+static void llama_constraint_entropy_impl(llama_token_data_array * cur_p, float min_temp, float max_temp, float exponent_val) {
     // no need to do anything if there is only one (or zero) candidates
-    if(candidates->size <= 1) {
+    if (cur_p->size <= 1) {
         return;
     }

     // Calculate maximum possible entropy
-    float max_entropy = -logf(1.0f / candidates->size);
+    float max_entropy = -logf(1.0f / cur_p->size);

-    llama_constraint_softmax_impl(candidates);
+    llama_constraint_softmax_impl(cur_p);

     // Calculate entropy of the softmax probabilities
     float entropy = 0.0f;
-    for (size_t i = 0; i < candidates->size; ++i) {
-        float prob = candidates->data[i].p;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        float prob = cur_p->data[i].p;
         if (prob > 0.0f) { // Ensure no log(0)
             entropy -= prob * logf(prob);
         }
     }

-    // Normalize the entropy (max_entropy cannot be 0 here because we checked candidates->size != 1 above)
+    // Normalize the entropy (max_entropy cannot be 0 here because we checked cur_p->size != 1 above)
     float normalized_entropy = entropy / max_entropy;

     // Map the normalized entropy to the desired temperature range using the power function
@@ -355,52 +355,52 @@ void llama_constraint_entropy_impl(llama_token_data_array * candidates, float mi
 #endif

     // Apply the dynamically calculated temperature scaling
-    for (size_t i = 0; i < candidates->size; ++i) {
-        candidates->data[i].logit /= dyn_temp;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].logit /= dyn_temp;
     }

     // Re-compute softmax probabilities after scaling logits with dynamic temperature
-    const double max_l_double = candidates->data[0].logit;
+    const double max_l_double = cur_p->data[0].logit;

     double cum_sum_double = 0.0;
-    for (size_t i = 0; i < candidates->size; ++i) {
-        double p = exp(candidates->data[i].logit - max_l_double);
-        candidates->data[i].p = p; // Store the scaled probability
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        double p = exp(cur_p->data[i].logit - max_l_double);
+        cur_p->data[i].p = p; // Store the scaled probability
         cum_sum_double += p;
     }

-    for (size_t i = 0; i < candidates->size; ++i) {
-        candidates->data[i].p /= cum_sum_double; // Re-normalize the probabilities
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
     }

 #ifdef DEBUG
     // Print the updated top 25 probabilities after temperature scaling
     LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
-    for (size_t i = 0; i < 25 && i < candidates->size; ++i) {
-        LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates->data[i].p * 100.0f);
+    for (size_t i = 0; i < 25 && i < cur_p->size; ++i) {
+        LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, cur_p->data[i].p * 100.0f);
     }
 #endif
 }

-void llama_constraint_temp_impl(llama_token_data_array * candidates, float temp) {
-    for (size_t i = 0; i < candidates->size; ++i) {
-        candidates->data[i].logit /= temp;
+static void llama_constraint_temp_impl(llama_token_data_array * cur_p, float temp) {
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].logit /= temp;
     }
 }

-void llama_constraint_grammar_impl(llama_token_data_array * candidates, const struct llama_grammar & grammar) {
-    llama_grammar_apply_impl(grammar, candidates);
+static void llama_constraint_grammar_impl(llama_token_data_array * cur_p, const struct llama_grammar & grammar) {
+    llama_grammar_apply_impl(grammar, cur_p);
 }

 void llama_constraint_penalties_impl(
-        llama_token_data_array * candidates,
+        llama_token_data_array * cur_p,
         const llama_token_cnt & token_count,
         float penalty_repeat,
         float penalty_freq,
         float penalty_present) {
-    // Apply frequency and presence penalties to the candidates
-    for (size_t i = 0; i < candidates->size; ++i) {
-        const auto token_iter = token_count.find(candidates->data[i].id);
+    // Apply frequency and presence penalties to the cur_p
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        const auto token_iter = token_count.find(cur_p->data[i].id);
         if (token_iter == token_count.end()) {
             continue;
         }
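The entropy-based scaling above normalizes the measured entropy by the maximum possible entropy -logf(1.0f / n) and then divides the logits by a dynamically chosen temperature. The line that maps normalized entropy to that temperature falls outside the hunks shown here, so the sketch below assumes the power mapping that the surrounding comments describe; it is an illustration, not the library code:

```cpp
// Standalone sketch of the dynamic-temperature idea used by
// llama_constraint_entropy_impl; the power mapping is an assumption,
// since that exact line is not part of the hunks shown in this commit.
#include <cmath>
#include <vector>

void apply_dynamic_temp(std::vector<float> & probs, std::vector<float> & logits,
                        float min_temp, float max_temp, float exponent_val) {
    if (probs.size() <= 1) {
        return; // mirrors the early return for <= 1 candidate above
    }

    // probs must already be softmax-normalized
    float entropy = 0.0f;
    for (float p : probs) {
        if (p > 0.0f) {
            entropy -= p * std::log(p);
        }
    }
    const float max_entropy        = -std::log(1.0f / probs.size());
    const float normalized_entropy = entropy / max_entropy; // in [0, 1]

    // assumed mapping: low entropy -> temperature near min_temp,
    // high entropy -> temperature near max_temp
    const float dyn_temp = min_temp + (max_temp - min_temp) * std::pow(normalized_entropy, exponent_val);

    for (float & l : logits) {
        l /= dyn_temp; // same scaling as cur_p->data[i].logit /= dyn_temp above
    }
}
```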
@@ -409,23 +409,42 @@ void llama_constraint_penalties_impl(

         // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
         // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
-        if (candidates->data[i].logit <= 0) {
-            candidates->data[i].logit *= penalty_repeat;
+        if (cur_p->data[i].logit <= 0) {
+            cur_p->data[i].logit *= penalty_repeat;
         } else {
-            candidates->data[i].logit /= penalty_repeat;
+            cur_p->data[i].logit /= penalty_repeat;
         }

-        candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
+        cur_p->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
     }

-    candidates->sorted = false;
+    cur_p->sorted = false;
 }

 //
-// sampling
+// constraints
 //

-// constraints
+// softmax

+static struct llama_constraint_i llama_constraint_softmax_i = {
+    /* .accept = */ nullptr,
+    /* .apply = */ [](struct llama_constraint * /*cnstr*/, llama_token_data_array * cur_p) {
+        llama_constraint_softmax_impl(cur_p);
+    },
+    /* .reset = */ nullptr,
+    /* .copy = */ nullptr,
+    /* .free = */ nullptr,
+};
+
+struct llama_constraint * llama_constraint_init_softmax_impl() {
+    struct llama_constraint * result = new llama_constraint {
+        /* .iface = */ &llama_constraint_softmax_i,
+        /* .ctx = */ nullptr,
+    };
+
+    return result;
+}
+
 // top-k

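To make the penalty arithmetic at the top of the hunk above concrete, a worked example with assumed values (penalty_repeat = 1.5, penalty_freq = 0.2, penalty_present = 0.5, and a token already seen count = 2 times):

```cpp
// Worked example of the repeat/frequency/presence penalties applied above;
// the numbers are illustrative, not defaults from this commit.
#include <cstdio>

int main() {
    float logit = 3.0f;                  // raw logit of a token that already occurred
    const int   count           = 2;     // occurrences in the recent window
    const float penalty_repeat  = 1.5f;
    const float penalty_freq    = 0.2f;
    const float penalty_present = 0.5f;

    // positive logits are divided, negative ones multiplied (see comment above)
    if (logit <= 0) {
        logit *= penalty_repeat;
    } else {
        logit /= penalty_repeat;         // 3.0 / 1.5 = 2.0
    }

    logit -= float(count) * penalty_freq + float(count > 0) * penalty_present; // 2.0 - 0.4 - 0.5 = 1.1

    std::printf("penalized logit = %.2f\n", logit); // prints 1.10
    return 0;
}
```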
@@ -436,9 +455,9 @@ struct llama_constraint_context_top_k {

 static struct llama_constraint_i llama_constraint_top_k_i = {
     /* .accept = */ nullptr,
-    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_top_k *) cnstr->ctx;
-        llama_constraint_top_k_impl(candidates, ctx->k, ctx->min_keep);
+        llama_constraint_top_k_impl(cur_p, ctx->k, ctx->min_keep);
     },
     /* .reset = */ nullptr,
     /* .copy = */ [](const struct llama_constraint * cnstr) {
@@ -446,10 +465,7 @@ static struct llama_constraint_i llama_constraint_top_k_i = {
         return llama_constraint_init_top_k_impl(ctx->k, ctx->min_keep);
     },
     /* .free = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            delete (llama_constraint_context_top_k *) cnstr->ctx;
-        }
-        delete cnstr;
+        delete (llama_constraint_context_top_k *) cnstr->ctx;
     }
 };

@@ -474,9 +490,9 @@ struct llama_constraint_context_top_p {

 static struct llama_constraint_i llama_constraint_top_p_i = {
     /* .accept = */ nullptr,
-    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_top_p *) cnstr->ctx;
-        llama_constraint_top_p_impl(candidates, ctx->p, ctx->min_keep);
+        llama_constraint_top_p_impl(cur_p, ctx->p, ctx->min_keep);
     },
     /* .reset = */ nullptr,
     /* .copy = */ [](const struct llama_constraint * cnstr) {
@@ -484,10 +500,7 @@ static struct llama_constraint_i llama_constraint_top_p_i = {
         return llama_constraint_init_top_p_impl(ctx->p, ctx->min_keep);
     },
     /* .free = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            delete (llama_constraint_context_top_p *) cnstr->ctx;
-        }
-        delete cnstr;
+        delete (llama_constraint_context_top_p *) cnstr->ctx;
     }
 };

@@ -512,9 +525,9 @@ struct llama_constraint_context_min_p {

 static struct llama_constraint_i llama_constraint_min_p_i = {
     /* .accept = */ nullptr,
-    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_min_p *) cnstr->ctx;
-        llama_constraint_min_p_impl(candidates, ctx->p, ctx->min_keep);
+        llama_constraint_min_p_impl(cur_p, ctx->p, ctx->min_keep);
     },
     /* .reset = */ nullptr,
     /* .copy = */ [](const struct llama_constraint * cnstr) {
@@ -522,10 +535,7 @@ static struct llama_constraint_i llama_constraint_min_p_i = {
         return llama_constraint_init_min_p_impl(ctx->p, ctx->min_keep);
     },
     /* .free = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            delete (llama_constraint_context_min_p *) cnstr->ctx;
-        }
-        delete cnstr;
+        delete (llama_constraint_context_min_p *) cnstr->ctx;
     }
 };

@@ -550,9 +560,9 @@ struct llama_constraint_context_tail_free {

 static struct llama_constraint_i llama_constraint_tail_free_i = {
     /* .accept = */ nullptr,
-    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_tail_free *) cnstr->ctx;
-        llama_constraint_tail_free_impl(candidates, ctx->z, ctx->min_keep);
+        llama_constraint_tail_free_impl(cur_p, ctx->z, ctx->min_keep);
     },
     /* .reset = */ nullptr,
     /* .copy = */ [](const struct llama_constraint * cnstr) {
@@ -560,10 +570,7 @@ static struct llama_constraint_i llama_constraint_tail_free_i = {
         return llama_constraint_init_tail_free_impl(ctx->z, ctx->min_keep);
     },
     /* .free = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            delete (llama_constraint_context_tail_free *) cnstr->ctx;
-        }
-        delete cnstr;
+        delete (llama_constraint_context_tail_free *) cnstr->ctx;
     }
 };

@@ -588,9 +595,9 @@ struct llama_constraint_context_typical {

 static struct llama_constraint_i llama_constraint_typical_i = {
     /* .accept = */ nullptr,
-    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_typical *) cnstr->ctx;
-        llama_constraint_typical_impl(candidates, ctx->p, ctx->min_keep);
+        llama_constraint_typical_impl(cur_p, ctx->p, ctx->min_keep);
     },
     /* .reset = */ nullptr,
     /* .copy = */ [](const struct llama_constraint * cnstr) {
@@ -598,10 +605,7 @@ static struct llama_constraint_i llama_constraint_typical_i = {
         return llama_constraint_init_typical_impl(ctx->p, ctx->min_keep);
     },
     /* .free = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            delete (llama_constraint_context_typical *) cnstr->ctx;
-        }
-        delete cnstr;
+        delete (llama_constraint_context_typical *) cnstr->ctx;
     }
 };

@@ -625,9 +629,9 @@ struct llama_constraint_context_temp {

 static struct llama_constraint_i llama_constraint_temp_i = {
     /* .accept = */ nullptr,
-    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_temp *) cnstr->ctx;
-        llama_constraint_temp_impl(candidates, ctx->temp);
+        llama_constraint_temp_impl(cur_p, ctx->temp);
     },
     /* .reset = */ nullptr,
     /* .copy = */ [](const struct llama_constraint * cnstr) {
@@ -635,10 +639,7 @@ static struct llama_constraint_i llama_constraint_temp_i = {
         return llama_constraint_init_temp_impl(ctx->temp);
     },
     /* .free = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            delete (llama_constraint_context_temp *) cnstr->ctx;
-        }
-        delete cnstr;
+        delete (llama_constraint_context_temp *) cnstr->ctx;
     }
 };

@@ -663,15 +664,15 @@ struct llama_constraint_context_temp_ext {

 static struct llama_constraint_i llama_constraint_temp_ext_i = {
     /* .accept = */ nullptr,
-    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_temp_ext *) cnstr->ctx;
         if (ctx->delta > 0) {
             const float temp_min = std::max(0.0f, ctx->temp - ctx->delta);
             const float temp_max = ctx->temp + ctx->delta;

-            llama_constraint_entropy_impl(candidates, temp_min, temp_max, ctx->exponent);
+            llama_constraint_entropy_impl(cur_p, temp_min, temp_max, ctx->exponent);
         } else {
-            llama_constraint_temp_impl(candidates, ctx->temp);
+            llama_constraint_temp_impl(cur_p, ctx->temp);
         }
     },
     /* .reset = */ nullptr,
@@ -680,10 +681,7 @@ static struct llama_constraint_i llama_constraint_temp_ext_i = {
         return llama_constraint_init_temp_ext_impl(ctx->temp, ctx->delta, ctx->exponent);
     },
     /* .free = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            delete (llama_constraint_context_temp_ext *) cnstr->ctx;
-        }
-        delete cnstr;
+        delete (llama_constraint_context_temp_ext *) cnstr->ctx;
     }
 };

@@ -716,10 +714,10 @@ static struct llama_constraint_i llama_constraint_grammar_i = {
             llama_grammar_accept_impl(*ctx->grammar, token);
         }
     },
-    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_grammar *) cnstr->ctx;
         if (ctx->grammar) {
-            llama_constraint_grammar_impl(candidates, *ctx->grammar);
+            llama_constraint_grammar_impl(cur_p, *ctx->grammar);
         }
     },
     /* .reset = */ [](struct llama_constraint * cnstr) {
@@ -749,15 +747,13 @@ static struct llama_constraint_i llama_constraint_grammar_i = {
         return result;
     },
     /* .free = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            {
-                auto * ctx = (llama_constraint_context_grammar *) cnstr->ctx;
-                llama_grammar_free_impl(ctx->grammar);
-            }
-
-            delete (llama_constraint_context_grammar *) cnstr->ctx;
+        auto * ctx = (llama_constraint_context_grammar *) cnstr->ctx;
+
+        if (ctx->grammar) {
+            llama_grammar_free_impl(ctx->grammar);
         }
-        delete cnstr;
+
+        delete ctx;
     }
 };

@@ -807,13 +803,13 @@ static struct llama_constraint_i llama_constraint_penalties_i = {
         auto * ctx = (llama_constraint_context_penalties *) cnstr->ctx;
         ctx->prev.push_back(token);
     },
-    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_penalties *) cnstr->ctx;

-        GGML_ASSERT(candidates->size == ctx->vocab->n_vocab && candidates->sorted == false && "the 'penalties' constraint must be applied on the full vocabulary");
+        GGML_ASSERT(cur_p->size == ctx->vocab->n_vocab && cur_p->sorted == false && "the 'penalties' constraint must be applied on the full vocabulary");

         if (ctx->ignore_eos) {
-            candidates->data[ctx->vocab->special_eos_id].logit = -INFINITY;
+            cur_p->data[ctx->vocab->special_eos_id].logit = -INFINITY;
         }

         if ((ctx->penalty_last_n == 0) ||
@@ -821,7 +817,7 @@ static struct llama_constraint_i llama_constraint_penalties_i = {
             return;
         }

-        const float nl_logit = !ctx->penalize_nl ? candidates->data[ctx->vocab->linefeed_id].logit : -INFINITY;
+        const float nl_logit = !ctx->penalize_nl ? cur_p->data[ctx->vocab->linefeed_id].logit : -INFINITY;

         // Create a frequency map to count occurrences of each token in last_tokens
         // TODO: optimize this by maintaining the token count in the constraint context
@@ -830,11 +826,11 @@ static struct llama_constraint_i llama_constraint_penalties_i = {
             token_count[ctx->prev.rat(i)]++;
         }

-        llama_constraint_penalties_impl(candidates, token_count, ctx->penalty_repeat, ctx->penalty_freq, ctx->penalty_present);
+        llama_constraint_penalties_impl(cur_p, token_count, ctx->penalty_repeat, ctx->penalty_freq, ctx->penalty_present);

         if (!ctx->penalize_nl) {
             // restore the logit of the newline token if it was penalized
-            candidates->data[ctx->vocab->linefeed_id].logit = nl_logit;
+            cur_p->data[ctx->vocab->linefeed_id].logit = nl_logit;
         }
     },
     /* .reset = */ [](struct llama_constraint * cnstr) {
@@ -858,10 +854,7 @@ static struct llama_constraint_i llama_constraint_penalties_i = {
         return result;
     },
     /* .free = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            delete (llama_constraint_context_penalties *) cnstr->ctx;
-        }
-        delete cnstr;
+        delete (llama_constraint_context_penalties *) cnstr->ctx;
     }
 };

@@ -896,13 +889,13 @@ struct llama_constraint_context_logit_bias {

 static struct llama_constraint_i llama_constraint_logit_bias_i = {
     /* .accept = */ nullptr,
-    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_logit_bias *) cnstr->ctx;

-        GGML_ASSERT(candidates->size == ctx->vocab->n_vocab && candidates->sorted == false && "the 'logit_bias' constraint must be applied on the full vocabulary");
+        GGML_ASSERT(cur_p->size == ctx->vocab->n_vocab && cur_p->sorted == false && "the 'logit_bias' constraint must be applied on the full vocabulary");

         for (const auto & lb : ctx->logit_bias) {
-            candidates->data[lb.token].logit += lb.bias;
+            cur_p->data[lb.token].logit += lb.bias;
         }
     },
     /* .reset = */ nullptr,
@@ -911,10 +904,7 @@ static struct llama_constraint_i llama_constraint_logit_bias_i = {
         return llama_constraint_init_logit_bias_impl(*ctx_src->vocab, ctx_src->logit_bias.size(), ctx_src->logit_bias.data());
     },
     /* .free = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            delete (llama_constraint_context_logit_bias *) cnstr->ctx;
-        }
-        delete cnstr;
+        delete (llama_constraint_context_logit_bias *) cnstr->ctx;
     }
 };

@@ -940,9 +930,15 @@ struct llama_constraint * llama_constraint_cp_impl(const struct llama_constraint
 }

 void llama_constraint_free_impl(struct llama_constraint * cnstr) {
-    if (cnstr->iface->free && cnstr) {
+    if (cnstr == nullptr) {
+        return;
+    }
+
+    if (cnstr->iface->free) {
         cnstr->iface->free(cnstr);
     }
+
+    delete cnstr;
 }

 void llama_constraint_accept_impl(struct llama_constraint & cnstr, llama_token token) {
@@ -951,9 +947,9 @@ void llama_constraint_accept_impl(struct llama_constraint & cnstr, llama_token t
     }
 }

-void llama_constraint_apply_impl(struct llama_constraint & cnstr, struct llama_token_data_array * candidates) {
+void llama_constraint_apply_impl(struct llama_constraint & cnstr, struct llama_token_data_array * cur_p) {
     GGML_ASSERT(cnstr.iface->apply);
-    cnstr.iface->apply(&cnstr, candidates);
+    cnstr.iface->apply(&cnstr, cur_p);
 }

 void llama_constraint_reset_impl(struct llama_constraint & cnstr) {
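The reorganized llama_constraint_free_impl above is why every per-constraint .free lambda in this commit stops calling delete cnstr: the callback now releases only its ctx, and the wrapper performs the single delete of the constraint object after a null check. A hypothetical standalone round-trip through the public API declared in the header hunks earlier (assuming llama_constraint_free forwards to llama_constraint_free_impl):

```cpp
// Hypothetical standalone use of the public constraint API from this commit;
// the parameter values are illustrative.
void constraint_roundtrip(llama_token_data_array * cur_p) {
    struct llama_constraint * cnstr = llama_constraint_init_top_k(40, 1);

    llama_constraint_apply(cnstr, cur_p); // shrink cur_p to the top-k candidates

    llama_constraint_free(cnstr);         // .free releases the top-k ctx, then the constraint object is deleted
}
```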
@ -962,7 +958,9 @@ void llama_constraint_reset_impl(struct llama_constraint & cnstr) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
// samplers
|
// samplers
|
||||||
|
//
|
||||||
|
|
||||||
struct llama_sampler * llama_sampler_init_impl(const struct llama_vocab & vocab, struct llama_sampler_params params) {
|
struct llama_sampler * llama_sampler_init_impl(const struct llama_vocab & vocab, struct llama_sampler_params params) {
|
||||||
auto * result = new llama_sampler {
|
auto * result = new llama_sampler {
|
||||||
|
@ -1050,9 +1048,9 @@ void llama_sampler_accept_impl(struct llama_sampler & smpl, llama_token token) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_sampler_apply_impl(struct llama_sampler & smpl, struct llama_token_data_array * candidates) {
|
void llama_sampler_apply_impl(struct llama_sampler & smpl, struct llama_token_data_array * cur_p) {
|
||||||
for (auto * cnstr : smpl.constraints) {
|
for (auto * cnstr : smpl.constraints) {
|
||||||
llama_constraint_apply_impl(*cnstr, candidates);
|
llama_constraint_apply_impl(*cnstr, cur_p);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
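llama_sampler_apply_impl just runs every attached constraint, in the order it was added, over the same token array. A rough sketch of the intended call pattern; the public llama_sampler_add_constraint wrapper is assumed here to forward to llama_sampler_add_constraint_impl declared later in this diff:

    // Sketch: constraints attached to a sampler run in insertion order.
    llama_sampler_add_constraint(smpl, llama_constraint_init_top_k(40, 1));
    llama_sampler_add_constraint(smpl, llama_constraint_init_top_p(0.95f, 1));

    // nullptr means "use the sampler's internal cur_p" (see llama_sampler_apply later in this diff)
    llama_sampler_apply(smpl, nullptr);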
@@ -1068,16 +1066,16 @@ int llama_sampler_n_prev_impl(const struct llama_sampler & smpl) {
     return smpl.prev.size();
 }

-llama_token llama_sampler_sample_mirostat_impl(struct llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, int32_t m, int32_t n_vocab, float & mu) {
-    llama_constraint_softmax_impl(candidates);
+llama_token llama_sampler_sample_mirostat_impl(struct llama_token_data_array * cur_p, std::mt19937 & rng, float tau, float eta, int32_t m, int32_t n_vocab, float & mu) {
+    llama_constraint_softmax_impl(cur_p);

     // Estimate s_hat using the most probable m tokens
     float s_hat = 0.0;
     float sum_ti_bi = 0.0;
     float sum_ti_sq = 0.0;
-    for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
+    for (size_t i = 0; i < size_t(m - 1) && i < cur_p->size - 1; ++i) {
         float t_i = logf(float(i + 2) / float(i + 1));
-        float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
+        float b_i = logf(cur_p->data[i].p / cur_p->data[i + 1].p);
         sum_ti_bi += t_i * b_i;
         sum_ti_sq += t_i * t_i;
     }
@@ -1088,14 +1086,14 @@ llama_token llama_sampler_sample_mirostat_impl(struct llama_token_data_array * c
     float k = powf((epsilon_hat * powf(2, mu)) / (1 - powf(n_vocab, -epsilon_hat)), 1 / s_hat);

     // Sample the next word X using top-k sampling
-    llama_constraint_top_k_impl(candidates, int(k), 1);
-    llama_token X = llama_sampler_sample_dist_impl(candidates, rng);
+    llama_constraint_top_k_impl(cur_p, int(k), 1);
+    llama_token X = llama_sampler_sample_dist_impl(cur_p, rng);

     // Compute error as the difference between observed surprise and target surprise value
-    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+    size_t X_idx = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
         return candidate.id == X;
     }));
-    float observed_surprise = -log2f(candidates->data[X_idx].p);
+    float observed_surprise = -log2f(cur_p->data[X_idx].p);
     float e = observed_surprise - tau;

     // Update mu using the learning rate and error
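For reference, the quantities computed above follow the Mirostat (v1) update from the linked paper. The lines that fall between these two hunks are assumed to be the least-squares estimate of s_hat and epsilon_hat = s_hat - 1, and the mu update just past the hunk boundary is assumed to be the standard one:

    t_i = \ln\frac{i+2}{i+1}, \qquad b_i = \ln\frac{p_i}{p_{i+1}}, \qquad \hat{s} = \frac{\sum_i t_i b_i}{\sum_i t_i^2}, \qquad \hat{\epsilon} = \hat{s} - 1

    k = \left( \frac{\hat{\epsilon}\, 2^{\mu}}{1 - N^{-\hat{\epsilon}}} \right)^{1/\hat{s}}, \qquad e = -\log_2 p(X) - \tau, \qquad \mu \leftarrow \mu - \eta\, e

where N is n_vocab, p(X) is the post-softmax probability of the sampled token, tau is mirostat_tau and eta is mirostat_eta.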
@@ -1104,30 +1102,30 @@ llama_token llama_sampler_sample_mirostat_impl(struct llama_token_data_array * c
     return X;
 }

-llama_token llama_sampler_sample_mirostat_v2_impl(struct llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, float & mu) {
-    llama_constraint_softmax_impl(candidates);
+llama_token llama_sampler_sample_mirostat_v2_impl(struct llama_token_data_array * cur_p, std::mt19937 & rng, float tau, float eta, float & mu) {
+    llama_constraint_softmax_impl(cur_p);

     // Truncate the words with surprise values greater than mu
-    candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+    cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
         return -log2f(candidate.p) > mu;
     }));

-    if (candidates->size == 0) {
-        candidates->size = 1;
+    if (cur_p->size == 0) {
+        cur_p->size = 1;
     }

     // Normalize the probabilities of the remaining words
-    llama_constraint_softmax_impl(candidates);
+    llama_constraint_softmax_impl(cur_p);

     // Sample the next word X from the remaining words
-    llama_token X = llama_sampler_sample_dist_impl(candidates, rng);
+    llama_token X = llama_sampler_sample_dist_impl(cur_p, rng);

     // Compute error as the difference between observed surprise and target surprise value
-    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+    size_t X_idx = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
         return candidate.id == X;
     }));

-    float observed_surprise = -log2f(candidates->data[X_idx].p);
+    float observed_surprise = -log2f(cur_p->data[X_idx].p);
     float e = observed_surprise - tau;

     // Update mu using the learning rate and error
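Mirostat v2 skips the s_hat/k estimation entirely: it keeps only the tokens whose surprise does not exceed mu, renormalizes, samples, and applies the same error feedback (the mu update again sits just past the hunk boundary and is assumed to match v1):

    \text{keep token } i \iff -\log_2 p_i \le \mu, \qquad e = -\log_2 p(X) - \tau, \qquad \mu \leftarrow \mu - \eta\, e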
@@ -1136,17 +1134,17 @@ llama_token llama_sampler_sample_mirostat_v2_impl(struct llama_token_data_array
     return X;
 }

-llama_token llama_sampler_sample_greedy_impl(llama_token_data_array * candidates, bool probs) {
+llama_token llama_sampler_sample_greedy_impl(llama_token_data_array * cur_p, bool probs) {
     if (probs) {
         // if probs are needed, we apply softmax to get the probabilities
-        llama_constraint_softmax_impl(candidates);
+        llama_constraint_softmax_impl(cur_p);

-        // the candidates are sorted, so we can just return the first one
-        return candidates->data[0].id;
+        // the cur_p are sorted, so we can just return the first one
+        return cur_p->data[0].id;
     }

     // return the token with the highest logit
-    auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+    auto * max_iter = std::max_element(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
         return a.logit < b.logit;
     });

@@ -1155,20 +1153,20 @@ llama_token llama_sampler_sample_greedy_impl(llama_token_data_array * candidates
     return result;
 }

-llama_token llama_sampler_sample_dist_impl(struct llama_token_data_array * candidates, std::mt19937 & rng) {
-    llama_constraint_softmax_impl(candidates);
+llama_token llama_sampler_sample_dist_impl(struct llama_token_data_array * cur_p, std::mt19937 & rng) {
+    llama_constraint_softmax_impl(cur_p);

     std::vector<float> probs;
-    probs.reserve(candidates->size);
+    probs.reserve(cur_p->size);

-    for (size_t i = 0; i < candidates->size; ++i) {
-        probs.push_back(candidates->data[i].p);
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        probs.push_back(cur_p->data[i].p);
     }

     std::discrete_distribution<> dist(probs.begin(), probs.end());

     const int idx = dist(rng);
-    llama_token result = candidates->data[idx].id;
+    llama_token result = cur_p->data[idx].id;

     return result;
 }

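llama_sampler_sample_dist_impl is a thin wrapper over std::discrete_distribution, which returns index i with probability probs[i] divided by the sum of probs. A self-contained sketch of the same mechanism outside llama.cpp:

    #include <cstdio>
    #include <random>
    #include <vector>

    int main() {
        const std::vector<float> probs = {0.1f, 0.2f, 0.7f};

        std::mt19937 rng(1234);
        std::discrete_distribution<> dist(probs.begin(), probs.end());

        int counts[3] = {0, 0, 0};
        for (int i = 0; i < 10000; ++i) {
            counts[dist(rng)]++; // index drawn proportionally to probs[i]
        }

        // expected roughly 1000 / 2000 / 7000
        printf("%d %d %d\n", counts[0], counts[1], counts[2]);
        return 0;
    }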
@@ -10,19 +10,9 @@ struct llama_grammar;

 using llama_token_cnt = std::unordered_map<llama_token, int>;

-// TODO: tmp exposed, until tests start using llama_constraint
-void llama_constraint_softmax_impl  (struct llama_token_data_array * candidates);
-void llama_constraint_top_k_impl    (struct llama_token_data_array * candidates, int32_t k, size_t min_keep);
-void llama_constraint_top_p_impl    (struct llama_token_data_array * candidates, float p, size_t min_keep);
-void llama_constraint_min_p_impl    (struct llama_token_data_array * candidates, float p, size_t min_keep);
-void llama_constraint_tail_free_impl(struct llama_token_data_array * candidates, float z, size_t min_keep);
-void llama_constraint_typical_impl  (struct llama_token_data_array * candidates, float p, size_t min_keep);
-void llama_constraint_entropy_impl  (struct llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val);
-void llama_constraint_temp_impl     (struct llama_token_data_array * candidates, float temp);
-void llama_constraint_grammar_impl  (struct llama_token_data_array * candidates, const struct llama_grammar & grammar);
+// TODO: tmp exposed until test-sampling is fixed

 void llama_constraint_penalties_impl(
-        llama_token_data_array * candidates,
+        llama_token_data_array * cur_p,
         const llama_token_cnt & token_count,
         float penalty_repeat,
         float penalty_freq,
@@ -30,6 +20,7 @@ void llama_constraint_penalties_impl(

 // constraints

+struct llama_constraint * llama_constraint_init_softmax_impl ();
 struct llama_constraint * llama_constraint_init_top_k_impl  (int32_t k, size_t min_keep);
 struct llama_constraint * llama_constraint_init_top_p_impl  (float p, size_t min_keep);
 struct llama_constraint * llama_constraint_init_min_p_impl  (float p, size_t min_keep);
@@ -62,7 +53,7 @@ struct llama_constraint * llama_constraint_cp_impl(const struct llama_constraint
 void llama_constraint_free_impl(struct llama_constraint * cnstr);

 void llama_constraint_accept_impl(struct llama_constraint & cnstr, llama_token token);
-void llama_constraint_apply_impl (struct llama_constraint & cnstr, struct llama_token_data_array * candidates);
+void llama_constraint_apply_impl (struct llama_constraint & cnstr, struct llama_token_data_array * cur_p);
 void llama_constraint_reset_impl (struct llama_constraint & cnstr);

 // samplers
@@ -101,7 +92,7 @@ void llama_sampler_reset_impl( struct llama_sampler & smp
 void llama_sampler_add_constraint_impl(struct llama_sampler & smpl, struct llama_constraint * cnstr);

 void llama_sampler_accept_impl(struct llama_sampler & smpl, llama_token token);
-void llama_sampler_apply_impl (struct llama_sampler & smpl, struct llama_token_data_array * candidates);
+void llama_sampler_apply_impl (struct llama_sampler & smpl, struct llama_token_data_array * cur_p);

 llama_token llama_sampler_prev_impl (const struct llama_sampler & smpl, int ith);
 int llama_sampler_n_prev_impl(const struct llama_sampler & smpl);
@@ -112,14 +103,14 @@ int llama_sampler_n_prev_impl(const struct llama_sampler & smpl);
 /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
 /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
 /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-llama_token llama_sampler_sample_mirostat_impl   (struct llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, int32_t m, int32_t n_vocab, float & mu);
+llama_token llama_sampler_sample_mirostat_impl   (struct llama_token_data_array * cur_p, std::mt19937 & rng, float tau, float eta, int32_t m, int32_t n_vocab, float & mu);

 /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
 /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
 /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
 /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
 /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-llama_token llama_sampler_sample_mirostat_v2_impl(struct llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, float & mu);
+llama_token llama_sampler_sample_mirostat_v2_impl(struct llama_token_data_array * cur_p, std::mt19937 & rng, float tau, float eta, float & mu);

-llama_token llama_sampler_sample_greedy_impl(struct llama_token_data_array * candidates, bool probs);
-llama_token llama_sampler_sample_dist_impl  (struct llama_token_data_array * candidates, std::mt19937 & rng);
+llama_token llama_sampler_sample_greedy_impl(struct llama_token_data_array * cur_p, bool probs);
+llama_token llama_sampler_sample_dist_impl  (struct llama_token_data_array * cur_p, std::mt19937 & rng);

@@ -20609,6 +20609,10 @@ int32_t llama_chat_apply_template(
 // sampling
 //

+struct llama_constraint * llama_constraint_init_softmax() {
+    return llama_constraint_init_softmax_impl();
+}
+
 struct llama_constraint * llama_constraint_init_top_k(int32_t k, int32_t min_keep) {
     return llama_constraint_init_top_k_impl(k, min_keep);
 }
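A rough sketch of how the new public constraint objects compose; llama_constraint_apply and llama_constraint_free appear elsewhere in this diff, and cur here stands for an already-populated std::vector<llama_token_data>:

    // Sketch: build a constraint, run it over a candidate array, release it.
    llama_token_data_array cur_p = { cur.data(), cur.size(), false };

    struct llama_constraint * top_k = llama_constraint_init_top_k(40, 1);
    llama_constraint_apply(top_k, &cur_p); // keeps only the 40 highest-logit candidates
    llama_constraint_free(top_k);

    struct llama_constraint * softmax = llama_constraint_init_softmax();
    llama_constraint_apply(softmax, &cur_p); // sorts the candidates and fills in the p fields
    llama_constraint_free(softmax);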
@@ -20675,8 +20679,8 @@ void llama_constraint_accept(struct llama_constraint * cnstr, llama_token token)
     llama_constraint_accept_impl(*cnstr, token);
 }

-void llama_constraint_apply(struct llama_constraint * cnstr, llama_token_data_array * candidates) {
-    llama_constraint_apply_impl(*cnstr, candidates);
+void llama_constraint_apply(struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
+    llama_constraint_apply_impl(*cnstr, cur_p);
 }

 void llama_constraint_reset(struct llama_constraint * cnstr) {
@@ -20727,21 +20731,21 @@ void llama_sampler_accept(struct llama_sampler * smpl, llama_token token) {
     llama_sampler_accept_impl(*smpl, token);
 }

-void llama_sampler_apply(struct llama_sampler * smpl, llama_token_data_array * candidates) {
+void llama_sampler_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     time_meas tm(smpl->t_sample_us);

-    if (candidates == nullptr) {
-        candidates = &smpl->cur_p;
+    if (cur_p == nullptr) {
+        cur_p = &smpl->cur_p;
     }

-    llama_sampler_apply_impl(*smpl, candidates);
+    llama_sampler_apply_impl(*smpl, cur_p);
 }

-llama_token llama_sampler_sample_mirostat(struct llama_sampler * smpl, llama_token_data_array * candidates) {
+llama_token llama_sampler_sample_mirostat(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     time_meas tm(smpl->t_sample_us);

-    if (candidates == nullptr) {
-        candidates = &smpl->cur_p;
+    if (cur_p == nullptr) {
+        cur_p = &smpl->cur_p;
     }

     const auto type = smpl->params.mirostat;
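These wrappers share one convention: passing a null token array means "operate on the sampler's own cur_p". A minimal sketch of a sampling step under that assumption (it presumes smpl->cur_p has already been filled from the current logits):

    // Sketch: nullptr selects the sampler's internal candidate array.
    llama_sampler_apply(smpl, nullptr);                         // run all attached constraints on smpl->cur_p
    llama_token tok = llama_sampler_sample_dist(smpl, nullptr); // sample from the same internal array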
@@ -20749,7 +20753,7 @@ llama_token llama_sampler_sample_mirostat(struct llama_sampler * smpl, llama_tok
     llama_token res;

     if (type == 1) {
-        res = llama_sampler_sample_mirostat_impl(candidates,
+        res = llama_sampler_sample_mirostat_impl(cur_p,
                 smpl->rng,
                 smpl->params.mirostat_tau,
                 smpl->params.mirostat_eta,
@@ -20757,7 +20761,7 @@ llama_token llama_sampler_sample_mirostat(struct llama_sampler * smpl, llama_tok
                 smpl->vocab->n_vocab,
                 smpl->mirostat_mu);
     } else if (type == 2) {
-        res = llama_sampler_sample_mirostat_v2_impl(candidates,
+        res = llama_sampler_sample_mirostat_v2_impl(cur_p,
                 smpl->rng,
                 smpl->params.mirostat_tau,
                 smpl->params.mirostat_eta,
@@ -20771,28 +20775,28 @@ llama_token llama_sampler_sample_mirostat(struct llama_sampler * smpl, llama_tok
     return res;
 }

-llama_token llama_sampler_sample_greedy(struct llama_sampler * smpl, llama_token_data_array * candidates, bool probs) {
+llama_token llama_sampler_sample_greedy(struct llama_sampler * smpl, llama_token_data_array * cur_p, bool probs) {
     time_meas tm(smpl->t_sample_us);

-    if (candidates == nullptr) {
-        candidates = &smpl->cur_p;
+    if (cur_p == nullptr) {
+        cur_p = &smpl->cur_p;
     }

-    auto res = llama_sampler_sample_greedy_impl(candidates, probs);
+    auto res = llama_sampler_sample_greedy_impl(cur_p, probs);

     smpl->n_sample++;

     return res;
 }

-llama_token llama_sampler_sample_dist(struct llama_sampler * smpl, llama_token_data_array * candidates) {
+llama_token llama_sampler_sample_dist(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     time_meas tm(smpl->t_sample_us);

-    if (candidates == nullptr) {
-        candidates = &smpl->cur_p;
+    if (cur_p == nullptr) {
+        cur_p = &smpl->cur_p;
     }

-    auto res = llama_sampler_sample_dist_impl(candidates, smpl->rng);
+    auto res = llama_sampler_sample_dist_impl(cur_p, smpl->rng);

     smpl->n_sample++;

@@ -11,119 +11,125 @@
 #include <string>
 #include <vector>

-static void dump(const llama_token_data_array * candidates) {
-    for (size_t i = 0; i < candidates->size; i++) {
-        printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);
+static void dump(const llama_token_data_array * cur_p) {
+    for (size_t i = 0; i < cur_p->size; i++) {
+        printf("%d: %f (%f)\n", cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
     }
 }

-#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
+#define DUMP(__cur_p) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__cur_p)); printf("-\n"); } while(0)

+#define TEST(__cnstr, __cur_p) do { \
+    auto * cnstr = (__cnstr); \
+    llama_constraint_apply(cnstr, (__cur_p)); \
+    llama_constraint_free(cnstr); \
+} while(0)
+
 static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
     const size_t n_vocab = probs.size();

-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }

-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    llama_constraint_softmax_impl(&candidates_p);
-    DUMP(&candidates_p);
-    llama_constraint_top_k_impl(&candidates_p, k, 1);
-    DUMP(&candidates_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+    TEST(llama_constraint_init_softmax(), &cur_p);
+    DUMP(&cur_p);
+    TEST(llama_constraint_init_top_k(k, 1), &cur_p);
+    DUMP(&cur_p);

-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5);
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5);
     }
 }

 static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
     const size_t n_vocab = probs.size();

-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }

-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    llama_constraint_softmax_impl(&candidates_p);
-    DUMP(&candidates_p);
-    llama_constraint_top_p_impl(&candidates_p, p, 1);
-    DUMP(&candidates_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+    TEST(llama_constraint_init_softmax(), &cur_p);
+    DUMP(&cur_p);
+    TEST(llama_constraint_init_top_p(p, 1), &cur_p);
+    DUMP(&cur_p);

-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }

 static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
     const size_t n_vocab = probs.size();

-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }

-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    DUMP(&candidates_p);
-    llama_constraint_tail_free_impl(&candidates_p, z, 1);
-    DUMP(&candidates_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+    DUMP(&cur_p);
+    TEST(llama_constraint_init_tail_free(z, 1), &cur_p);
+    DUMP(&cur_p);

-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }

 static void test_min_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
     const size_t n_vocab = probs.size();

-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }

-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    DUMP(&candidates_p);
-    llama_constraint_min_p_impl(&candidates_p, p, 1);
-    DUMP(&candidates_p);
-    llama_constraint_softmax_impl(&candidates_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+    DUMP(&cur_p);
+    TEST(llama_constraint_init_min_p(p, 1), &cur_p);
+    DUMP(&cur_p);
+    TEST(llama_constraint_init_softmax(), &cur_p);

-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }

 static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
     const size_t n_vocab = probs.size();

-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }

-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    DUMP(&candidates_p);
-    llama_constraint_typical_impl(&candidates_p, p, 1);
-    DUMP(&candidates_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+    DUMP(&cur_p);
+    TEST(llama_constraint_init_typical(p, 1), &cur_p);
+    DUMP(&cur_p);

-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }

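The TEST macro introduced at the top of this file is what lets the tests go through public constraint objects instead of the removed *_impl helpers: it builds the constraint, applies it once, and frees it. A hypothetical extra test in the same style (the name and the chained constraints are illustrative only):

    // Hypothetical example following the pattern of the tests above.
    static void test_top_k_then_top_p(const std::vector<float> & probs, int k, float p) {
        std::vector<llama_token_data> cur;
        cur.reserve(probs.size());
        for (llama_token token_id = 0; token_id < (llama_token) probs.size(); token_id++) {
            cur.emplace_back(llama_token_data{token_id, logf(probs[token_id]), 0.0f});
        }

        llama_token_data_array cur_p = { cur.data(), cur.size(), false };

        TEST(llama_constraint_init_softmax(),   &cur_p);
        TEST(llama_constraint_init_top_k(k, 1), &cur_p);
        TEST(llama_constraint_init_top_p(p, 1), &cur_p);
        DUMP(&cur_p);
    }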
@@ -135,11 +141,11 @@ static void test_penalties(

     const size_t n_vocab = probs.size();

-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }

     llama_token_cnt token_count;
@@ -147,55 +153,55 @@ static void test_penalties(
         token_count[last_tokens[i]]++;
     }

-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    llama_constraint_softmax_impl(&candidates_p);
-    DUMP(&candidates_p);
-    llama_constraint_penalties_impl(&candidates_p, token_count, repeat_penalty, alpha_frequency, alpha_presence);
-    llama_constraint_softmax_impl(&candidates_p);
-    DUMP(&candidates_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+    TEST(llama_constraint_init_softmax(), &cur_p);
+    DUMP(&cur_p);
+    llama_constraint_penalties_impl(&cur_p, token_count, repeat_penalty, alpha_frequency, alpha_presence); // TODO: avoid
+    TEST(llama_constraint_init_softmax(), &cur_p);
+    DUMP(&cur_p);

-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }

 static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p
 ) {
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(token_id);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }

-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false };

     llama_token min_token_id = 0;
     const llama_token max_token_id = n_vocab-1;

     for (auto s : samplers_sequence) {
         switch (s){
-            case 'k': llama_constraint_top_k_impl(&candidates_p, top_k, 1); break;
+            case 'k': TEST(llama_constraint_init_top_k(top_k, 1), &cur_p); break;
             case 'f': GGML_ABORT("tail_free test not implemented");
             case 'y': GGML_ABORT("typical test not implemented");
-            case 'p': llama_constraint_top_p_impl(&candidates_p, top_p, 1); break;
-            case 'm': llama_constraint_min_p_impl(&candidates_p, min_p, 1); break;
+            case 'p': TEST(llama_constraint_init_top_p(top_p, 1), &cur_p); break;
+            case 'm': TEST(llama_constraint_init_min_p(min_p, 1), &cur_p); break;
             case 't': GGML_ABORT("temperature test not implemented");
             default : GGML_ABORT("Unknown sampler");
         }

-        llama_constraint_softmax_impl(&candidates_p); // make sure tokens are sorted for tests
+        TEST(llama_constraint_init_softmax(), &cur_p); // make sure tokens are sorted for tests

-        const int size = candidates_p.size;
+        const int size = cur_p.size;

         if (s == 'k') {
             const int expected_size = std::min(size, top_k);
             min_token_id = std::max(min_token_id, (llama_token)(n_vocab - top_k));

             GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(candidates_p.data[0].id == max_token_id);
-            GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
+            GGML_ASSERT(cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
         } else if (s == 'p') {
             const int softmax_divisor = n_vocab * (n_vocab-1) / 2 - min_token_id * (min_token_id-1) / 2;
             const int softmax_numerator_target = ceilf(top_p * softmax_divisor);
@@ -217,8 +223,8 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
         }

             GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(candidates_p.data[0].id == max_token_id);
-            GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
+            GGML_ASSERT(cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
         } else if (s == 'm') {
             int expected_size = ceilf((1.0f-min_p) * n_vocab);
             expected_size = std::max(expected_size, 1);
@@ -230,8 +236,8 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
             min_token_id = std::min(min_token_id, (llama_token)(n_vocab - 1));

             GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(candidates_p.data[0].id == max_token_id);
-            GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
+            GGML_ASSERT(cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
         } else {
             GGML_ABORT("fatal error");
         }
