From c024fe45b0728a31d2245ac6cf365fe4b0a67293 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 4 Sep 2024 15:01:31 +0300
Subject: [PATCH] constraint : clean-up and simplify

---
 common/sampling.cpp     |  14 +-
 common/sampling.h       |   6 +-
 include/llama.h         |  21 +-
 src/llama-grammar.cpp   |  16 +-
 src/llama-grammar.h     |   2 +-
 src/llama-sampling.cpp  | 412 ++++++++++++++++++++--------------------
 src/llama-sampling.h    |  27 +--
 src/llama.cpp           |  42 ++--
 tests/test-sampling.cpp | 174 +++++++++--------
 9 files changed, 357 insertions(+), 357 deletions(-)

diff --git a/common/sampling.cpp b/common/sampling.cpp
index 123c6b2a7..34371bc24 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -232,18 +232,18 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
     return gpt_sampler_sample(smpl, cur_p, params.temp, params.mirostat, params.n_probs);
 }
 
-void gpt_sampler_apply_grammar(struct gpt_sampler * gsmpl, llama_token_data_array * candidates) {
-    GGML_ASSERT(candidates != nullptr);
+void gpt_sampler_apply_grammar(struct gpt_sampler * gsmpl, llama_token_data_array * cur_p) {
+    GGML_ASSERT(cur_p != nullptr);
 
-    llama_constraint_apply(gsmpl->grmr, candidates);
+    llama_constraint_apply(gsmpl->grmr, cur_p);
 }
 
-llama_token gpt_sampler_sample_dist(struct gpt_sampler * gsmpl, llama_token_data_array * candidates) {
-    return llama_sampler_sample_dist(gsmpl->smpl, candidates);
+llama_token gpt_sampler_sample_dist(struct gpt_sampler * gsmpl, llama_token_data_array * cur_p) {
+    return llama_sampler_sample_dist(gsmpl->smpl, cur_p);
 }
 
-llama_token gpt_sampler_sample_greedy(struct gpt_sampler * gsmpl, llama_token_data_array * candidates, bool probs) {
-    return llama_sampler_sample_greedy(gsmpl->smpl, candidates, probs);
+llama_token gpt_sampler_sample_greedy(struct gpt_sampler * gsmpl, llama_token_data_array * cur_p, bool probs) {
+    return llama_sampler_sample_greedy(gsmpl->smpl, cur_p, probs);
 }
 
 std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
diff --git a/common/sampling.h b/common/sampling.h
index 8cb3da762..a04645a67 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -93,10 +93,10 @@ void gpt_print_timings(struct llama_context * ctx, struct gpt_sampler * gsmpl);
 //
 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx);
 
-void gpt_sampler_apply_grammar(struct gpt_sampler * gsmpl, llama_token_data_array * candidates);
+void gpt_sampler_apply_grammar(struct gpt_sampler * gsmpl, llama_token_data_array * cur_p);
 
-llama_token gpt_sampler_sample_dist  (struct gpt_sampler * gsmpl, llama_token_data_array * candidates);
-llama_token gpt_sampler_sample_greedy(struct gpt_sampler * gsmpl, llama_token_data_array * candidates, bool probs);
+llama_token gpt_sampler_sample_dist  (struct gpt_sampler * gsmpl, llama_token_data_array * cur_p);
+llama_token gpt_sampler_sample_greedy(struct gpt_sampler * gsmpl, llama_token_data_array * cur_p, bool probs);
 
 // helpers
 
diff --git a/include/llama.h b/include/llama.h
index 8a02800ce..0f08c44c0 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1027,11 +1027,11 @@ extern "C" {
 
     struct llama_constraint_i {
         // TODO: add name API
-        void                      (*accept)( struct llama_constraint * cnstr, llama_token token); // can be NULL
-        void                      (*apply) ( struct llama_constraint * cnstr, llama_token_data_array * candidates); // required
-        void                      (*reset) ( struct llama_constraint * cnstr); // can be NULL
-        struct llama_constraint * (*copy)  (const struct llama_constraint * cnstr); // can be NULL if ctx is NULL
-        void                      (*free)  ( struct llama_constraint * cnstr); // can be NULL
+        void                      (*accept)( struct llama_constraint * cnstr, llama_token token); // can be NULL
+        void                      (*apply) ( struct llama_constraint * cnstr, llama_token_data_array * cur_p); // required
+        void                      (*reset) ( struct llama_constraint * cnstr); // can be NULL
+        struct llama_constraint * (*copy)  (const struct llama_constraint * cnstr); // can be NULL if ctx is NULL
+        void                      (*free)  ( struct llama_constraint * cnstr); // can be NULL if ctx is NULL
 
         // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
         //void (*apply_ggml) (struct llama_constraint * cnstr, ...);
@@ -1044,6 +1044,7 @@ extern "C" {
         llama_constraint_context_t ctx;
     };
 
+    LLAMA_API struct llama_constraint * llama_constraint_init_softmax   ();
     LLAMA_API struct llama_constraint * llama_constraint_init_top_k     (int32_t k, int32_t min_keep);
     LLAMA_API struct llama_constraint * llama_constraint_init_top_p     (float   p, int32_t min_keep);
     LLAMA_API struct llama_constraint * llama_constraint_init_min_p     (float   p, int32_t min_keep);
@@ -1077,7 +1078,7 @@ extern "C" {
     LLAMA_API void llama_constraint_free(struct llama_constraint * cnstr);
 
     LLAMA_API void llama_constraint_accept(struct llama_constraint * cnstr, llama_token token);
-    LLAMA_API void llama_constraint_apply (struct llama_constraint * cnstr, llama_token_data_array * candidates);
+    LLAMA_API void llama_constraint_apply (struct llama_constraint * cnstr, llama_token_data_array * cur_p);
     LLAMA_API void llama_constraint_reset (struct llama_constraint * cnstr);
 
     // samplers
@@ -1095,11 +1096,11 @@ extern "C" {
     LLAMA_API void llama_sampler_add_constraint(struct llama_sampler * smpl, struct llama_constraint * cnstr);
 
     LLAMA_API void llama_sampler_accept(struct llama_sampler * smpl, llama_token token);
-    LLAMA_API void llama_sampler_apply (struct llama_sampler * smpl, llama_token_data_array * candidates);
+    LLAMA_API void llama_sampler_apply (struct llama_sampler * smpl, llama_token_data_array * cur_p);
 
-    LLAMA_API llama_token llama_sampler_sample_dist    (struct llama_sampler * smpl, llama_token_data_array * candidates);
-    LLAMA_API llama_token llama_sampler_sample_greedy  (struct llama_sampler * smpl, llama_token_data_array * candidates, bool probs);
-    LLAMA_API llama_token llama_sampler_sample_mirostat(struct llama_sampler * smpl, llama_token_data_array * candidates);
+    LLAMA_API llama_token llama_sampler_sample_dist    (struct llama_sampler * smpl, llama_token_data_array * cur_p);
+    LLAMA_API llama_token llama_sampler_sample_greedy  (struct llama_sampler * smpl, llama_token_data_array * cur_p, bool probs);
+    LLAMA_API llama_token llama_sampler_sample_mirostat(struct llama_sampler * smpl, llama_token_data_array * cur_p);
 
     /// @details Get the number of accepted tokens so far (max of n_prev)
     LLAMA_API int llama_sampler_n_prev(const struct llama_sampler * smpl);
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index 092a738aa..a9813ebbf 100644
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -1069,7 +1069,7 @@ struct llama_grammar * llama_grammar_cp_impl(const struct llama_grammar & gramma
     return result;
 }
 
-void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * candidates) {
+void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
     GGML_ASSERT(grammar.vocab != nullptr);
 
     bool allow_eog = false;
@@ -1081,21 +1081,21 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
     }
 
     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
-    candidates_decoded.reserve(candidates->size);
+    candidates_decoded.reserve(cur_p->size);
 
     llama_grammar_candidates candidates_grammar;
-    candidates_grammar.reserve(candidates->size);
+    candidates_grammar.reserve(cur_p->size);
 
-    for (size_t i = 0; i < candidates->size; ++i) {
-        const llama_token id = candidates->data[i].id;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        const llama_token id = cur_p->data[i].id;
         const std::string & piece = grammar.vocab->cache_token_to_piece.at(id);
 
         if (llama_token_is_eog_impl(*grammar.vocab, id)) {
             if (!allow_eog) {
-                candidates->data[i].logit = -INFINITY;
+                cur_p->data[i].logit = -INFINITY;
             }
         } else if (piece.empty() || piece[0] == 0) {
-            candidates->data[i].logit = -INFINITY;
+            cur_p->data[i].logit = -INFINITY;
         } else {
             candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
@@ -1104,7 +1104,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
     const auto rejects = llama_grammar_reject_candidates(grammar.rules, grammar.stacks, candidates_grammar);
 
     for (const auto & reject : rejects) {
-        candidates->data[reject.index].logit = -INFINITY;
+        cur_p->data[reject.index].logit = -INFINITY;
     }
 }
 
diff --git a/src/llama-grammar.h b/src/llama-grammar.h
index 9b13354f6..6b9a2af8d 100644
--- a/src/llama-grammar.h
+++ b/src/llama-grammar.h
@@ -136,7 +136,7 @@ struct llama_grammar * llama_grammar_cp_impl(const struct llama_grammar & gramma
 // TODO: move the API below as member functions of llama_grammar
 void llama_grammar_apply_impl(
         const struct llama_grammar & grammar,
-            llama_token_data_array * candidates);
+            llama_token_data_array * cur_p);
 
 void llama_grammar_accept_impl(
               struct llama_grammar & grammar,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index a134fda95..99e0edfd9 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -24,51 +24,51 @@ static void llama_log_softmax(float * array, size_t size) {
     }
 }
 
-void llama_constraint_softmax_impl(llama_token_data_array * candidates) {
-    GGML_ASSERT(candidates->size > 0);
+static void llama_constraint_softmax_impl(llama_token_data_array * cur_p) {
+    GGML_ASSERT(cur_p->size > 0);
 
     // Sort the logits in descending order
-    if (!candidates->sorted) {
-        std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+    if (!cur_p->sorted) {
+        std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
             return a.logit > b.logit;
         });
-        candidates->sorted = true;
+        cur_p->sorted = true;
     }
 
-    float max_l = candidates->data[0].logit;
+    float max_l = cur_p->data[0].logit;
     float cum_sum = 0.0f;
 
-    for (size_t i = 0; i < candidates->size; ++i) {
-        float p = expf(candidates->data[i].logit - max_l);
-        candidates->data[i].p = p;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        float p = expf(cur_p->data[i].logit - max_l);
+        cur_p->data[i].p = p;
         cum_sum += p;
     }
 
-    for (size_t i = 0; i < candidates->size; ++i) {
-        candidates->data[i].p /= cum_sum;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].p /= cum_sum;
     }
 }
 
-void llama_constraint_top_k_impl(llama_token_data_array * candidates, int32_t k, size_t min_keep) {
+static void llama_constraint_top_k_impl(llama_token_data_array * cur_p, int32_t k, size_t min_keep) {
     // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
-    // if (k >= (int32_t)candidates->size) {
+    // if (k >= (int32_t)cur_p->size) {
     //     return;
     // }
 
     if (k <= 0) {
-        k = candidates->size;
+        k = cur_p->size;
     }
 
     k = std::max(k, (int) min_keep);
-    k = std::min(k, (int) candidates->size);
+    k = std::min(k, (int) cur_p->size);
 
     // Sort scores in descending order
-    if (!candidates->sorted) {
+    if (!cur_p->sorted) {
         auto comp = [](const llama_token_data & a, const llama_token_data & b) {
             return a.logit > b.logit;
         };
         if (k <= 128) {
-            std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
+            std::partial_sort(cur_p->data, cur_p->data + k, cur_p->data + cur_p->size, comp);
         } else {
             constexpr int   nbuckets     = 128;
             constexpr float bucket_low   = -10.0f;
@@ -76,11 +76,11 @@ void llama_constraint_top_k_impl(llama_token_data_array * candidates, int32_t k,
             constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
             constexpr float bucket_inter = -bucket_low * bucket_scale;
 
-            std::vector<int> bucket_idx(candidates->size);
+            std::vector<int> bucket_idx(cur_p->size);
             std::vector<int> histo(nbuckets, 0);
 
-            for (int i = 0; i < (int)candidates->size; ++i) {
-                const float val = candidates->data[i].logit;
+            for (int i = 0; i < (int)cur_p->size; ++i) {
+                const float val = cur_p->data[i].logit;
                 int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
                 ib = std::max(0, std::min(nbuckets-1, ib));
                 bucket_idx[i] = ib;
@@ -102,10 +102,10 @@ void llama_constraint_top_k_impl(llama_token_data_array * candidates, int32_t k,
                 bucket_ptrs.push_back(ptr);
                 ptr += histo[j];
             }
-            for (int i = 0; i < (int)candidates->size; ++i) {
+            for (int i = 0; i < (int)cur_p->size; ++i) {
                 int j = bucket_idx[i];
                 if (j >= ib) {
-                    *bucket_ptrs[nbuckets-1-j]++ = candidates->data[i];
+                    *bucket_ptrs[nbuckets-1-j]++ = cur_p->data[i];
                 }
             }
 
@@ -118,27 +118,27 @@ void llama_constraint_top_k_impl(llama_token_data_array * candidates, int32_t k,
             }
             std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
 
-            std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data));
+            std::memcpy(cur_p->data, tmp_tokens.data(), k*sizeof(llama_token_data));
         }
-        candidates->sorted = true;
+        cur_p->sorted = true;
     }
-    candidates->size = k;
+    cur_p->size = k;
 }
 
-void llama_constraint_top_p_impl(llama_token_data_array * candidates, float p, size_t min_keep) {
+static void llama_constraint_top_p_impl(llama_token_data_array * cur_p, float p, size_t min_keep) {
     if (p >= 1.0f) {
         return;
     }
 
-    llama_constraint_softmax_impl(candidates);
+    llama_constraint_softmax_impl(cur_p);
 
     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
-    size_t last_idx = candidates->size;
+    size_t last_idx = cur_p->size;
 
-    for (size_t i = 0; i < candidates->size; ++i) {
-        cum_sum += candidates->data[i].p;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cum_sum += cur_p->data[i].p;
 
         // Check if the running sum is at least p or if we have kept at least min_keep tokens
         // we set the last index to i+1 to indicate that the current iterate should be included in the set
@@ -149,77 +149,77 @@ void llama_constraint_top_p_impl(llama_token_data_array * candidates, float p, s
     }
 
     // Resize the output vector to keep only the top-p tokens
-    candidates->size = last_idx;
+    cur_p->size = last_idx;
 }
 
-void llama_constraint_min_p_impl(llama_token_data_array * candidates, float p, size_t min_keep) {
-    if (p <= 0.0f || !candidates->size) {
+static void llama_constraint_min_p_impl(llama_token_data_array * cur_p, float p, size_t min_keep) {
+    if (p <= 0.0f || !cur_p->size) {
         return;
     }
 
     bool min_p_applied = false;
 
-    // if the candidates aren't sorted, try the unsorted implementation first
-    if (!candidates->sorted) {
+    // if the cur_p aren't sorted, try the unsorted implementation first
+    if (!cur_p->sorted) {
         std::vector<llama_token_data> filtered_tokens;
 
         float max_logit = -FLT_MAX;
-        for (size_t i = 0; i < candidates->size; ++i) {
-            max_logit = std::max(max_logit, candidates->data[i].logit);
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            max_logit = std::max(max_logit, cur_p->data[i].logit);
         }
         const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
 
-        for (size_t i = 0; i < candidates->size; ++i) {
-            if (candidates->data[i].logit >= min_logit) {
-                filtered_tokens.push_back(candidates->data[i]);
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            if (cur_p->data[i].logit >= min_logit) {
+                filtered_tokens.push_back(cur_p->data[i]);
             }
         }
 
         // if we have enough values the operation was a success
         if (filtered_tokens.size() >= min_keep) {
-            memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
-            candidates->size = filtered_tokens.size();
+            memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
+            cur_p->size = filtered_tokens.size();
             min_p_applied = true;
         }
     }
 
-    // if the candidates are sorted or the unsorted implementation failed, use this implementation
+    // if the cur_p are sorted or the unsorted implementation failed, use this implementation
     if (!min_p_applied) {
         // Sort the logits in descending order
-        if (!candidates->sorted) {
-            std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+        if (!cur_p->sorted) {
+            std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
                 return a.logit > b.logit;
             });
-            candidates->sorted = true;
+            cur_p->sorted = true;
         }
 
-        const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
+        const float min_logit = cur_p->data[0].logit + logf(p); // min logit for p_i >= p * p_max
         size_t i = 1; // first token always matches
 
-        for (; i < candidates->size; ++i) {
-            if (candidates->data[i].logit < min_logit && i >= min_keep) {
+        for (; i < cur_p->size; ++i) {
+            if (cur_p->data[i].logit < min_logit && i >= min_keep) {
                 break; // prob too small
             }
         }
 
         // Resize the output vector to keep only the matching tokens
-        candidates->size = i;
+        cur_p->size = i;
     }
 }
 
-void llama_constraint_tail_free_impl(llama_token_data_array * candidates, float z, size_t min_keep) {
-    if (z >= 1.0f || candidates->size <= 2) {
+static void llama_constraint_tail_free_impl(llama_token_data_array * cur_p, float z, size_t min_keep) {
+    if (z >= 1.0f || cur_p->size <= 2) {
        return;
    }
 
-    llama_constraint_softmax_impl(candidates);
+    llama_constraint_softmax_impl(cur_p);
 
     // Compute the first and second derivatives
-    std::vector<float> first_derivatives(candidates->size - 1);
-    std::vector<float> second_derivatives(candidates->size - 2);
+    std::vector<float> first_derivatives(cur_p->size - 1);
+    std::vector<float> second_derivatives(cur_p->size - 2);
 
     for (size_t i = 0; i < first_derivatives.size(); ++i) {
-        first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
+        first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p;
     }
     for (size_t i = 0; i < second_derivatives.size(); ++i) {
         second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
@@ -246,7 +246,7 @@ void llama_constraint_tail_free_impl(llama_token_data_array * candidates, float
     }
 
     float cum_sum = 0.0f;
-    size_t last_idx = candidates->size;
+    size_t last_idx = cur_p->size;
     for (size_t i = 0; i < second_derivatives.size(); ++i) {
         cum_sum += second_derivatives[i];
@@ -258,10 +258,10 @@ void llama_constraint_tail_free_impl(llama_token_data_array * candidates, float
     }
 
     // Resize the output vector to keep only the tokens above the tail location
-    candidates->size = last_idx;
+    cur_p->size = last_idx;
 }
 
-void llama_constraint_typical_impl(llama_token_data_array * candidates, float p, size_t min_keep) {
+static void llama_constraint_typical_impl(llama_token_data_array * cur_p, float p, size_t min_keep) {
     // Reference implementation:
     // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
     if (p >= 1.0f) {
@@ -269,22 +269,22 @@ void llama_constraint_typical_impl(llama_token_data_array * candidates, float p,
     }
 
     // Compute the softmax of logits and calculate entropy
-    llama_constraint_softmax_impl(candidates);
+    llama_constraint_softmax_impl(cur_p);
 
     float entropy = 0.0f;
-    for (size_t i = 0; i < candidates->size; ++i) {
-        entropy += -candidates->data[i].p * logf(candidates->data[i].p);
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        entropy += -cur_p->data[i].p * logf(cur_p->data[i].p);
     }
 
     // Compute the absolute difference between negative log probability and entropy for each candidate
     std::vector<float> shifted_scores;
-    for (size_t i = 0; i < candidates->size; ++i) {
-        float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        float shifted_score = fabsf(-logf(cur_p->data[i].p) - entropy);
         shifted_scores.push_back(shifted_score);
     }
 
     // Sort tokens based on the shifted_scores and their corresponding indices
-    std::vector<size_t> indices(candidates->size);
+    std::vector<size_t> indices(cur_p->size);
     std::iota(indices.begin(), indices.end(), 0);
 
     std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
@@ -297,7 +297,7 @@ void llama_constraint_typical_impl(llama_token_data_array * candidates, float p,
 
     for (size_t i = 0; i < indices.size(); ++i) {
         size_t idx = indices[i];
-        cum_sum += candidates->data[idx].p;
+        cum_sum += cur_p->data[idx].p;
 
         // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
         if (cum_sum > p && i >= min_keep - 1) {
@@ -307,39 +307,39 @@ void llama_constraint_typical_impl(llama_token_data_array * candidates, float p,
     }
 
     // Resize the output vector to keep only the locally typical tokens
-    std::vector<llama_token_data> new_candidates;
+    std::vector<llama_token_data> cur_p_new;
     for (size_t i = 0; i < last_idx; ++i) {
         size_t idx = indices[i];
-        new_candidates.push_back(candidates->data[idx]);
+        cur_p_new.push_back(cur_p->data[idx]);
     }
 
-    // Replace the data in candidates with the new_candidates data
-    std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
-    candidates->size = new_candidates.size();
-    candidates->sorted = false;
+    // Replace the data in cur_p with the cur_p_new data
+    std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data);
+    cur_p->size = cur_p_new.size();
+    cur_p->sorted = false;
 }
 
-void llama_constraint_entropy_impl(llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val) {
+static void llama_constraint_entropy_impl(llama_token_data_array * cur_p, float min_temp, float max_temp, float exponent_val) {
     // no need to do anything if there is only one (or zero) candidates
-    if(candidates->size <= 1) {
+    if (cur_p->size <= 1) {
         return;
     }
 
     // Calculate maximum possible entropy
-    float max_entropy = -logf(1.0f / candidates->size);
+    float max_entropy = -logf(1.0f / cur_p->size);
 
-    llama_constraint_softmax_impl(candidates);
+    llama_constraint_softmax_impl(cur_p);
 
     // Calculate entropy of the softmax probabilities
     float entropy = 0.0f;
-    for (size_t i = 0; i < candidates->size; ++i) {
-        float prob = candidates->data[i].p;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        float prob = cur_p->data[i].p;
         if (prob > 0.0f) { // Ensure no log(0)
             entropy -= prob * logf(prob);
         }
     }
 
-    // Normalize the entropy (max_entropy cannot be 0 here because we checked candidates->size != 1 above)
+    // Normalize the entropy (max_entropy cannot be 0 here because we checked cur_p->size != 1 above)
     float normalized_entropy = entropy / max_entropy;
 
     // Map the normalized entropy to the desired temperature range using the power function
@@ -355,52 +355,52 @@ void llama_constraint_entropy_impl(llama_token_data_array * candidates, float mi
 #endif
 
     // Apply the dynamically calculated temperature scaling
-    for (size_t i = 0; i < candidates->size; ++i) {
-        candidates->data[i].logit /= dyn_temp;
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].logit /= dyn_temp;
     }
 
     // Re-compute softmax probabilities after scaling logits with dynamic temperature
-    const double max_l_double = candidates->data[0].logit;
+    const double max_l_double = cur_p->data[0].logit;
 
     double cum_sum_double = 0.0;
-    for (size_t i = 0; i < candidates->size; ++i) {
-        double p = exp(candidates->data[i].logit - max_l_double);
-        candidates->data[i].p = p; // Store the scaled probability
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        double p = exp(cur_p->data[i].logit - max_l_double);
+        cur_p->data[i].p = p; // Store the scaled probability
         cum_sum_double += p;
     }
 
-    for (size_t i = 0; i < candidates->size; ++i) {
-        candidates->data[i].p /= cum_sum_double; // Re-normalize the probabilities
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
     }
 
 #ifdef DEBUG
     // Print the updated top 25 probabilities after temperature scaling
     LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
-    for (size_t i = 0; i < 25 && i < candidates->size; ++i) {
-        LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates->data[i].p * 100.0f);
+    for (size_t i = 0; i < 25 && i < cur_p->size; ++i) {
+        LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, cur_p->data[i].p * 100.0f);
     }
 #endif
 }
 
-void llama_constraint_temp_impl(llama_token_data_array * candidates, float temp) {
-    for (size_t i = 0; i < candidates->size; ++i) {
-        candidates->data[i].logit /= temp;
+static void llama_constraint_temp_impl(llama_token_data_array * cur_p, float temp) {
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        cur_p->data[i].logit /= temp;
     }
 }
 
-void llama_constraint_grammar_impl(llama_token_data_array * candidates, const struct llama_grammar & grammar) {
-    llama_grammar_apply_impl(grammar, candidates);
+static void llama_constraint_grammar_impl(llama_token_data_array * cur_p, const struct llama_grammar & grammar) {
+    llama_grammar_apply_impl(grammar, cur_p);
 }
 
 void llama_constraint_penalties_impl(
-        llama_token_data_array * candidates,
+        llama_token_data_array * cur_p,
         const llama_token_cnt & token_count,
         float penalty_repeat,
         float penalty_freq,
         float penalty_present) {
-    // Apply frequency and presence penalties to the candidates
-    for (size_t i = 0; i < candidates->size; ++i) {
-        const auto token_iter = token_count.find(candidates->data[i].id);
+    // Apply frequency and presence penalties to the cur_p
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        const auto token_iter = token_count.find(cur_p->data[i].id);
         if (token_iter == token_count.end()) {
             continue;
         }
@@ -409,23 +409,42 @@ void llama_constraint_penalties_impl(
 
         // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
         // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
-        if (candidates->data[i].logit <= 0) {
-            candidates->data[i].logit *= penalty_repeat;
+        if (cur_p->data[i].logit <= 0) {
+            cur_p->data[i].logit *= penalty_repeat;
         } else {
-            candidates->data[i].logit /= penalty_repeat;
+            cur_p->data[i].logit /= penalty_repeat;
         }
 
-        candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
+        cur_p->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
     }
 
-    candidates->sorted = false;
+    cur_p->sorted = false;
 }
 
 //
-// sampling
+// constraints
 //
 
-// constraints
+// softmax
+
+static struct llama_constraint_i llama_constraint_softmax_i = {
+    /* .accept = */ nullptr,
+    /* .apply  = */ [](struct llama_constraint * /*cnstr*/, llama_token_data_array * cur_p) {
+        llama_constraint_softmax_impl(cur_p);
+    },
+    /* .reset  = */ nullptr,
+    /* .copy   = */ nullptr,
+    /* .free   = */ nullptr,
+};
+
+struct llama_constraint * llama_constraint_init_softmax_impl() {
+    struct llama_constraint * result = new llama_constraint {
+        /* .iface = */ &llama_constraint_softmax_i,
+        /* .ctx   = */ nullptr,
+    };
+
+    return result;
+}
 
 // top-k
 
@@ -436,9 +455,9 @@ struct llama_constraint_context_top_k {
 
 static struct llama_constraint_i llama_constraint_top_k_i = {
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_top_k *) cnstr->ctx;
-        llama_constraint_top_k_impl(candidates, ctx->k, ctx->min_keep);
+        llama_constraint_top_k_impl(cur_p, ctx->k, ctx->min_keep);
     },
     /* .reset  = */ nullptr,
     /* .copy   = */ [](const struct llama_constraint * cnstr) {
@@ -446,10 +465,7 @@ static struct llama_constraint_i llama_constraint_top_k_i = {
         return llama_constraint_init_top_k_impl(ctx->k, ctx->min_keep);
     },
     /* .free   = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            delete (llama_constraint_context_top_k *) cnstr->ctx;
-        }
-        delete cnstr;
+        delete (llama_constraint_context_top_k *) cnstr->ctx;
     }
 };
 
@@ -474,9 +490,9 @@ struct llama_constraint_context_top_p {
 
 static struct llama_constraint_i llama_constraint_top_p_i = {
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_top_p *) cnstr->ctx;
-        llama_constraint_top_p_impl(candidates, ctx->p, ctx->min_keep);
+        llama_constraint_top_p_impl(cur_p, ctx->p, ctx->min_keep);
     },
     /* .reset  = */ nullptr,
     /* .copy   = */ [](const struct llama_constraint * cnstr) {
@@ -484,10 +500,7 @@ static struct llama_constraint_i llama_constraint_top_p_i = {
         return llama_constraint_init_top_p_impl(ctx->p, ctx->min_keep);
     },
     /* .free   = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            delete (llama_constraint_context_top_p *) cnstr->ctx;
-        }
-        delete cnstr;
+        delete (llama_constraint_context_top_p *) cnstr->ctx;
     }
 };
 
@@ -512,9 +525,9 @@ struct llama_constraint_context_min_p {
 
 static struct llama_constraint_i llama_constraint_min_p_i = {
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
        auto * ctx = (llama_constraint_context_min_p *) cnstr->ctx;
-        llama_constraint_min_p_impl(candidates, ctx->p, ctx->min_keep);
+        llama_constraint_min_p_impl(cur_p, ctx->p, ctx->min_keep);
     },
     /* .reset  = */ nullptr,
     /* .copy   = */ [](const struct llama_constraint * cnstr) {
@@ -522,10 +535,7 @@ static struct llama_constraint_i llama_constraint_min_p_i = {
         return llama_constraint_init_min_p_impl(ctx->p, ctx->min_keep);
     },
     /* .free   = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            delete (llama_constraint_context_min_p *) cnstr->ctx;
-        }
-        delete cnstr;
+        delete (llama_constraint_context_min_p *) cnstr->ctx;
     }
 };
 
@@ -550,9 +560,9 @@ struct llama_constraint_context_tail_free {
 
 static struct llama_constraint_i llama_constraint_tail_free_i = {
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_tail_free *) cnstr->ctx;
-        llama_constraint_tail_free_impl(candidates, ctx->z, ctx->min_keep);
+        llama_constraint_tail_free_impl(cur_p, ctx->z, ctx->min_keep);
     },
     /* .reset  = */ nullptr,
     /* .copy   = */ [](const struct llama_constraint * cnstr) {
@@ -560,10 +570,7 @@ static struct llama_constraint_i llama_constraint_tail_free_i = {
         return llama_constraint_init_tail_free_impl(ctx->z, ctx->min_keep);
     },
     /* .free   = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            delete (llama_constraint_context_tail_free *) cnstr->ctx;
-        }
-        delete cnstr;
+        delete (llama_constraint_context_tail_free *) cnstr->ctx;
     }
 };
 
@@ -588,9 +595,9 @@ struct llama_constraint_context_typical {
 
 static struct llama_constraint_i llama_constraint_typical_i = {
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_typical *) cnstr->ctx;
-        llama_constraint_typical_impl(candidates, ctx->p, ctx->min_keep);
+        llama_constraint_typical_impl(cur_p, ctx->p, ctx->min_keep);
     },
     /* .reset  = */ nullptr,
     /* .copy   = */ [](const struct llama_constraint * cnstr) {
@@ -598,10 +605,7 @@ static struct llama_constraint_i llama_constraint_typical_i = {
         return llama_constraint_init_typical_impl(ctx->p, ctx->min_keep);
     },
     /* .free   = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            delete (llama_constraint_context_typical *) cnstr->ctx;
-        }
-        delete cnstr;
+        delete (llama_constraint_context_typical *) cnstr->ctx;
     }
 };
 
@@ -625,9 +629,9 @@ struct llama_constraint_context_temp {
 
 static struct llama_constraint_i llama_constraint_temp_i = {
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_temp *) cnstr->ctx;
-        llama_constraint_temp_impl(candidates, ctx->temp);
+        llama_constraint_temp_impl(cur_p, ctx->temp);
     },
     /* .reset  = */ nullptr,
     /* .copy   = */ [](const struct llama_constraint * cnstr) {
@@ -635,10 +639,7 @@ static struct llama_constraint_i llama_constraint_temp_i = {
         return llama_constraint_init_temp_impl(ctx->temp);
     },
     /* .free   = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            delete (llama_constraint_context_temp *) cnstr->ctx;
-        }
-        delete cnstr;
+        delete (llama_constraint_context_temp *) cnstr->ctx;
     }
 };
 
@@ -663,15 +664,15 @@ struct llama_constraint_context_temp_ext {
 
 static struct llama_constraint_i llama_constraint_temp_ext_i = {
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_temp_ext *) cnstr->ctx;
         if (ctx->delta > 0) {
             const float temp_min = std::max(0.0f, ctx->temp - ctx->delta);
             const float temp_max = ctx->temp + ctx->delta;
 
-            llama_constraint_entropy_impl(candidates, temp_min, temp_max, ctx->exponent);
+            llama_constraint_entropy_impl(cur_p, temp_min, temp_max, ctx->exponent);
         } else {
-            llama_constraint_temp_impl(candidates, ctx->temp);
+            llama_constraint_temp_impl(cur_p, ctx->temp);
         }
     },
     /* .reset  = */ nullptr,
@@ -680,10 +681,7 @@ static struct llama_constraint_i llama_constraint_temp_ext_i = {
         return llama_constraint_init_temp_ext_impl(ctx->temp, ctx->delta, ctx->exponent);
     },
     /* .free   = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            delete (llama_constraint_context_temp_ext *) cnstr->ctx;
-        }
-        delete cnstr;
+        delete (llama_constraint_context_temp_ext *) cnstr->ctx;
     }
 };
 
@@ -716,10 +714,10 @@ static struct llama_constraint_i llama_constraint_grammar_i = {
             llama_grammar_accept_impl(*ctx->grammar, token);
         }
     },
-    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_grammar *) cnstr->ctx;
         if (ctx->grammar) {
-            llama_constraint_grammar_impl(candidates, *ctx->grammar);
+            llama_constraint_grammar_impl(cur_p, *ctx->grammar);
         }
     },
     /* .reset  = */ [](struct llama_constraint * cnstr) {
@@ -749,15 +747,13 @@ static struct llama_constraint_i llama_constraint_grammar_i = {
         return result;
     },
     /* .free   = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            {
-                auto * ctx = (llama_constraint_context_grammar *) cnstr->ctx;
-                llama_grammar_free_impl(ctx->grammar);
-            }
+        auto * ctx = (llama_constraint_context_grammar *) cnstr->ctx;
 
-            delete (llama_constraint_context_grammar *) cnstr->ctx;
+        if (ctx->grammar) {
+            llama_grammar_free_impl(ctx->grammar);
         }
-        delete cnstr;
+
+        delete ctx;
     }
 };
 
@@ -807,13 +803,13 @@ static struct llama_constraint_i llama_constraint_penalties_i = {
         auto * ctx = (llama_constraint_context_penalties *) cnstr->ctx;
         ctx->prev.push_back(token);
     },
-    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_penalties *) cnstr->ctx;
 
-        GGML_ASSERT(candidates->size == ctx->vocab->n_vocab && candidates->sorted == false && "the 'penalties' constraint must be applied on the full vocabulary");
+        GGML_ASSERT(cur_p->size == ctx->vocab->n_vocab && cur_p->sorted == false && "the 'penalties' constraint must be applied on the full vocabulary");
 
         if (ctx->ignore_eos) {
-            candidates->data[ctx->vocab->special_eos_id].logit = -INFINITY;
+            cur_p->data[ctx->vocab->special_eos_id].logit = -INFINITY;
         }
 
         if ((ctx->penalty_last_n == 0) ||
@@ -821,7 +817,7 @@ static struct llama_constraint_i llama_constraint_penalties_i = {
             return;
         }
 
-        const float nl_logit = !ctx->penalize_nl ? candidates->data[ctx->vocab->linefeed_id].logit : -INFINITY;
+        const float nl_logit = !ctx->penalize_nl ? cur_p->data[ctx->vocab->linefeed_id].logit : -INFINITY;
 
         // Create a frequency map to count occurrences of each token in last_tokens
         // TODO: optimize this by maintaining the token count in the constraint context
@@ -830,11 +826,11 @@ static struct llama_constraint_i llama_constraint_penalties_i = {
             token_count[ctx->prev.rat(i)]++;
         }
 
-        llama_constraint_penalties_impl(candidates, token_count, ctx->penalty_repeat, ctx->penalty_freq, ctx->penalty_present);
+        llama_constraint_penalties_impl(cur_p, token_count, ctx->penalty_repeat, ctx->penalty_freq, ctx->penalty_present);
 
         if (!ctx->penalize_nl) {
             // restore the logit of the newline token if it was penalized
-            candidates->data[ctx->vocab->linefeed_id].logit = nl_logit;
+            cur_p->data[ctx->vocab->linefeed_id].logit = nl_logit;
         }
     },
     /* .reset  = */ [](struct llama_constraint * cnstr) {
@@ -858,10 +854,7 @@ static struct llama_constraint_i llama_constraint_penalties_i = {
         return result;
     },
     /* .free   = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            delete (llama_constraint_context_penalties *) cnstr->ctx;
-        }
-        delete cnstr;
+        delete (llama_constraint_context_penalties *) cnstr->ctx;
     }
 };
 
@@ -896,13 +889,13 @@ struct llama_constraint_context_logit_bias {
 
 static struct llama_constraint_i llama_constraint_logit_bias_i = {
     /* .accept = */ nullptr,
-    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * candidates) {
+    /* .apply  = */ [](struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
         auto * ctx = (llama_constraint_context_logit_bias *) cnstr->ctx;
 
-        GGML_ASSERT(candidates->size == ctx->vocab->n_vocab && candidates->sorted == false && "the 'logit_bias' constraint must be applied on the full vocabulary");
+        GGML_ASSERT(cur_p->size == ctx->vocab->n_vocab && cur_p->sorted == false && "the 'logit_bias' constraint must be applied on the full vocabulary");
 
         for (const auto & lb : ctx->logit_bias) {
-            candidates->data[lb.token].logit += lb.bias;
+            cur_p->data[lb.token].logit += lb.bias;
         }
     },
     /* .reset  = */ nullptr,
     /* .copy   = */ [](const struct llama_constraint * cnstr) {
@@ -911,10 +904,7 @@ static struct llama_constraint_i llama_constraint_logit_bias_i = {
         return llama_constraint_init_logit_bias_impl(*ctx_src->vocab, ctx_src->logit_bias.size(), ctx_src->logit_bias.data());
     },
     /* .free   = */ [](struct llama_constraint * cnstr) {
-        if (cnstr->ctx) {
-            delete (llama_constraint_context_logit_bias *) cnstr->ctx;
-        }
-        delete cnstr;
+        delete (llama_constraint_context_logit_bias *) cnstr->ctx;
     }
 };
 
@@ -940,9 +930,15 @@ struct llama_constraint * llama_constraint_cp_impl(const struct llama_constraint
 }
 
 void llama_constraint_free_impl(struct llama_constraint * cnstr) {
-    if (cnstr->iface->free && cnstr) {
+    if (cnstr == nullptr) {
+        return;
+    }
+
+    if (cnstr->iface->free) {
         cnstr->iface->free(cnstr);
     }
+
+    delete cnstr;
 }
 
 void llama_constraint_accept_impl(struct llama_constraint & cnstr, llama_token token) {
@@ -951,9 +947,9 @@ void llama_constraint_accept_impl(struct llama_constraint & cnstr, llama_token t
     }
 }
 
-void llama_constraint_apply_impl(struct llama_constraint & cnstr, struct llama_token_data_array * candidates) {
+void llama_constraint_apply_impl(struct llama_constraint & cnstr, struct llama_token_data_array * cur_p) {
     GGML_ASSERT(cnstr.iface->apply);
-    cnstr.iface->apply(&cnstr, candidates);
+    cnstr.iface->apply(&cnstr, cur_p);
 }
 
 void llama_constraint_reset_impl(struct llama_constraint & cnstr) {
@@ -962,7 +958,9 @@ void llama_constraint_reset_impl(struct llama_constraint & cnstr) {
     }
 }
 
+//
 // samplers
+//
 
 struct llama_sampler * llama_sampler_init_impl(const struct llama_vocab & vocab, struct llama_sampler_params params) {
     auto * result = new llama_sampler {
@@ -1050,9 +1048,9 @@ void llama_sampler_accept_impl(struct llama_sampler & smpl, llama_token token) {
     }
 }
 
-void llama_sampler_apply_impl(struct llama_sampler & smpl, struct llama_token_data_array * candidates) {
+void llama_sampler_apply_impl(struct llama_sampler & smpl, struct llama_token_data_array * cur_p) {
     for (auto * cnstr : smpl.constraints) {
-        llama_constraint_apply_impl(*cnstr, candidates);
+        llama_constraint_apply_impl(*cnstr, cur_p);
     }
 }
 
@@ -1068,16 +1066,16 @@ int llama_sampler_n_prev_impl(const struct llama_sampler & smpl) {
     return smpl.prev.size();
 }
 
-llama_token llama_sampler_sample_mirostat_impl(struct llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, int32_t m, int32_t n_vocab, float & mu) {
-    llama_constraint_softmax_impl(candidates);
+llama_token llama_sampler_sample_mirostat_impl(struct llama_token_data_array * cur_p, std::mt19937 & rng, float tau, float eta, int32_t m, int32_t n_vocab, float & mu) {
+    llama_constraint_softmax_impl(cur_p);
 
     // Estimate s_hat using the most probable m tokens
     float s_hat = 0.0;
     float sum_ti_bi = 0.0;
     float sum_ti_sq = 0.0;
-    for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
+    for (size_t i = 0; i < size_t(m - 1) && i < cur_p->size - 1; ++i) {
         float t_i = logf(float(i + 2) / float(i + 1));
-        float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
+        float b_i = logf(cur_p->data[i].p / cur_p->data[i + 1].p);
         sum_ti_bi += t_i * b_i;
         sum_ti_sq += t_i * t_i;
     }
@@ -1088,14 +1086,14 @@ llama_token llama_sampler_sample_mirostat_impl(struct llama_token_data_array * c
     float k = powf((epsilon_hat * powf(2, mu)) / (1 - powf(n_vocab, -epsilon_hat)), 1 / s_hat);
 
     // Sample the next word X using top-k sampling
-    llama_constraint_top_k_impl(candidates, int(k), 1);
-    llama_token X = llama_sampler_sample_dist_impl(candidates, rng);
+    llama_constraint_top_k_impl(cur_p, int(k), 1);
+    llama_token X = llama_sampler_sample_dist_impl(cur_p, rng);
 
     // Compute error as the difference between observed surprise and target surprise value
-    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+    size_t X_idx = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
         return candidate.id == X;
     }));
-    float observed_surprise = -log2f(candidates->data[X_idx].p);
+    float observed_surprise = -log2f(cur_p->data[X_idx].p);
     float e = observed_surprise - tau;
 
     // Update mu using the learning rate and error
@@ -1104,30 +1102,30 @@ llama_token llama_sampler_sample_mirostat_impl(struct llama_token_data_array * c
     return X;
 }
 
-llama_token llama_sampler_sample_mirostat_v2_impl(struct llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, float & mu) {
-    llama_constraint_softmax_impl(candidates);
+llama_token llama_sampler_sample_mirostat_v2_impl(struct llama_token_data_array * cur_p, std::mt19937 & rng, float tau, float eta, float & mu) {
+    llama_constraint_softmax_impl(cur_p);
 
     // Truncate the words with surprise values greater than mu
-    candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+    cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
         return -log2f(candidate.p) > mu;
     }));
 
-    if (candidates->size == 0) {
-        candidates->size = 1;
+    if (cur_p->size == 0) {
+        cur_p->size = 1;
     }
 
     // Normalize the probabilities of the remaining words
-    llama_constraint_softmax_impl(candidates);
+    llama_constraint_softmax_impl(cur_p);
 
     // Sample the next word X from the remaining words
-    llama_token X = llama_sampler_sample_dist_impl(candidates, rng);
+    llama_token X = llama_sampler_sample_dist_impl(cur_p, rng);
 
     // Compute error as the difference between observed surprise and target surprise value
-    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+    size_t X_idx = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) {
         return candidate.id == X;
     }));
-    float observed_surprise = -log2f(candidates->data[X_idx].p);
+    float observed_surprise = -log2f(cur_p->data[X_idx].p);
     float e = observed_surprise - tau;
 
     // Update mu using the learning rate and error
@@ -1136,17 +1134,17 @@ llama_token llama_sampler_sample_mirostat_v2_impl(struct llama_token_data_array
     return X;
 }
 
-llama_token llama_sampler_sample_greedy_impl(llama_token_data_array * candidates, bool probs) {
+llama_token llama_sampler_sample_greedy_impl(llama_token_data_array * cur_p, bool probs) {
     if (probs) {
         // if probs are needed, we apply softmax to get the probabilities
-        llama_constraint_softmax_impl(candidates);
+        llama_constraint_softmax_impl(cur_p);
 
-        // the candidates are sorted, so we can just return the first one
-        return candidates->data[0].id;
+        // the cur_p are sorted, so we can just return the first one
+        return cur_p->data[0].id;
     }
 
     // return the token with the highest logit
-    auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+    auto * max_iter = std::max_element(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
         return a.logit < b.logit;
     });
 
@@ -1155,20 +1153,20 @@ llama_token llama_sampler_sample_greedy_impl(llama_token_data_array * candidates
     return result;
 }
 
-llama_token llama_sampler_sample_dist_impl(struct llama_token_data_array * candidates, std::mt19937 & rng) {
-    llama_constraint_softmax_impl(candidates);
+llama_token llama_sampler_sample_dist_impl(struct llama_token_data_array * cur_p, std::mt19937 & rng) {
+    llama_constraint_softmax_impl(cur_p);
 
     std::vector<float> probs;
-    probs.reserve(candidates->size);
+    probs.reserve(cur_p->size);
 
-    for (size_t i = 0; i < candidates->size; ++i) {
-        probs.push_back(candidates->data[i].p);
+    for (size_t i = 0; i < cur_p->size; ++i) {
+        probs.push_back(cur_p->data[i].p);
     }
 
     std::discrete_distribution<> dist(probs.begin(), probs.end());
 
     const int idx = dist(rng);
 
-    llama_token result = candidates->data[idx].id;
+    llama_token result = cur_p->data[idx].id;
 
     return result;
 }
diff --git a/src/llama-sampling.h b/src/llama-sampling.h
index f60d5b95f..e4f910886 100644
--- a/src/llama-sampling.h
+++ b/src/llama-sampling.h
@@ -10,19 +10,9 @@ struct llama_grammar;
 
 using llama_token_cnt = std::unordered_map<llama_token, int>;
 
-// TODO: tmp exposed, until tests start using llama_constraint
-void llama_constraint_softmax_impl  (struct llama_token_data_array * candidates);
-void llama_constraint_top_k_impl    (struct llama_token_data_array * candidates, int32_t k, size_t min_keep);
-void llama_constraint_top_p_impl    (struct llama_token_data_array * candidates, float p, size_t min_keep);
-void llama_constraint_min_p_impl    (struct llama_token_data_array * candidates, float p, size_t min_keep);
-void llama_constraint_tail_free_impl(struct llama_token_data_array * candidates, float z, size_t min_keep);
-void llama_constraint_typical_impl  (struct llama_token_data_array * candidates, float p, size_t min_keep);
-void llama_constraint_entropy_impl  (struct llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val);
-void llama_constraint_temp_impl     (struct llama_token_data_array * candidates, float temp);
-void llama_constraint_grammar_impl  (struct llama_token_data_array * candidates, const struct llama_grammar & grammar);
-
+// TODO: tmp exposed until test-sampling is fixed
 void llama_constraint_penalties_impl(
-       llama_token_data_array * candidates,
+       llama_token_data_array * cur_p,
         const llama_token_cnt & token_count,
         float penalty_repeat,
         float penalty_freq,
@@ -30,6 +20,7 @@ void llama_constraint_penalties_impl(
 
 // constraints
 
+struct llama_constraint * llama_constraint_init_softmax_impl  ();
 struct llama_constraint * llama_constraint_init_top_k_impl    (int32_t k, size_t min_keep);
 struct llama_constraint * llama_constraint_init_top_p_impl    (float p, size_t min_keep);
 struct llama_constraint * llama_constraint_init_min_p_impl    (float p, size_t min_keep);
@@ -62,7 +53,7 @@ struct llama_constraint * llama_constraint_cp_impl(const struct llama_constraint
 void llama_constraint_free_impl(struct llama_constraint * cnstr);
 
 void llama_constraint_accept_impl(struct llama_constraint & cnstr, llama_token token);
-void llama_constraint_apply_impl (struct llama_constraint & cnstr, struct llama_token_data_array * candidates);
+void llama_constraint_apply_impl (struct llama_constraint & cnstr, struct llama_token_data_array * cur_p);
 void llama_constraint_reset_impl (struct llama_constraint & cnstr);
 
 // samplers
@@ -101,7 +92,7 @@ void llama_sampler_reset_impl( struct llama_sampler & smp
 void llama_sampler_add_constraint_impl(struct llama_sampler & smpl, struct llama_constraint * cnstr);
 
 void llama_sampler_accept_impl(struct llama_sampler & smpl, llama_token token);
-void llama_sampler_apply_impl (struct llama_sampler & smpl, struct llama_token_data_array * candidates);
+void llama_sampler_apply_impl (struct llama_sampler & smpl, struct llama_token_data_array * cur_p);
 
 llama_token llama_sampler_prev_impl (const struct llama_sampler & smpl, int ith);
 int         llama_sampler_n_prev_impl(const struct llama_sampler & smpl);
@@ -112,14 +103,14 @@ int llama_sampler_n_prev_impl(const struct llama_sampler & smpl);
 /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
 /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
 /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-llama_token llama_sampler_sample_mirostat_impl   (struct llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, int32_t m, int32_t n_vocab, float & mu);
+llama_token llama_sampler_sample_mirostat_impl   (struct llama_token_data_array * cur_p, std::mt19937 & rng, float tau, float eta, int32_t m, int32_t n_vocab, float & mu);
 
 /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
 /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
 /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
 /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
 /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-llama_token llama_sampler_sample_mirostat_v2_impl(struct llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, float & mu);
+llama_token llama_sampler_sample_mirostat_v2_impl(struct llama_token_data_array * cur_p, std::mt19937 & rng, float tau, float eta, float & mu);
 
-llama_token llama_sampler_sample_greedy_impl(struct llama_token_data_array * candidates, bool probs);
-llama_token llama_sampler_sample_dist_impl  (struct llama_token_data_array * candidates, std::mt19937 & rng);
+llama_token llama_sampler_sample_greedy_impl(struct llama_token_data_array * cur_p, bool probs);
+llama_token llama_sampler_sample_dist_impl  (struct llama_token_data_array * cur_p, std::mt19937 & rng);
diff --git a/src/llama.cpp b/src/llama.cpp
index 2b54a1ff3..28f406ce2 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20609,6 +20609,10 @@ int32_t llama_chat_apply_template(
 // sampling
 //
 
+struct llama_constraint * llama_constraint_init_softmax() {
+    return llama_constraint_init_softmax_impl();
+}
+
 struct llama_constraint * llama_constraint_init_top_k(int32_t k, int32_t min_keep) {
     return llama_constraint_init_top_k_impl(k, min_keep);
 }
@@ -20675,8 +20679,8 @@ void llama_constraint_accept(struct llama_constraint * cnstr, llama_token token)
     llama_constraint_accept_impl(*cnstr, token);
 }
 
-void llama_constraint_apply(struct llama_constraint * cnstr, llama_token_data_array * candidates) {
-    llama_constraint_apply_impl(*cnstr, candidates);
+void llama_constraint_apply(struct llama_constraint * cnstr, llama_token_data_array * cur_p) {
+    llama_constraint_apply_impl(*cnstr, cur_p);
 }
 
 void llama_constraint_reset(struct llama_constraint * cnstr) {
@@ -20727,21 +20731,21 @@ void llama_sampler_accept(struct llama_sampler * smpl, llama_token token) {
     llama_sampler_accept_impl(*smpl, token);
 }
 
-void llama_sampler_apply(struct llama_sampler * smpl, llama_token_data_array * candidates) {
+void llama_sampler_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     time_meas tm(smpl->t_sample_us);
 
-    if (candidates == nullptr) {
-        candidates = &smpl->cur_p;
+    if (cur_p == nullptr) {
+        cur_p = &smpl->cur_p;
     }
 
-    llama_sampler_apply_impl(*smpl, candidates);
+    llama_sampler_apply_impl(*smpl, cur_p);
 }
 
-llama_token llama_sampler_sample_mirostat(struct llama_sampler * smpl, llama_token_data_array * candidates) {
+llama_token llama_sampler_sample_mirostat(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     time_meas tm(smpl->t_sample_us);
 
-    if (candidates == nullptr) {
-        candidates = &smpl->cur_p;
+    if (cur_p == nullptr) {
+        cur_p = &smpl->cur_p;
     }
 
     const auto type = smpl->params.mirostat;
@@ -20749,7 +20753,7 @@ llama_token llama_sampler_sample_mirostat(struct llama_sampler * smpl, llama_tok
     llama_token res;
 
     if (type == 1) {
-        res = llama_sampler_sample_mirostat_impl(candidates,
+        res = llama_sampler_sample_mirostat_impl(cur_p,
                 smpl->rng,
                 smpl->params.mirostat_tau,
                 smpl->params.mirostat_eta,
@@ -20757,7 +20761,7 @@ llama_token llama_sampler_sample_mirostat(struct llama_sampler * smpl, llama_tok
                 smpl->vocab->n_vocab,
                 smpl->mirostat_mu);
     } else if (type == 2) {
-        res = llama_sampler_sample_mirostat_v2_impl(candidates,
+        res = llama_sampler_sample_mirostat_v2_impl(cur_p,
                 smpl->rng,
                 smpl->params.mirostat_tau,
                 smpl->params.mirostat_eta,
@@ -20771,28 +20775,28 @@ llama_token llama_sampler_sample_mirostat(struct llama_sampler * smpl, llama_tok
     return res;
 }
 
-llama_token llama_sampler_sample_greedy(struct llama_sampler * smpl, llama_token_data_array * candidates, bool probs) {
+llama_token llama_sampler_sample_greedy(struct llama_sampler * smpl, llama_token_data_array * cur_p, bool probs) {
     time_meas tm(smpl->t_sample_us);
 
-    if (candidates == nullptr) {
-        candidates = &smpl->cur_p;
+    if (cur_p == nullptr) {
+        cur_p = &smpl->cur_p;
     }
 
-    auto res = llama_sampler_sample_greedy_impl(candidates, probs);
+    auto res = llama_sampler_sample_greedy_impl(cur_p, probs);
 
     smpl->n_sample++;
 
     return res;
 }
 
-llama_token llama_sampler_sample_dist(struct llama_sampler * smpl, llama_token_data_array * candidates) {
+llama_token llama_sampler_sample_dist(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     time_meas tm(smpl->t_sample_us);
 
-    if (candidates == nullptr) {
-        candidates = &smpl->cur_p;
+    if (cur_p == nullptr) {
+        cur_p = &smpl->cur_p;
     }
 
-    auto res = llama_sampler_sample_dist_impl(candidates, smpl->rng);
+    auto res = llama_sampler_sample_dist_impl(cur_p, smpl->rng);
 
     smpl->n_sample++;
 
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 16eeaa1c8..0c9b46429 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -11,119 +11,125 @@
 #include
 #include
 
-static void dump(const llama_token_data_array * candidates) {
-    for (size_t i = 0; i < candidates->size; i++) {
-        printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);
+static void dump(const llama_token_data_array * cur_p) {
+    for (size_t i = 0; i < cur_p->size; i++) {
+        printf("%d: %f (%f)\n", cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
     }
 }
 
-#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
+#define DUMP(__cur_p) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__cur_p)); printf("-\n"); } while(0)
+
+#define TEST(__cnstr, __cur_p) do {               \
+    auto * cnstr = (__cnstr);                     \
+    llama_constraint_apply(cnstr, (__cur_p));     \
+    llama_constraint_free(cnstr);                 \
+} while(0)
 
 static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
     const size_t n_vocab = probs.size();
 
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    llama_constraint_softmax_impl(&candidates_p);
-    DUMP(&candidates_p);
-    llama_constraint_top_k_impl(&candidates_p, k, 1);
-    DUMP(&candidates_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+    TEST(llama_constraint_init_softmax(), &cur_p);
+    DUMP(&cur_p);
+    TEST(llama_constraint_init_top_k(k, 1), &cur_p);
+    DUMP(&cur_p);
 
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5);
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5);
     }
 }
 
 static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
     const size_t n_vocab = probs.size();
 
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    llama_constraint_softmax_impl(&candidates_p);
-    DUMP(&candidates_p);
-    llama_constraint_top_p_impl(&candidates_p, p, 1);
-    DUMP(&candidates_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+    TEST(llama_constraint_init_softmax(), &cur_p);
+    DUMP(&cur_p);
+    TEST(llama_constraint_init_top_p(p, 1), &cur_p);
+    DUMP(&cur_p);
 
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }
 
 static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
     const size_t n_vocab = probs.size();
 
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    DUMP(&candidates_p);
-    llama_constraint_tail_free_impl(&candidates_p, z, 1);
-    DUMP(&candidates_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+    DUMP(&cur_p);
+    TEST(llama_constraint_init_tail_free(z, 1), &cur_p);
+    DUMP(&cur_p);
 
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }
 
 static void test_min_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
     const size_t n_vocab = probs.size();
 
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    DUMP(&candidates_p);
-    llama_constraint_min_p_impl(&candidates_p, p, 1);
-    DUMP(&candidates_p);
-    llama_constraint_softmax_impl(&candidates_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+    DUMP(&cur_p);
+    TEST(llama_constraint_init_min_p(p, 1), &cur_p);
+    DUMP(&cur_p);
+    TEST(llama_constraint_init_softmax(), &cur_p);
 
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }
 
 static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
     const size_t n_vocab = probs.size();
 
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    DUMP(&candidates_p);
-    llama_constraint_typical_impl(&candidates_p, p, 1);
-    DUMP(&candidates_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+    DUMP(&cur_p);
+    TEST(llama_constraint_init_typical(p, 1), &cur_p);
+    DUMP(&cur_p);
 
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }
 
@@ -135,11 +141,11 @@ static void test_penalties(
 
     const size_t n_vocab = probs.size();
 
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(probs[token_id]);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
     llama_token_cnt token_count;
@@ -147,55 +153,55 @@ static void test_penalties(
         token_count[last_tokens[i]]++;
     }
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    llama_constraint_softmax_impl(&candidates_p);
-    DUMP(&candidates_p);
-    llama_constraint_penalties_impl(&candidates_p, token_count, repeat_penalty, alpha_frequency, alpha_presence);
-    llama_constraint_softmax_impl(&candidates_p);
-    DUMP(&candidates_p);
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+    TEST(llama_constraint_init_softmax(), &cur_p);
+    DUMP(&cur_p);
+    llama_constraint_penalties_impl(&cur_p, token_count, repeat_penalty, alpha_frequency, alpha_presence); // TODO: avoid
+    TEST(llama_constraint_init_softmax(), &cur_p);
+    DUMP(&cur_p);
 
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
     }
 }
 
 static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p
 ) {
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
+    std::vector<llama_token_data> cur;
+    cur.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
         const float logit = logf(token_id);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
 
     llama_token min_token_id = 0;
     const llama_token max_token_id = n_vocab-1;
 
     for (auto s : samplers_sequence) {
         switch (s){
-            case 'k': llama_constraint_top_k_impl(&candidates_p, top_k, 1); break;
+            case 'k': TEST(llama_constraint_init_top_k(top_k, 1), &cur_p); break;
             case 'f': GGML_ABORT("tail_free test not implemented");
             case 'y': GGML_ABORT("typical test not implemented");
-            case 'p': llama_constraint_top_p_impl(&candidates_p, top_p, 1); break;
-            case 'm': llama_constraint_min_p_impl(&candidates_p, min_p, 1); break;
+            case 'p': TEST(llama_constraint_init_top_p(top_p, 1), &cur_p); break;
+            case 'm': TEST(llama_constraint_init_min_p(min_p, 1), &cur_p); break;
            case 't': GGML_ABORT("temperature test not implemented");
            default : GGML_ABORT("Unknown sampler");
         }
 
-        llama_constraint_softmax_impl(&candidates_p); // make sure tokens are sorted for tests
+        TEST(llama_constraint_init_softmax(), &cur_p); // make sure tokens are sorted for tests
 
-        const int size = candidates_p.size;
+        const int size = cur_p.size;
 
         if (s == 'k') {
             const int expected_size = std::min(size, top_k);
             min_token_id = std::max(min_token_id, (llama_token)(n_vocab - top_k));
 
             GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(candidates_p.data[0].id == max_token_id);
-            GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
+            GGML_ASSERT(cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
         } else if (s == 'p') {
             const int softmax_divisor = n_vocab * (n_vocab-1) / 2 - min_token_id * (min_token_id-1) / 2;
             const int softmax_numerator_target = ceilf(top_p * softmax_divisor);
@@ -217,8 +223,8 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
             }
 
             GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(candidates_p.data[0].id == max_token_id);
-            GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
+            GGML_ASSERT(cur_p.data[0].id == max_token_id);
+            GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id);
         } else if (s == 'm') {
             int expected_size = ceilf((1.0f-min_p) * n_vocab);
             expected_size = std::max(expected_size, 1);
@@ -230,8 +236,8 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
             min_token_id = std::min(min_token_id, (llama_token)(n_vocab - 1));
 
             GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(candidates_p.data[0].id ==
max_token_id); - GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id); + GGML_ASSERT(cur_p.data[0].id == max_token_id); + GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id); } else { GGML_ABORT("fatal error"); }
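
Usage sketch (illustration only, not part of the patch): the updated tests exercise the constraint API as init -> apply -> free over a llama_token_data_array. A minimal standalone sketch of that pattern follows, assuming the llama.h declarations from this series; the helper name pick_top_token, the top-k value, and the logit values are made up for the example.

    #include <vector>

    #include "llama.h"

    // Build a candidate array from raw logits, apply top-k followed by softmax,
    // and return the id of the highest-probability surviving candidate.
    static llama_token pick_top_token(const std::vector<float> & logits) {
        std::vector<llama_token_data> cur;
        cur.reserve(logits.size());
        for (llama_token id = 0; id < (llama_token) logits.size(); id++) {
            cur.emplace_back(llama_token_data{id, logits[id], 0.0f});
        }

        llama_token_data_array cur_p = { cur.data(), cur.size(), false };

        // each constraint is created, applied to cur_p in place, then freed
        struct llama_constraint * cnstr = llama_constraint_init_top_k(10, 1);
        llama_constraint_apply(cnstr, &cur_p);
        llama_constraint_free(cnstr);

        // softmax normalizes the probabilities and sorts the candidates,
        // so the best token ends up at index 0 (as the tests above rely on)
        cnstr = llama_constraint_init_softmax();
        llama_constraint_apply(cnstr, &cur_p);
        llama_constraint_free(cnstr);

        return cur_p.data[0].id;
    }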