cleanup

commit 69e638e56a
parent fcbbfc1666

4 changed files with 19 additions and 34 deletions
common/common.cpp
@@ -218,12 +218,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             sparams.top_p = std::stof(argv[i]);
-        } else if (arg == "--min-p") { // Adding min_p argument
+        } else if (arg == "--min-p") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            sparams.min_p = std::stof(argv[i]); // Parsing and setting the min_p value from command line
+            sparams.min_p = std::stof(argv[i]);
         } else if (arg == "--temp") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -685,7 +685,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
     printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
-    printf("  --min-p N             min-p sampling (default: %.2f, 0.0 = disabled)\n", (double)sparams.min_p);
+    printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
     printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
     printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
     printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
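With the parsing and usage changes above, min-p can be set from the command line like any other sampler parameter. A hypothetical invocation (model path, prompt, and values are illustrative only, not from the commit):

./main -m models/7B/ggml-model.gguf -p "Once upon a time" --min-p 0.05 --top-p 1.0

Passing --top-p 1.0 disables top-p per the usage text above, leaving min-p as the active truncation sampler.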
common/sampling.cpp
@@ -191,7 +191,7 @@ llama_token llama_sampling_sample(
         llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep);
         llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep);
         if (min_p != 0.0) {
-        llama_sample_min_p(ctx_main, &cur_p, min_p, min_keep);
+            llama_sample_min_p(ctx_main, &cur_p, min_p, min_keep);
         } else {
             llama_sample_top_p(ctx_main, &cur_p, top_p, min_keep);
         }
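The branch above makes min-p a substitute for top-p rather than a second filter: whenever min_p is nonzero, the top-p call is skipped. The rule itself keeps every token whose probability is at least min_p times the probability of the most likely token, so the cutoff adapts to how peaked the distribution is. A standalone toy sketch of that arithmetic with made-up numbers (not part of the commit):

// Toy illustration of the min-p rule; the probabilities are hypothetical.
// A token survives iff its probability >= min_p * p_max.
#include <cstdio>
#include <vector>

int main() {
    const float min_p = 0.05f;                                            // base threshold
    const std::vector<float> probs = {0.60f, 0.25f, 0.10f, 0.04f, 0.01f}; // sorted descending
    const float cutoff = min_p * probs[0];                                // 0.05 * 0.60 = 0.03

    size_t kept = 1; // the top token always survives
    while (kept < probs.size() && probs[kept] >= cutoff) {
        kept++;
    }
    std::printf("cutoff = %.3f, kept %zu of %zu tokens\n", cutoff, kept, probs.size());
    // prints: cutoff = 0.030, kept 4 of 5 tokens (only the 0.01 tail is dropped)
    return 0;
}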
llama.cpp
@@ -7361,44 +7361,29 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
 }
 
 void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    float base_min_p = p; // This will hold the base minimum probability value
-    float multiplied_min_p; // This will hold the adjusted minimum probability threshold
     if (p <= 0.0f || !candidates->size) {
         return;
     }
 
     // Ensure the probabilities are calculated.
     llama_sample_softmax(ctx, candidates);
 
-    // Calculate the multiplication factor based on the highest scoring token.
-    float multiplication_factor = candidates->data[0].p;
     const int64_t t_start_sample_us = ggml_time_us();
 
-    // Calculate the minimum percentage requirement.
-    multiplied_min_p = base_min_p * multiplication_factor;
+    float scale = candidates->data[0].p; // scale by max prob
+    size_t i = 1; // first token always matches
 
-    // Store the tokens that meet the threshold in a new list.
-    std::vector<llama_token_data> filtered_candidates;
-    filtered_candidates.reserve(candidates->size); // Reserve to avoid multiple reallocations
-
-    for (size_t i = 0; i < candidates->size; ++i) {
-        // If a token's probability is above the threshold or if we haven't kept enough tokens yet
-        if (candidates->data[i].p >= multiplied_min_p || filtered_candidates.size() < min_keep) {
-            filtered_candidates.push_back(candidates->data[i]);
+    for (; i < candidates->size; ++i) {
+        if (candidates->data[i].p < p * scale && i >= min_keep) {
+            break; // prob too small
         }
     }
 
-    // If not enough candidates meet the threshold, take the top 'min_keep' ones
-    if (filtered_candidates.size() < min_keep) {
-        std::sort(candidates->data, candidates->data + candidates->size,
-                  [](const llama_token_data & a, const llama_token_data & b) {
-                      return a.p > b.p; // Sort by probability in descending order
-                  });
-        filtered_candidates.clear(); // Clear the previously filtered candidates
-        for (size_t i = 0; i < min_keep; ++i) {
-            filtered_candidates.push_back(candidates->data[i]);
-        }
-    }
+    // Resize the output vector to keep only the matching tokens
+    candidates->size = i;
 
-    // Now we replace the original candidates with the filtered list.
-    std::copy(filtered_candidates.begin(), filtered_candidates.end(), candidates->data);
-    candidates->size = filtered_candidates.size();
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     }
 }
 
 void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
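The rewrite can truncate in place because llama_sample_softmax sorts the candidates by descending logit before filling in the probabilities, so filtering reduces to finding the first index that falls below the cutoff and shrinking candidates->size. A hedged sketch of driving the function directly (not from the commit; it assumes only llama.h from this revision, and passes NULL for the context, which the function tolerates since ctx is used solely for timing stats):

// Exercise llama_sample_min_p on a hand-built candidate list.
#include "llama.h"

#include <cstdio>
#include <vector>

int main() {
    // Three fake tokens with preset logits; the p fields are filled in by the
    // softmax that llama_sample_min_p runs internally.
    std::vector<llama_token_data> tokens = {
        { 0,  2.0f, 0.0f },
        { 1,  1.0f, 0.0f },
        { 2, -3.0f, 0.0f },
    };
    llama_token_data_array cur_p = { tokens.data(), tokens.size(), false };

    llama_sample_min_p(NULL, &cur_p, 0.05f /* p */, 1 /* min_keep */);

    // With these logits the third token ends up below 0.05 * p_max and is cut.
    std::printf("kept %zu of 3 candidates\n", cur_p.size);
    return 0;
}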
llama.h
@@ -600,7 +600,7 @@ extern "C" {
                            float   p,
                           size_t   min_keep);
 
-    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841#issue-1966758357
+    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
     LLAMA_API void llama_sample_min_p(
             struct llama_context * ctx,
           llama_token_data_array * candidates,