diff --git a/common/common.cpp b/common/common.cpp
index 5ff8c579d..44893a219 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -218,12 +218,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             sparams.top_p = std::stof(argv[i]);
-        } else if (arg == "--min-p") { // Adding min_p argument
+        } else if (arg == "--min-p") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            sparams.min_p = std::stof(argv[i]); // Parsing and setting the min_p value from command line
+            sparams.min_p = std::stof(argv[i]);
         } else if (arg == "--temp") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
@@ -685,7 +685,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
     printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
-    printf("  --min-p N             min-p sampling (default: %.2f, 0.0 = disabled)\n", (double)sparams.min_p);
+    printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
     printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
     printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
     printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
diff --git a/common/sampling.cpp b/common/sampling.cpp
index bd4f34b9e..673d67a6d 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -190,11 +190,8 @@ llama_token llama_sampling_sample(
             llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep);
             llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep);
             llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep);
-            if (min_p != 0.0) {
-                llama_sample_min_p(ctx_main, &cur_p, min_p, min_keep);
-            } else {
-                llama_sample_top_p(ctx_main, &cur_p, top_p, min_keep);
-            }
+            llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep);
+            llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep);
             llama_sample_temp     (ctx_main, &cur_p, temp);

             id = llama_sample_token(ctx_main, &cur_p);
diff --git a/llama.cpp b/llama.cpp
index 07ec721e2..1f27016c6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7361,44 +7361,29 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
 }

 void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    float base_min_p = p; // This will hold the base minimum probability value
-    float multiplied_min_p; // This will hold the adjusted minimum probability threshold
+    if (p <= 0.0f || !candidates->size) {
+        return;
+    }

-    // Ensure the probabilities are calculated.
     llama_sample_softmax(ctx, candidates);

-    // Calculate the multiplication factor based on the highest scoring token.
-    float multiplication_factor = candidates->data[0].p;
+    const int64_t t_start_sample_us = ggml_time_us();

-    // Calculate the minimum percentage requirement.
-    multiplied_min_p = base_min_p * multiplication_factor;
+    float  scale = candidates->data[0].p; // scale by max prob
+    size_t i     = 1;                     // first token always matches

-    // Store the tokens that meet the threshold in a new list.
-    std::vector<llama_token_data> filtered_candidates;
-    filtered_candidates.reserve(candidates->size); // Reserve to avoid multiple reallocations
-
-    for (size_t i = 0; i < candidates->size; ++i) {
-        // If a token's probability is above the threshold or if we haven't kept enough tokens yet
-        if (candidates->data[i].p >= multiplied_min_p || filtered_candidates.size() < min_keep) {
-            filtered_candidates.push_back(candidates->data[i]);
+    for (; i < candidates->size; ++i) {
+        if (candidates->data[i].p < p * scale && i >= min_keep) {
+            break; // prob too small
         }
     }

-    // If not enough candidates meet the threshold, take the top 'min_keep' ones
-    if (filtered_candidates.size() < min_keep) {
-        std::sort(candidates->data, candidates->data + candidates->size,
-                  [](const llama_token_data & a, const llama_token_data & b) {
-                      return a.p > b.p; // Sort by probability in descending order
-                  });
-        filtered_candidates.clear(); // Clear the previously filtered candidates
-        for (size_t i = 0; i < min_keep; ++i) {
-            filtered_candidates.push_back(candidates->data[i]);
-        }
-    }
+    // Resize the output vector to keep only the matching tokens
+    candidates->size = i;

-    // Now we replace the original candidates with the filtered list.
-    std::copy(filtered_candidates.begin(), filtered_candidates.end(), candidates->data);
-    candidates->size = filtered_candidates.size();
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
 }

 void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
diff --git a/llama.h b/llama.h
index 62addffdf..157d75575 100644
--- a/llama.h
+++ b/llama.h
@@ -600,7 +600,7 @@ extern "C" {
                            float   p,
                           size_t   min_keep);

-    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841#issue-1966758357
+    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
     LLAMA_API void llama_sample_min_p(
             struct llama_context * ctx,
           llama_token_data_array * candidates,
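
For reference, here is a minimal standalone sketch of the thresholding rule that the simplified `llama_sample_min_p` above implements: after softmax, the cutoff is `p` scaled by the probability of the top token, the top token always survives, and the list is never cut below `min_keep` entries. The helper `min_p_cutoff` and the `probs` vector are hypothetical names used only for illustration; they are not part of the llama.cpp API.

```cpp
// Illustration only: min-p filtering over a list of probabilities that is
// already softmax-normalized and sorted in descending order, as
// llama_sample_softmax guarantees in the patch above.
#include <cstddef>
#include <cstdio>
#include <vector>

static size_t min_p_cutoff(const std::vector<float> & probs, float min_p, size_t min_keep) {
    if (min_p <= 0.0f || probs.empty()) {
        return probs.size(); // keep everything, matching the early-out in the patch
    }

    const float threshold = min_p * probs[0]; // scale the threshold by the top probability

    size_t i = 1; // the top token always survives
    for (; i < probs.size(); ++i) {
        if (probs[i] < threshold && i >= min_keep) {
            break; // everything from here on is below the scaled threshold
        }
    }
    return i; // number of tokens to keep
}

int main() {
    // Sorted, normalized probabilities for a toy vocabulary.
    const std::vector<float> probs = {0.50f, 0.25f, 0.15f, 0.06f, 0.03f, 0.01f};

    // With min_p = 0.1 the threshold is 0.1 * 0.50 = 0.05, so four tokens survive.
    const size_t kept = min_p_cutoff(probs, 0.1f, 1);
    std::printf("kept %zu of %zu tokens\n", kept, probs.size());
    return 0;
}
```

Note that with this patch the chain in `llama_sampling_sample` always applies top-p followed by min-p instead of choosing one of the two based on `min_p != 0.0`; either filter can still be disabled through its neutral value (1.0 for top-p, 0.0 for min-p, which hits the early return in `llama_sample_min_p`).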