diff --git a/common/common.cpp b/common/common.cpp
index f81f4d354..b7ed4827d 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -218,6 +218,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             sparams.top_p = std::stof(argv[i]);
+        } else if (arg == "--min-p") { // Adding min_p argument
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.min_p = std::stof(argv[i]); // Parsing and setting the min_p value from command line
         } else if (arg == "--temp") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -679,6 +685,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
     printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
+    printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
     printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
     printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
     printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
@@ -1275,6 +1282,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+    fprintf(stream, "min_p: %f # default: 0.05\n", sparams.min_p);
     fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }
diff --git a/common/sampling.cpp b/common/sampling.cpp
index c4996c985..673d67a6d 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -89,10 +89,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
 
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
             params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
-            params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp,
+            params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
             params.mirostat, params.mirostat_eta, params.mirostat_tau);
 
     return std::string(result);
@@ -110,6 +110,7 @@ llama_token llama_sampling_sample(
     const float   temp            = params.temp;
     const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
     const float   top_p           = params.top_p;
+    const float   min_p           = params.min_p;
     const float   tfs_z           = params.tfs_z;
     const float   typical_p      = params.typical_p;
     const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
@@ -190,6 +191,7 @@ llama_token llama_sampling_sample(
             llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep);
             llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep);
             llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep);
+            llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep);
             llama_sample_temp     (ctx_main, &cur_p, temp);
 
             id = llama_sample_token(ctx_main, &cur_p);
diff --git a/common/sampling.h b/common/sampling.h
index 62ea6d4cf..7c9b8dcf2 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -14,6 +14,7 @@ typedef struct llama_sampling_params {
     int32_t     n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
     int32_t     top_k             = 40;    // <= 0 to use vocab size
     float       top_p             = 0.95f; // 1.0 = disabled
+    float       min_p             = 0.05f; // 0.0 = disabled
     float       tfs_z             = 1.00f; // 1.0 = disabled
     float       typical_p         = 1.00f; // 1.0 = disabled
     float       temp              = 0.80f; // 1.0 = disabled
diff --git a/llama.cpp b/llama.cpp
index 47b411bd6..f926b99f5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7328,112 +7328,8 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can
     }
 }
 
-void read_or_write_base_min_p(float* base_min_p) {
-    // Define the filename
-    const std::string filename = "SamplerBaseMinP.txt";
-
-    // Check if the file exists
-    std::ifstream infile(filename);
-    if (!infile.good()) {
-        // File doesn't exist, create it with default values
-        std::ofstream outfile(filename);
-        if (outfile.is_open()) {
-            // Set the default value for base_min_p
-            *base_min_p = 0.05f;
-            outfile << "base_min_p = " << *base_min_p << "\n";
-            outfile.close();
-        } else {
-            // Handle the error during file opening or writing here (optional)
-            // For example, you might want to set a safe default value or notify the user
-        }
-    } else {
-        // File exists, read the values from it
-        std::string line;
-        while (getline(infile, line)) {
-            std::istringstream iss(line);
-            std::string key;
-            float value;
-            char equals; // Using char to read the '=' character
-
-            // Read each line in the format key = value
-            if (iss >> key >> equals >> value && equals == '=' && key == "base_min_p") {
-                *base_min_p = value;
-            }
-            // Note: If the key doesn't match, or the format is incorrect,
-            // you might want to handle the error or set a default value
-        }
-        infile.close();
-    }
-}
-
 void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    // Variables for the special mode
-    float base_min_p; // This will hold the base minimum probability value
-    float multiplied_min_p; // This will hold the adjusted minimum probability threshold
-
-    // If p is 1.0, we switch to a different sampling mode.
     if (p >= 1.0f) {
-        printf("returning, top p set to 1.0");
-        return;
-    }
-
-    // If p is ~0.02, we switch to the Min P sampler
-    if (p >= 0.01f && p <= 0.03f) {
-        printf("USING MIN P SAMPLING MODE\n");
-
-        // Ensure the probabilities are calculated.
-        llama_sample_softmax(ctx, candidates);
-
-        // Print the top tokens before filtering
-        printf("Top tokens before filtering:\n");
-        for (size_t i = 0; i < candidates->size && i < 10; ++i) {
-            printf("Token %zu: %.6f%%\n", i + 1, candidates->data[i].p * 100); // Multiplying by 100 to convert to percentage
-        }
-
-        base_min_p = 0.05; // For example, 5% as the base minimum probability before reading the text value
-
-        read_or_write_base_min_p(&base_min_p);
-
-        // Calculate the multiplication factor based on the highest scoring token.
-        float multiplication_factor = candidates->data[0].p; // Assuming the probabilities are sorted
-        printf("Highest scoring token probability (multiplication factor): %f\n", multiplication_factor);
-
-        // Calculate the dynamic threshold.
-        multiplied_min_p = base_min_p * multiplication_factor;
-        printf("Base min_p value: %f\n", base_min_p);
-        printf("Calculated multiplied_min_p (threshold) value: %f\n", multiplied_min_p);
-
-        // Store the tokens that meet the threshold in a new list.
-        std::vector<llama_token_data> filtered_candidates;
-        filtered_candidates.reserve(candidates->size); // Reserve to avoid multiple reallocations
-
-        // Variable to count how many tokens meet the condition
-        int count_qualifying_tokens = 0;
-
-        for (size_t i = 0; i < candidates->size; ++i) {
-            // If a token's probability is above the threshold, we keep it.
-            if (candidates->data[i].p >= multiplied_min_p) {
-                filtered_candidates.push_back(candidates->data[i]);
-                ++count_qualifying_tokens; // Increase count
-            }
-        }
-
-        // Debug information about how many tokens were retained
-        printf("Number of tokens that met the multiplied_min_p condition: %d\n", count_qualifying_tokens);
-
-        llama_sample_softmax(ctx, candidates); // re-normalize after pruning
-
-        // Print the top tokens after filtering
-        printf("Tokens after filtering:\n");
-        for (size_t i = 0; i < filtered_candidates.size() && i < 10; ++i) { // Adjust 10 to however many top tokens you want to display
-            printf("Token %zu: %.6f%%\n", i + 1, filtered_candidates[i].p * 100); // Multiplying by 100 to convert to percentage
-        }
-
-        // Now we replace the original candidates with the filtered list.
-        std::copy(filtered_candidates.begin(), filtered_candidates.end(), candidates->data);
-        candidates->size = filtered_candidates.size();
-
-        // Since we're not actually sampling below a certain 'p', we return from the function after this.
         return;
     }
 
@@ -7464,6 +7360,61 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
     }
 }
 
+void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
+    float base_min_p = p;   // This will hold the base minimum probability value
+    float multiplied_min_p; // This will hold the adjusted minimum probability threshold
+
+    printf("\nUSING MIN P SAMPLING MODE\n\n");
+
+    // Ensure the probabilities are calculated.
+    llama_sample_softmax(ctx, candidates);
+
+    // Print the top tokens before filtering
+    printf("Top tokens before filtering:\n");
+    for (size_t i = 0; i < candidates->size && i < 10; ++i) {
+        printf("Token %zu: %.6f%%\n", i + 1, candidates->data[i].p * 100); // Multiplying by 100 to convert to percentage
+    }
+
+    // Calculate the multiplication factor based on the highest scoring token.
+    float multiplication_factor = candidates->data[0].p; // Assuming the probabilities are sorted
+    printf("Highest scoring token probability (multiplication factor): %f\n", multiplication_factor);
+
+    // Calculate the dynamic threshold.
+    multiplied_min_p = base_min_p * multiplication_factor;
+    printf("Base min_p value: %f\n", base_min_p);
+    printf("Calculated multiplied_min_p (threshold) value: %f\n", multiplied_min_p);
+
+    // Store the tokens that meet the threshold in a new list.
+    std::vector<llama_token_data> filtered_candidates;
+    filtered_candidates.reserve(candidates->size); // Reserve to avoid multiple reallocations
+
+    // Variable to count how many tokens meet the condition
+    int count_qualifying_tokens = 0;
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        // If a token's probability is above the threshold, we keep it.
+        if (candidates->data[i].p >= multiplied_min_p) {
+            filtered_candidates.push_back(candidates->data[i]);
+            ++count_qualifying_tokens; // Increase count
+        }
+    }
+
+    // Debug information about how many tokens were retained
+    printf("Number of tokens that met the multiplied_min_p condition: %d\n", count_qualifying_tokens);
+
+    // Print the top tokens after filtering
+    printf("Tokens after filtering:\n\n");
+    for (size_t i = 0; i < filtered_candidates.size() && i < 10; ++i) { // Adjust 10 to however many top tokens you want to display
+        printf("Token %zu: %.6f%%\n", i + 1, filtered_candidates[i].p * 100); // Multiplying by 100 to convert to percentage
+    }
+
+    // Now we replace the original candidates with the filtered list.
+    std::copy(filtered_candidates.begin(), filtered_candidates.end(), candidates->data);
+    candidates->size = filtered_candidates.size();
+
+    return;
+}
+
 void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
     if (z >= 1.0f || candidates->size <= 2) {
         return;
diff --git a/llama.h b/llama.h
index d901dcd91..0ec28eb14 100644
--- a/llama.h
+++ b/llama.h
@@ -600,6 +600,13 @@ extern "C" {
             float   p,
             size_t  min_keep);
 
+    /// @details Minimum P sampling by Kalomaze
+    LLAMA_API void llama_sample_min_p(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+                           float   p,
+                          size_t   min_keep);
+
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
     LLAMA_API void llama_sample_tail_free(
             struct llama_context * ctx,
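For readers skimming the patch: stripped of the debug printf calls, the rule `llama_sample_min_p` applies is simply "keep every token whose probability is at least `p` times the probability of the most likely token". Below is a minimal standalone sketch of that rule; the function and variable names are illustrative only, not part of the llama.cpp API.

```cpp
#include <cstdio>
#include <vector>

// Hypothetical helper: keep every probability that is at least `min_p`
// times the largest one. Assumes `probs` is softmax-normalized and sorted
// in descending order, as candidates are after llama_sample_softmax.
static std::vector<float> min_p_filter(const std::vector<float> & probs, float min_p) {
    if (probs.empty() || min_p <= 0.0f) {
        return probs; // 0.0 = disabled, mirroring the default's comment in sampling.h
    }
    const float threshold = min_p * probs[0]; // scale the cutoff by the top token
    std::vector<float> kept;
    for (float p : probs) {
        if (p >= threshold) {
            kept.push_back(p);
        }
    }
    return kept;
}

int main() {
    // With min_p = 0.05 and a top token at 60%, the cutoff is 0.05 * 0.60 = 0.03.
    const std::vector<float> probs = {0.60f, 0.20f, 0.10f, 0.05f, 0.03f, 0.02f};
    for (float p : min_p_filter(probs, 0.05f)) {
        printf("%.2f\n", p); // prints 0.60 through 0.03; the 0.02 tail is pruned
    }
    return 0;
}
```

Note that `min_keep` is accepted but never referenced in this version of the sampler, so a high threshold can in principle prune the candidate list down to a single token.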