Transform Min P into a proper CLI option
parent a9e2b74f1a
commit a235a0d226

5 changed files with 75 additions and 106 deletions

@@ -218,6 +218,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             sparams.top_p = std::stof(argv[i]);
+        } else if (arg == "--min-p") { // Adding min_p argument
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.min_p = std::stof(argv[i]); // Parsing and setting the min_p value from command line
         } else if (arg == "--temp") {
             if (++i >= argc) {
                 invalid_param = true;

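The new flag follows the same pattern as the existing sampling options: consume the next argv entry, convert it with std::stof, and store it on the sampling params. A minimal standalone sketch of that pattern (the parse_min_p helper below is hypothetical, not part of this commit):

    #include <string>

    // Hypothetical helper mirroring the gpt_params_parse pattern above:
    // "--min-p 0.05" consumes the following argument and converts it to float.
    static bool parse_min_p(int argc, char ** argv, float & min_p) {
        for (int i = 1; i < argc; ++i) {
            if (std::string(argv[i]) == "--min-p") {
                if (++i >= argc) {
                    return false;           // missing value -> invalid parameter
                }
                min_p = std::stof(argv[i]); // e.g. "--min-p 0.05"
            }
        }
        return true;
    }
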
@@ -679,6 +685,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
     printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
+    printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
     printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
     printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
     printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);

@@ -1275,6 +1282,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+    fprintf(stream, "min_p: %f # default: 0.05\n", sparams.min_p);
     fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }

@@ -89,10 +89,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
 
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
             params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
-            params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp,
+            params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
             params.mirostat, params.mirostat_eta, params.mirostat_tau);
 
     return std::string(result);

@@ -110,6 +110,7 @@ llama_token llama_sampling_sample(
     const float   temp            = params.temp;
     const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
     const float   top_p           = params.top_p;
+    const float   min_p           = params.min_p;
     const float   tfs_z           = params.tfs_z;
     const float   typical_p       = params.typical_p;
     const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;

@@ -190,6 +191,7 @@ llama_token llama_sampling_sample(
     llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep);
     llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep);
     llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep);
+    llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep);
     llama_sample_temp     (ctx_main, &cur_p, temp);
 
     id = llama_sample_token(ctx_main, &cur_p);

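The rule implemented by the new llama_sample_min_p step can be stated simply: keep a token only if its probability is at least min_p times the probability of the most likely token. A self-contained sketch of that rule on a plain probability vector (illustrative only; the actual implementation operates on llama_token_data_array and appears further below):

    #include <algorithm>
    #include <vector>

    // Illustrative min-p filter: prune every entry whose probability falls
    // below min_p * p_max. Assumes probs are already normalized.
    static void min_p_filter(std::vector<float> & probs, float min_p) {
        if (probs.empty() || min_p <= 0.0f) {
            return; // 0.0 = disabled, matching the default in the params struct
        }
        const float p_max     = *std::max_element(probs.begin(), probs.end());
        const float threshold = min_p * p_max;
        probs.erase(std::remove_if(probs.begin(), probs.end(),
                                   [threshold](float p) { return p < threshold; }),
                    probs.end());
    }
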
@@ -14,6 +14,7 @@ typedef struct llama_sampling_params {
     int32_t n_probs   = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
     int32_t top_k     = 40;    // <= 0 to use vocab size
     float   top_p     = 0.95f; // 1.0 = disabled
+    float   min_p     = 0.05f; // 0.0 = disabled
     float   tfs_z     = 1.00f; // 1.0 = disabled
     float   typical_p = 1.00f; // 1.0 = disabled
     float   temp      = 0.80f; // 1.0 = disabled

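Given the struct above, overriding the default from calling code looks roughly like this (a sketch only; the include path and the 0.10f value are illustrative):

    #include "sampling.h" // path assumed; header declaring llama_sampling_params (see hunk above)

    int main() {
        llama_sampling_params sparams; // defaults as declared above (min_p = 0.05f)
        sparams.top_p = 1.00f;         // 1.0 = disabled
        sparams.min_p = 0.10f;         // keep tokens with >= 10% of the top token's probability
        return 0;
    }
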
llama.cpp (159 lines changed)

@@ -7328,112 +7328,8 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can
     }
 }
 
-void read_or_write_base_min_p(float* base_min_p) {
-    // Define the filename
-    const std::string filename = "SamplerBaseMinP.txt";
-
-    // Check if the file exists
-    std::ifstream infile(filename);
-    if (!infile.good()) {
-        // File doesn't exist, create it with default values
-        std::ofstream outfile(filename);
-        if (outfile.is_open()) {
-            // Set the default value for base_min_p
-            *base_min_p = 0.05f;
-            outfile << "base_min_p = " << *base_min_p << "\n";
-            outfile.close();
-        } else {
-            // Handle the error during file opening or writing here (optional)
-            // For example, you might want to set a safe default value or notify the user
-        }
-    } else {
-        // File exists, read the values from it
-        std::string line;
-        while (getline(infile, line)) {
-            std::istringstream iss(line);
-            std::string key;
-            float value;
-            char equals; // Using char to read the '=' character
-
-            // Read each line in the format key = value
-            if (iss >> key >> equals >> value && equals == '=' && key == "base_min_p") {
-                *base_min_p = value;
-            }
-            // Note: If the key doesn't match, or the format is incorrect,
-            // you might want to handle the error or set a default value
-        }
-        infile.close();
-    }
-}
-
 void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    // Variables for the special mode
-    float base_min_p;       // This will hold the base minimum probability value
-    float multiplied_min_p; // This will hold the adjusted minimum probability threshold
-
-    // If p is 1.0, we switch to a different sampling mode.
     if (p >= 1.0f) {
-        printf("returning, top p set to 1.0");
         return;
     }
 
-    // If p is ~0.02, we switch to the Min P sampler
-    if (p >= 0.01f && p <= 0.03f) {
-        printf("USING MIN P SAMPLING MODE\n");
-
-        // Ensure the probabilities are calculated.
-        llama_sample_softmax(ctx, candidates);
-
-        // Print the top tokens before filtering
-        printf("Top tokens before filtering:\n");
-        for (size_t i = 0; i < candidates->size && i < 10; ++i) {
-            printf("Token %zu: %.6f%%\n", i + 1, candidates->data[i].p * 100); // Multiplying by 100 to convert to percentage
-        }
-
-        base_min_p = 0.05; // For example, 5% as the base minimum probability before reading the text value
-
-        read_or_write_base_min_p(&base_min_p);
-
-        // Calculate the multiplication factor based on the highest scoring token.
-        float multiplication_factor = candidates->data[0].p; // Assuming the probabilities are sorted
-        printf("Highest scoring token probability (multiplication factor): %f\n", multiplication_factor);
-
-        // Calculate the dynamic threshold.
-        multiplied_min_p = base_min_p * multiplication_factor;
-        printf("Base min_p value: %f\n", base_min_p);
-        printf("Calculated multiplied_min_p (threshold) value: %f\n", multiplied_min_p);
-
-        // Store the tokens that meet the threshold in a new list.
-        std::vector<llama_token_data> filtered_candidates;
-        filtered_candidates.reserve(candidates->size); // Reserve to avoid multiple reallocations
-
-        // Variable to count how many tokens meet the condition
-        int count_qualifying_tokens = 0;
-
-        for (size_t i = 0; i < candidates->size; ++i) {
-            // If a token's probability is above the threshold, we keep it.
-            if (candidates->data[i].p >= multiplied_min_p) {
-                filtered_candidates.push_back(candidates->data[i]);
-                ++count_qualifying_tokens; // Increase count
-            }
-        }
-
-        // Debug information about how many tokens were retained
-        printf("Number of tokens that met the multiplied_min_p condition: %d\n", count_qualifying_tokens);
-
-        llama_sample_softmax(ctx, candidates); // re-normalize after pruning
-
-        // Print the top tokens after filtering
-        printf("Tokens after filtering:\n");
-        for (size_t i = 0; i < filtered_candidates.size() && i < 10; ++i) { // Adjust 10 to however many top tokens you want to display
-            printf("Token %zu: %.6f%%\n", i + 1, filtered_candidates[i].p * 100); // Multiplying by 100 to convert to percentage
-        }
-
-        // Now we replace the original candidates with the filtered list.
-        std::copy(filtered_candidates.begin(), filtered_candidates.end(), candidates->data);
-        candidates->size = filtered_candidates.size();
-
-        // Since we're not actually sampling below a certain 'p', we return from the function after this.
-        return;
-    }

@@ -7464,6 +7360,61 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
     }
 }
 
+void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
+    float base_min_p = p;   // This will hold the base minimum probability value
+    float multiplied_min_p; // This will hold the adjusted minimum probability threshold
+
+    printf("\nUSING MIN P SAMPLING MODE\n\n");
+
+    // Ensure the probabilities are calculated.
+    llama_sample_softmax(ctx, candidates);
+
+    // Print the top tokens before filtering
+    printf("Top tokens before filtering:\n");
+    for (size_t i = 0; i < candidates->size && i < 10; ++i) {
+        printf("Token %zu: %.6f%%\n", i + 1, candidates->data[i].p * 100); // Multiplying by 100 to convert to percentage
+    }
+
+    // Calculate the multiplication factor based on the highest scoring token.
+    float multiplication_factor = candidates->data[0].p; // Assuming the probabilities are sorted
+    printf("Highest scoring token probability (multiplication factor): %f\n", multiplication_factor);
+
+    // Calculate the dynamic threshold.
+    multiplied_min_p = base_min_p * multiplication_factor;
+    printf("Base min_p value: %f\n", base_min_p);
+    printf("Calculated multiplied_min_p (threshold) value: %f\n", multiplied_min_p);
+
+    // Store the tokens that meet the threshold in a new list.
+    std::vector<llama_token_data> filtered_candidates;
+    filtered_candidates.reserve(candidates->size); // Reserve to avoid multiple reallocations
+
+    // Variable to count how many tokens meet the condition
+    int count_qualifying_tokens = 0;
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        // If a token's probability is above the threshold, we keep it.
+        if (candidates->data[i].p >= multiplied_min_p) {
+            filtered_candidates.push_back(candidates->data[i]);
+            ++count_qualifying_tokens; // Increase count
+        }
+    }
+
+    // Debug information about how many tokens were retained
+    printf("Number of tokens that met the multiplied_min_p condition: %d\n", count_qualifying_tokens);
+
+    // Print the top tokens after filtering
+    printf("Tokens after filtering:\n\n");
+    for (size_t i = 0; i < filtered_candidates.size() && i < 10; ++i) { // Adjust 10 to however many top tokens you want to display
+        printf("Token %zu: %.6f%%\n", i + 1, filtered_candidates[i].p * 100); // Multiplying by 100 to convert to percentage
+    }
+
+    // Now we replace the original candidates with the filtered list.
+    std::copy(filtered_candidates.begin(), filtered_candidates.end(), candidates->data);
+    candidates->size = filtered_candidates.size();
+
+    return;
+}
+
 void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
     if (z >= 1.0f || candidates->size <= 2) {
         return;

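The new function buffers the surviving tokens in a temporary vector and copies them back into the candidates array. A behavior-equivalent in-place variant, shown here only as a sketch (not part of this commit, and without the debug printf output), could compact candidates->data directly:

    #include "llama.h"

    // Sketch: same min-p pruning as above, done in place. Assumes
    // llama_sample_softmax() has already normalized the probabilities and
    // sorted candidates by descending p, so data[0].p is the maximum.
    static void min_p_prune_in_place(llama_token_data_array * candidates, float base_min_p) {
        const float threshold = base_min_p * candidates->data[0].p;
        size_t kept = 0;
        for (size_t i = 0; i < candidates->size; ++i) {
            if (candidates->data[i].p >= threshold) {
                candidates->data[kept++] = candidates->data[i];
            }
        }
        candidates->size = kept;
    }
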
llama.h (7 lines changed)

@@ -600,6 +600,13 @@ extern "C" {
                              float   p,
                             size_t   min_keep);
 
+    /// @details Minimum P sampling by Kalomaze
+    LLAMA_API void llama_sample_min_p(
+              struct llama_context * ctx,
+            llama_token_data_array * candidates,
+                             float   p,
+                            size_t   min_keep);
+
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
     LLAMA_API void llama_sample_tail_free(
               struct llama_context * ctx,

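With the declaration in place, a caller can invoke the sampler through the public API. A usage sketch, assuming ctx is a valid llama_context * and cur_p is a llama_token_data_array already filled for the current step (the wrapper function and its name are illustrative, not part of the commit):

    #include "llama.h"

    // Sketch: prune with min-p, apply temperature, then sample one token,
    // mirroring the order used in llama_sampling_sample above.
    static llama_token sample_with_min_p(struct llama_context * ctx,
                                         llama_token_data_array * cur_p,
                                         float min_p, float temp, size_t min_keep) {
        llama_sample_min_p(ctx, cur_p, min_p, min_keep); // drop low-probability tokens
        llama_sample_temp (ctx, cur_p, temp);            // then scale by temperature
        return llama_sample_token(ctx, cur_p);           // and pick a token
    }
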