implemented dynamic temperature sampling from koboldcpp

2024-01-16 20:58:41 +09:00 · 2024-01-16 20:58:41 +09:00 · ea15108462
commit ea15108462
parent 3e5ca7931c
4 changed files with 97 additions and 1 deletions
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -129,6 +129,7 @@ static void sampler_queue(
    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
    const float         temp              = params.temp;
    const float         dynatemp_range    = params.dynatemp_range;
    const int32_t       top_k             = params.top_k <= 0 ? n_vocab : params.top_k;
    const float         top_p             = params.top_p;
    const float         min_p             = params.min_p;
@ -143,7 +144,25 @@ static void sampler_queue(
            case 'y': llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
            case 'p': llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
            case 'm': llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
-            case 't': llama_sample_temp     (ctx_main, &cur_p, temp); break;
+            
            case 't':
                if (dynatemp_range>0)
                {
                    float dynatemp_min = temp - dynatemp_range;
                    float dynatemp_max = temp + dynatemp_range;
                    //do not allow negative values
                    dynatemp_min = dynatemp_min<0?0:dynatemp_min;
                    dynatemp_max = dynatemp_max<0?0:dynatemp_max;
                    llama_sample_entropy(ctx_main, &cur_p, temp, dynatemp_min, dynatemp_max);
                }
                else
                {
                    llama_sample_temp(ctx_main, &cur_p, temp);
                }
                break;
            default : break;
        }
    }
--- a/common/sampling.h
+++ b/common/sampling.h
@ -18,6 +18,7 @@ typedef struct llama_sampling_params {
    float       tfs_z                 = 1.00f;    // 1.0 = disabled
    float       typical_p             = 1.00f;    // 1.0 = disabled
    float       temp                  = 0.80f;    // 1.0 = disabled
    float       dynatemp_range        = 0.00f;    // 0.0 = disabled
    int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float       penalty_repeat        = 1.10f;    // 1.0 = disabled
    float       penalty_freq          = 0.00f;    // 0.0 = disabled
--- a/llama.cpp
+++ b/llama.cpp
@ -7779,6 +7779,74 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
    }
 }
 // Entropy-based dynamic temperature sampling.
 //
 // Computes the Shannon entropy of the candidates' softmax distribution, normalizes it by the
 // maximum entropy possible for that number of candidates, maps the result onto
 // [min_temp, max_temp] and divides every logit by the resulting temperature. The probabilities
 // are then recomputed from the scaled logits.
 //
 //   ctx          - forwarded to llama_sample_softmax; when non-NULL the elapsed time is added to t_sample_us
 //   candidates_p - candidate tokens; logits and probabilities are modified in place
 //   temp         - not used by this implementation (kept so existing callers keep compiling)
 //   min_temp     - temperature selected when the normalized entropy is 0
 //   max_temp     - temperature selected when the normalized entropy is 1
 void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp, float min_temp = 0, float max_temp = 2.0f) {
    (void) temp; // the temperature is derived from [min_temp, max_temp] only

    // Nothing to scale for an empty list; this also avoids 1/0 and reading data[0] below.
    if (candidates_p->size == 0) {
        return;
    }

    const int64_t t_start_sample_us = ggml_time_us();

    llama_sample_softmax(ctx, candidates_p);

    // A single candidate has zero entropy and rescaling its only logit cannot change the outcome,
    // so the scaling step is skipped in that case.
    if (candidates_p->size > 1) {
        const float exponent_val = 1.0f;

        // Calculate entropy of the softmax probabilities
        float entropy = 0.0f;
        for (size_t i = 0; i < candidates_p->size; ++i) {
            const float prob = candidates_p->data[i].p;
            if (prob > 0.0f) { // Ensure no log(0)
                entropy -= prob * logf(prob);
            }
        }

        // Calculate maximum possible entropy (uniform distribution over all candidates)
        float max_entropy = -logf(1.0f / candidates_p->size);

        // Guard against division by zero
        if (max_entropy == 0.0f) {
            max_entropy = 1.0f; // This ensures that normalized_entropy will be 0 when entropy is 0
        }

        // Normalize the entropy
        const float normalized_entropy = entropy / max_entropy;

        // Map the normalized entropy to the desired temperature range using the power function
        float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);

        // min_temp may legitimately be 0 and a one-hot distribution has zero entropy, which would
        // make dyn_temp exactly 0 and turn the division below into inf/NaN logits. Clamp to a small
        // positive floor instead; the negated comparison also catches a NaN dyn_temp.
        const float min_dyn_temp = 1e-6f;
        if (!(dyn_temp >= min_dyn_temp)) {
            dyn_temp = min_dyn_temp;
        }

        // Apply the dynamically calculated temperature scaling
        for (size_t i = 0; i < candidates_p->size; ++i) {
            candidates_p->data[i].logit /= dyn_temp;
        }

        // Find the largest scaled logit explicitly rather than assuming it sits at index 0,
        // so that exp() below never sees a positive argument.
        double max_l_double = candidates_p->data[0].logit;
        for (size_t i = 1; i < candidates_p->size; ++i) {
            if (candidates_p->data[i].logit > max_l_double) {
                max_l_double = candidates_p->data[i].logit;
            }
        }

        // Re-compute softmax probabilities after scaling logits with dynamic temperature
        double cum_sum_double = 0.0;
        for (size_t i = 0; i < candidates_p->size; ++i) {
            const double p = exp(candidates_p->data[i].logit - max_l_double);
            candidates_p->data[i].p = p; // Store the unnormalized probability
            cum_sum_double += p;
        }
        for (size_t i = 0; i < candidates_p->size; ++i) {
            candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
        }
    }

    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    }
 }
 void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
    const int64_t t_start_sample_us = ggml_time_us();
--- a/llama.h
+++ b/llama.h
@ -770,6 +770,14 @@ extern "C" {
                           float   p,
                          size_t   min_keep);
    /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
    /// The implementation computes the entropy of the candidates' softmax distribution, normalizes it by the
    /// maximum possible entropy for that candidate count, and divides the logits by a temperature interpolated
    /// between min_temp and max_temp.
    /// @param ctx          Forwarded to llama_sample_softmax; when non-NULL, the elapsed sampling time is accumulated on it.
    /// @param candidates_p Candidate tokens; their logits and probabilities are modified in place.
    /// @param temp         Not read by the current implementation.
    /// @param min_temp     Temperature selected when the normalized entropy is 0.
    /// @param max_temp     Temperature selected when the normalized entropy is 1.
    LLAMA_API void llama_sample_entropy(
            struct llama_context * ctx,
          llama_token_data_array * candidates_p,
                           float   temp,
                           float   min_temp,
                           float   max_temp);
    LLAMA_API void llama_sample_temp(
            struct llama_context * ctx,
          llama_token_data_array * candidates,