implemented dynamic temperature sampling from koboldcpp
This commit is contained in:
parent
3e5ca7931c
commit
ea15108462
4 changed files with 97 additions and 1 deletions
|
@ -129,6 +129,7 @@ static void sampler_queue(
|
|||
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
||||
|
||||
const float temp = params.temp;
|
||||
const float dynatemp_range = params.dynatemp_range;
|
||||
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float min_p = params.min_p;
|
||||
|
@ -143,7 +144,25 @@ static void sampler_queue(
|
|||
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
||||
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
||||
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
||||
case 't': llama_sample_temp (ctx_main, &cur_p, temp); break;
|
||||
|
||||
case 't':
|
||||
if (dynatemp_range>0)
|
||||
{
|
||||
float dynatemp_min = temp - dynatemp_range;
|
||||
float dynatemp_max = temp + dynatemp_range;
|
||||
//do not allow negative values
|
||||
dynatemp_min = dynatemp_min<0?0:dynatemp_min;
|
||||
dynatemp_max = dynatemp_max<0?0:dynatemp_max;
|
||||
|
||||
llama_sample_entropy(ctx_main, &cur_p, temp, dynatemp_min, dynatemp_max);
|
||||
}
|
||||
else
|
||||
{
|
||||
llama_sample_temp(ctx_main, &cur_p, temp);
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
default : break;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@ typedef struct llama_sampling_params {
|
|||
float tfs_z = 1.00f; // 1.0 = disabled
|
||||
float typical_p = 1.00f; // 1.0 = disabled
|
||||
float temp = 0.80f; // 1.0 = disabled
|
||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.10f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
|
|
68
llama.cpp
68
llama.cpp
|
@ -7779,6 +7779,74 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
|
|||
}
|
||||
}
|
||||
|
||||
void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp, float min_temp = 0, float max_temp = 2.0f) {
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
llama_sample_softmax(ctx, candidates_p);
|
||||
|
||||
float exponent_val = 1.0f;
|
||||
|
||||
// Calculate entropy of the softmax probabilities
|
||||
float entropy = 0.0f;
|
||||
for (size_t i = 0; i < candidates_p->size; ++i) {
|
||||
float prob = candidates_p->data[i].p;
|
||||
if (prob > 0.0f) { // Ensure no log(0)
|
||||
entropy -= prob * logf(prob);
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate maximum possible entropy
|
||||
float max_entropy = -logf(1.0f / candidates_p->size);
|
||||
|
||||
// Guard against division by zero
|
||||
if (max_entropy == 0.0f) {
|
||||
max_entropy = 1.0f; // This ensures that normalized_entropy will be 0 when entropy is 0
|
||||
}
|
||||
|
||||
// Normalize the entropy
|
||||
float normalized_entropy = entropy / max_entropy;
|
||||
|
||||
// Map the normalized entropy to the desired temperature range using the power function
|
||||
float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
|
||||
|
||||
// //todo: Ensure to hide print statements unless debugging!
|
||||
// printf("Your text maxtemp value is: %f\n", max_temp);
|
||||
// // Print the variables
|
||||
// printf("Entropy: %f\n", entropy);
|
||||
// printf("Max Possible Entropy: %f\n", max_entropy);
|
||||
// printf("Normalized Entropy: %f\n", normalized_entropy);
|
||||
// printf("Exponent: %f\n", exponent_val);
|
||||
// printf("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
|
||||
|
||||
// Apply the dynamically calculated temperature scaling
|
||||
for (size_t i = 0; i < candidates_p->size; ++i) {
|
||||
candidates_p->data[i].logit /= dyn_temp;
|
||||
}
|
||||
|
||||
// Re-compute softmax probabilities after scaling logits with dynamic temperature
|
||||
double max_l_double = candidates_p->data[0].logit;
|
||||
double cum_sum_double = 0.0;
|
||||
for (size_t i = 0; i < candidates_p->size; ++i) {
|
||||
double p = exp(candidates_p->data[i].logit - max_l_double);
|
||||
candidates_p->data[i].p = p; // Store the scaled probability
|
||||
cum_sum_double += p;
|
||||
}
|
||||
for (size_t i = 0; i < candidates_p->size; ++i) {
|
||||
candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
|
||||
}
|
||||
|
||||
// //todo: Ensure to hide print statements unless debugging!
|
||||
// // Print the updated top 25 probabilities after temperature scaling
|
||||
// printf("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
|
||||
// for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) {
|
||||
// printf("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f);
|
||||
// }
|
||||
|
||||
if (ctx) {
|
||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
|
|
8
llama.h
8
llama.h
|
@ -770,6 +770,14 @@ extern "C" {
|
|||
float p,
|
||||
size_t min_keep);
|
||||
|
||||
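/// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
/// The temperature is picked between min_temp and max_temp from the normalized entropy of the candidates' softmax distribution (low entropy -> min_temp, high entropy -> max_temp); temp is currently not used by the implementation.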
|
||||
LLAMA_API void llama_sample_entropy(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates_p,
|
||||
float temp,
|
||||
float min_temp,
|
||||
float max_temp);
|
||||
|
||||
LLAMA_API void llama_sample_temp(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue