Full DynaTemp implementation + UI (#600)
* move Dynatemp changes to new branch * fix float header * Properly reintroduce variable expert count Controllable through experts.txt * first pass at DynaTemp UI Checkbox partial implemented, Min and Max Temp implemented * DynaTemp UI Checkbox Trigger DynaTemp on checkbox * DynaTemp UI checkbox edition Hell Yeah! DynaTemp! * Remove greedy dynatemp * Fix race condition caused by debug print * Fixed broken presets and miro Fixes broken presets and mirostat * Remove debug function + HHI temp Also removed unnecessary softmax double precision * Fix whitespace (?) for generate function * epic upstream renaming scheme fix * fix stupid indents * Other cleanup Reintroduce unused rep pen function, move temp functions first before entropy dynamic temp * Slight indent fix * revert batch pyinstaller maker to mainline and also delete experts.txt since adjustable routing is also being removed for the PR * compact dynatemp into a single value dynatemp_range. This is a float which represents the allowed deviation from the min and max temperature when using dynatemp. Thus, if we want a value of dynatemp_min=0.3, dynatemp_max=0.5, then we would simply set temperature=0.4 and dynatemp_range=0.1. Functionally dynatemp would operate the same, but it would simplify usage and make it a single easy to adjust value. --------- Co-authored-by: Alexander Abushady <aabushady214@gmail.com> Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
This commit is contained in:
parent
427ba21e62
commit
123bff9a0f
9 changed files with 132 additions and 8 deletions
|
@ -80,6 +80,10 @@ struct gpt_params {
|
|||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
|
||||
// DynaTemp!
|
||||
float dynatemp_range = 0.0f; // enables DynaTemp if greater than 0. dynatemp_min = temperature - dt_range, dynatemp_max = temperature + dt_range
|
||||
|
||||
// sampling parameters
|
||||
struct llama_sampling_params sparams;
|
||||
|
||||
|
|
|
@ -25,6 +25,7 @@ typedef struct llama_sampling_params {
|
|||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
// Dynamic temperature range: > 0 enables DynaTemp (min = temp - range, max = temp + range).
// Must be float, not bool: it is initialized with a float literal and every other
// declaration of this field in the commit (common.h, expose.h, ctypes c_float) is a float.
float dynatemp_range = 0.00f; // dynamic temperature range
|
||||
bool penalize_nl = true; // consider newlines as a repeatable token
|
||||
std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp
|
||||
|
||||
|
|
2
expose.h
2
expose.h
|
@ -81,7 +81,9 @@ struct generation_inputs
|
|||
const char * grammar;
|
||||
const bool grammar_retain_state;
|
||||
const bool quiet = false;
|
||||
const float dynatemp_range = 0.0f;
|
||||
const logit_bias logit_biases[logit_bias_max];
|
||||
|
||||
};
|
||||
struct generation_outputs
|
||||
{
|
||||
|
|
|
@ -482,7 +482,7 @@ void sample_grammar(FileFormat file_format, int32_t n_vocab, llama_token_data_ar
|
|||
}
|
||||
|
||||
int SampleLogits(const float * logits, int n_ctx, int n_vocab, int rep_pen_range, float rep_pen, float presence_penalty, float top_k, float top_a, float top_p, float min_p, float typical_p, float tfs, float temp, std::mt19937 & rng,
|
||||
int mirostat, float mirostat_tau, float mirostat_eta, const std::vector<samplers> & sampler_order, llama_grammar * grammar)
|
||||
int mirostat, float mirostat_tau, float mirostat_eta, const std::vector<samplers> & sampler_order, llama_grammar * grammar, float dynatemp_range)
|
||||
{
|
||||
int id = 0;
|
||||
std::vector<llama_token_data> candidates;
|
||||
|
@ -541,7 +541,19 @@ int mirostat, float mirostat_tau, float mirostat_eta, const std::vector<samplers
|
|||
llama_sample_typical(nullptr, &candidates_p, typical_p,1);
|
||||
break;
|
||||
case KCPP_SAMPLER_TEMP:
|
||||
sample_temperature(&candidates_p, temp);
|
||||
if (dynatemp_range>0)
|
||||
{
|
||||
float dynatemp_min = temp - dynatemp_range;
|
||||
float dynatemp_max = temp + dynatemp_range;
|
||||
//do not allow negative values
|
||||
dynatemp_min = dynatemp_min<0?0:dynatemp_min;
|
||||
dynatemp_max = dynatemp_max<0?0:dynatemp_max;
|
||||
llama_sample_entropy(nullptr, &candidates_p, temp, dynatemp_min, dynatemp_max);
|
||||
}
|
||||
else
|
||||
{
|
||||
sample_temperature(&candidates_p, temp);
|
||||
}
|
||||
break;
|
||||
case KCPP_SAMPLER_REP_PEN:
|
||||
sample_rep_pen(n_ctx, rep_pen_range, rep_pen, presence_penalty, &candidates_p);
|
||||
|
@ -1480,6 +1492,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
}
|
||||
|
||||
std::string addedmemory = inputs.memory;
|
||||
|
||||
kcpp_params->prompt = inputs.prompt;
|
||||
kcpp_params->seed = inputs.seed;
|
||||
kcpp_params->n_predict = inputs.max_length;
|
||||
|
@ -1495,10 +1508,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
kcpp_params->mirostat = inputs.mirostat;
|
||||
kcpp_params->mirostat_eta = inputs.mirostat_eta;
|
||||
kcpp_params->mirostat_tau = inputs.mirostat_tau;
|
||||
kcpp_params->dynatemp_range = inputs.dynatemp_range;
|
||||
kcpp_params->n_ctx = inputs.max_context_length;
|
||||
kcpp_params->n_batch = n_batch;
|
||||
kcpp_params->n_threads = n_threads;
|
||||
kcpp_params->n_threads_batch = n_blasthreads;
|
||||
|
||||
bool stream_sse = inputs.stream_sse;
|
||||
|
||||
bool allow_regular_prints = (debugmode!=-1 && !inputs.quiet) || debugmode >= 1;
|
||||
|
@ -1889,6 +1904,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
const float presence_penalty = kcpp_params->presence_penalty;
|
||||
const float typical_p = kcpp_params->typical_p;
|
||||
const float tfs_z = kcpp_params->tfs_z;
|
||||
const float dynatemp_range = kcpp_params->dynatemp_range;
|
||||
|
||||
if (!startedsampling)
|
||||
{
|
||||
|
@ -1944,7 +1960,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
|
||||
id = SampleLogits(logitsPtr, nctx, n_vocab, last_n_size, repeat_penalty, presence_penalty,
|
||||
top_k, top_a, top_p, min_p, typical_p, tfs_z, temp, rng,
|
||||
kcpp_params->mirostat, kcpp_params->mirostat_tau, kcpp_params->mirostat_eta, sampler_order, grammar);
|
||||
kcpp_params->mirostat, kcpp_params->mirostat_tau, kcpp_params->mirostat_eta, sampler_order, grammar, dynatemp_range);
|
||||
|
||||
if (grammar != nullptr) {
|
||||
grammar_accept_token(file_format, n_vocab, grammar, id);
|
||||
|
|
|
@ -139,6 +139,12 @@
|
|||
"description": "If true, prevents the EOS token from being generated (Ban EOS). For unbantokens, set this to false.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"dynatemp_range": {
|
||||
"default": 0,
|
||||
"description": "If greater than 0, uses dynamic temperature. Dynamic temperature range will be between Temp+Range and Temp-Range. If less or equal to 0 , uses static temperature.",
|
||||
"exclusiveMinimum": 0,
|
||||
"type": "number"
|
||||
},
|
||||
"mirostat": {
|
||||
"description": "KoboldCpp ONLY. Sets the mirostat mode, 0=disabled, 1=mirostat_v1, 2=mirostat_v2",
|
||||
"minimum": 0,
|
||||
|
@ -876,4 +882,4 @@
|
|||
|
||||
</body>
|
||||
|
||||
</html>
|
||||
</html>
|
||||
|
|
15
klite.embd
15
klite.embd
File diff suppressed because one or more lines are too long
|
@ -78,6 +78,7 @@ class generation_inputs(ctypes.Structure):
|
|||
("grammar", ctypes.c_char_p),
|
||||
("grammar_retain_state", ctypes.c_bool),
|
||||
("quiet", ctypes.c_bool),
|
||||
("dynatemp_range", ctypes.c_float),
|
||||
("logit_biases", logit_bias * logit_bias_max)]
|
||||
|
||||
class generation_outputs(ctypes.Structure):
|
||||
|
@ -310,7 +311,7 @@ def load_model(model_filename):
|
|||
ret = handle.load_model(inputs)
|
||||
return ret
|
||||
|
||||
def generate(prompt, memory="", max_length=32, max_context_length=512, temperature=0.7, top_k=100, top_a=0.0, top_p=0.92, min_p=0.0, typical_p=1.0, tfs=1.0, rep_pen=1.1, rep_pen_range=128, presence_penalty=0.0, mirostat=0, mirostat_tau=5.0, mirostat_eta=0.1, sampler_order=[6,0,1,3,4,2,5], seed=-1, stop_sequence=[], use_default_badwordsids=False, stream_sse=False, grammar='', grammar_retain_state=False, genkey='', trimstop=False, quiet=False, logit_biases={}):
|
||||
def generate(prompt, memory="", max_length=32, max_context_length=512, temperature=0.7, top_k=100, top_a=0.0, top_p=0.92, min_p=0.0, typical_p=1.0, tfs=1.0, rep_pen=1.1, rep_pen_range=128, presence_penalty=0.0, mirostat=0, mirostat_tau=5.0, mirostat_eta=0.1, sampler_order=[6,0,1,3,4,2,5], seed=-1, stop_sequence=[], use_default_badwordsids=False, stream_sse=False, grammar='', grammar_retain_state=False, genkey='', trimstop=False, quiet=False, dynatemp_range=0.0, logit_biases={}):
|
||||
global maxctx, args, currentusergenkey, totalgens
|
||||
inputs = generation_inputs()
|
||||
outputs = ctypes.create_unicode_buffer(ctypes.sizeof(generation_outputs))
|
||||
|
@ -338,6 +339,7 @@ def generate(prompt, memory="", max_length=32, max_context_length=512, temperatu
|
|||
inputs.presence_penalty = presence_penalty
|
||||
inputs.stream_sse = stream_sse
|
||||
inputs.quiet = quiet
|
||||
inputs.dynatemp_range = dynatemp_range
|
||||
inputs.grammar = grammar.encode("UTF-8")
|
||||
inputs.grammar_retain_state = grammar_retain_state
|
||||
inputs.unban_tokens_rt = not use_default_badwordsids
|
||||
|
@ -547,7 +549,9 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|||
genkey=genparams.get('genkey', ''),
|
||||
trimstop=genparams.get('trim_stop', False),
|
||||
quiet=is_quiet,
|
||||
logit_biases=genparams.get('logit_bias', {}))
|
||||
dynatemp_range=genparams.get('dynatemp_range', 0.0),
|
||||
logit_biases=genparams.get('logit_bias', {})
|
||||
)
|
||||
|
||||
recvtxt = ""
|
||||
if stream_flag:
|
||||
|
|
71
llama.cpp
71
llama.cpp
|
@ -8510,10 +8510,81 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// Compatibility alias kept after the upstream rename: forwards unchanged to
// llama_sample_temp, which performs the actual temperature scaling.
void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
    llama_sample_temp(ctx, candidates_p, temp);
}
|
||||
|
||||
void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp, float min_temp = 0, float max_temp = 2.0f) {
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
llama_sample_softmax(ctx, candidates_p);
|
||||
|
||||
float exponent_val = 1.0f;
|
||||
|
||||
// Calculate entropy of the softmax probabilities
|
||||
float entropy = 0.0f;
|
||||
for (size_t i = 0; i < candidates_p->size; ++i) {
|
||||
float prob = candidates_p->data[i].p;
|
||||
if (prob > 0.0f) { // Ensure no log(0)
|
||||
entropy -= prob * logf(prob);
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate maximum possible entropy
|
||||
float max_entropy = -logf(1.0f / candidates_p->size);
|
||||
|
||||
// Guard against division by zero
|
||||
if (max_entropy == 0.0f) {
|
||||
max_entropy = 1.0f; // This ensures that normalized_entropy will be 0 when entropy is 0
|
||||
}
|
||||
|
||||
// Normalize the entropy
|
||||
float normalized_entropy = entropy / max_entropy;
|
||||
|
||||
// Map the normalized entropy to the desired temperature range using the power function
|
||||
float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
|
||||
|
||||
//todo: Ensure to hide print statements unless debugging!
|
||||
printf("Your text maxtemp value is: %f\n", max_temp);
|
||||
// Print the variables
|
||||
printf("Entropy: %f\n", entropy);
|
||||
printf("Max Possible Entropy: %f\n", max_entropy);
|
||||
printf("Normalized Entropy: %f\n", normalized_entropy);
|
||||
printf("Exponent: %f\n", exponent_val);
|
||||
printf("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
|
||||
|
||||
// Apply the dynamically calculated temperature scaling
|
||||
for (size_t i = 0; i < candidates_p->size; ++i) {
|
||||
candidates_p->data[i].logit /= dyn_temp;
|
||||
}
|
||||
|
||||
// Re-compute softmax probabilities after scaling logits with dynamic temperature
|
||||
double max_l_double = candidates_p->data[0].logit;
|
||||
double cum_sum_double = 0.0;
|
||||
for (size_t i = 0; i < candidates_p->size; ++i) {
|
||||
double p = exp(candidates_p->data[i].logit - max_l_double);
|
||||
candidates_p->data[i].p = p; // Store the scaled probability
|
||||
cum_sum_double += p;
|
||||
}
|
||||
for (size_t i = 0; i < candidates_p->size; ++i) {
|
||||
candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
|
||||
}
|
||||
|
||||
//todo: Ensure to hide print statements unless debugging!
|
||||
// Print the updated top 25 probabilities after temperature scaling
|
||||
printf("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
|
||||
for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) {
|
||||
printf("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f);
|
||||
}
|
||||
|
||||
if (ctx) {
|
||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
}
|
||||
|
||||
// The llama.cpp repetition penalty code goes unused in kobold's API
|
||||
|
||||
void llama_sample_repetition_penalties(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
|
|
9
llama.h
9
llama.h
|
@ -723,6 +723,15 @@ extern "C" {
|
|||
float p,
|
||||
size_t min_keep);
|
||||
|
||||
/// @details DYNATEMP! #TODO KALO
|
||||
LLAMA_API void llama_sample_entropy(
|
||||
struct llama_context* ctx,
|
||||
llama_token_data_array* candidates,
|
||||
float p,
|
||||
size_t min_keep,
|
||||
float min_temp,
|
||||
float max_temp);
|
||||
|
||||
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
|
||||
LLAMA_API void llama_sample_tail_free(
|
||||
struct llama_context * ctx,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue