Renamed parameters, fixed info and defaults
* probability is 0 by default, but XTC is included in the sampling queue * a threshold higher than 0.5 switches XTC off
This commit is contained in:
parent
ba29d31fb7
commit
2107882cf5
6 changed files with 28 additions and 27 deletions
|
@ -967,24 +967,24 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|||
}
|
||||
).set_sparam());
|
||||
add_opt(llama_arg(
|
||||
{"--xtc-p"}, "N",
|
||||
format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_p),
|
||||
{"-xtc-p", "--xtc-probability"}, "N",
|
||||
format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
|
||||
[](gpt_params & params, const std::string & value) {
|
||||
params.sparams.xtc_p = std::stof(value);
|
||||
params.sparams.xtc_probability = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(llama_arg(
|
||||
{"--xtc-t"}, "N",
|
||||
format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_t),
|
||||
{"-xtc-t", "--xtc-threshold"}, "N",
|
||||
format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold),
|
||||
[](gpt_params & params, const std::string & value) {
|
||||
params.sparams.xtc_t = std::stof(value);
|
||||
params.sparams.xtc_threshold = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(llama_arg(
|
||||
{"--xtc-t-max"}, "N",
|
||||
format("xtc upper threshold (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_t_max),
|
||||
{"-xtc-t-max", "--xtc-threshold-max"}, "N",
|
||||
format("xtc upper threshold (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_threshold_max),
|
||||
[](gpt_params & params, const std::string & value) {
|
||||
params.sparams.xtc_t_max = std::stof(value);
|
||||
params.sparams.xtc_threshold_max = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(llama_arg(
|
||||
|
|
|
@ -2088,9 +2088,9 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|||
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
||||
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
||||
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
|
||||
fprintf(stream, "xtc_p: %f # default: 0.5\n", sparams.xtc_p);
|
||||
fprintf(stream, "xtc_t: %f # default: 0.1\n", sparams.xtc_t);
|
||||
fprintf(stream, "xtc_t_max: %f # default: 1.0\n", sparams.xtc_t_max);
|
||||
fprintf(stream, "xtc_probability: %f # default: 0.5\n", sparams.xtc_probability);
|
||||
fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
|
||||
fprintf(stream, "xtc_threshold_max: %f # default: 1.0\n", sparams.xtc_threshold_max);
|
||||
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
|
||||
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
||||
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
|
||||
|
|
|
@ -109,9 +109,9 @@ struct gpt_sampler_params {
|
|||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float min_p = 0.05f; // 0.0 = disabled
|
||||
float xtc_p = 0.50f; // 0.0 = disabled
|
||||
float xtc_t = 0.10f; // 1.0 = disabled
|
||||
float xtc_t_max = 1.00f; // 0.0 = disabled
|
||||
float xtc_probability = 0.00f; // 0.0 = disabled
|
||||
float xtc_threshold = 0.10f; // 0.5 = disabled
|
||||
float xtc_threshold_max = 1.00f; // 0.0 = disabled
|
||||
float tfs_z = 1.00f; // 1.0 = disabled
|
||||
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||
|
@ -134,6 +134,7 @@ struct gpt_sampler_params {
|
|||
GPT_SAMPLER_TYPE_TYPICAL_P,
|
||||
GPT_SAMPLER_TYPE_TOP_P,
|
||||
GPT_SAMPLER_TYPE_MIN_P,
|
||||
GPT_SAMPLER_TYPE_XTC,
|
||||
GPT_SAMPLER_TYPE_TEMPERATURE
|
||||
};
|
||||
|
||||
|
|
|
@ -130,10 +130,10 @@ std::string gpt_sampler_params::print() const {
|
|||
|
||||
snprintf(result, sizeof(result),
|
||||
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
||||
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_p = %.3f, xtc_t = %.3f, xtc_t_max = %.3f, typical_p = %.3f, temp = %.3f\n"
|
||||
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, xtc_threshold_max = %.3f, typical_p = %.3f, temp = %.3f\n"
|
||||
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
|
||||
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
|
||||
top_k, tfs_z, top_p, min_p, xtc_p, xtc_t, xtc_t_max, typ_p, temp,
|
||||
top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, xtc_threshold_max, typ_p, temp,
|
||||
mirostat, mirostat_eta, mirostat_tau);
|
||||
|
||||
return std::string(result);
|
||||
|
@ -185,7 +185,7 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
|
|||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
break;
|
||||
case GPT_SAMPLER_TYPE_XTC:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_p, params.xtc_t, params.xtc_t_max, params.min_keep, params.seed));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.xtc_threshold_max, params.min_keep, params.seed));
|
||||
break;
|
||||
case GPT_SAMPLER_TYPE_TFS_Z:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
|
||||
|
|
|
@ -243,19 +243,19 @@ Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0`
|
|||
|
||||
### XTC Sampling
|
||||
|
||||
- `--xtc-p N`: Sets the chance for token removal (checked once on sampler start) (default: 0.5).
|
||||
- `--xtc-t N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1).
|
||||
- `--xtc-t-max N`: Sets a maximum probability threshold for tokens to be removed (highly expetrimental) (default: 1.0).
|
||||
- `--xtc-probability N`: Sets the chance for token removal (checked once on sampler start) (default: 0.0).
|
||||
- `--xtc-threshold N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1).
|
||||
- `--xtc-threshold-max N`: Sets a maximum probability threshold for tokens to be removed (highly experimental) (default: 1.0).
|
||||
|
||||
Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive answers. With a chance of `xtc-p` it searches for tokens with probabilities of `xtc-t` threshold and above, then removes all such tokens except the least probable one.
|
||||
Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-probability` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one.
|
||||
|
||||
By removing top tokens XTC can improve variety of answers, break writing clichés and inhibit repition, since clichés and repeated phrases are usually more likely to appear. By keeping the last top token XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models.
|
||||
By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repetition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models.
|
||||
|
||||
The additional `xtc-t-max` parameter may help with finetuned models that already give relatively creative output, meaning that clichés and repetitive phrases may appear at lower probabilities. It allows to remove tokens from a middle range which will always be specific to a model, requiring careful experimenting. Leave `xtc-t-max` on default 1.0 for all base/instruct models.
|
||||
The additional `xtc-threshold-max` parameter may help with finetuned models that already give relatively creative output, meaning that clichés and repetitive phrases may appear at lower probabilities. It allows removing tokens from a middle range, which will always be specific to a model and requires careful experimentation. Leave `xtc-threshold-max` at its default of 1.0 for all base/instruct models.
|
||||
|
||||
Being experimental and unique, XTC is not included in the default sampling queue. You can start from a recommended combination of Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02`.
|
||||
Being experimental and unique, XTC is disabled by default (its probability defaults to 0.0). The recommended combination of samplers is Min-P followed by XTC with a probability of 0.5 and its other settings left at their defaults: `--sampling-seq mx --min-p 0.02 -xtc-p 0.5`.
|
||||
|
||||
Example usage: `--xtc-p 0.5 --xtc-t 0.1 --xtc-t-max 1.0`
|
||||
Example usage: `-xtc-p 0.5 -xtc-t 0.1 -xtc-t-max 1.0`
|
||||
|
||||
### Logit Bias
|
||||
|
||||
|
|
|
@ -1081,7 +1081,7 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data
|
|||
auto * ctx = (llama_sampler_xtc *) smpl->ctx;
|
||||
|
||||
if (ctx->probability <= 0.0f
|
||||
|| ctx->threshold >= 1.0f
|
||||
|| ctx->threshold > 0.5f
|
||||
|| ctx->threshold_max <= 0.0f
|
||||
|| ctx->threshold_max <= ctx->threshold
|
||||
|| cur_p->size <= 2) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue