Initial XTC commit
Adds the XTC sampler. It is not activated by default, but its parameters default to the recommended settings.
This commit is contained in:
parent
f3fdcfaa79
commit
89640b00a1
7 changed files with 128 additions and 3 deletions
|
@ -966,6 +966,27 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||||
params.sparams.min_p = std::stof(value);
|
params.sparams.min_p = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
|
add_opt(llama_arg(
|
||||||
|
{"--xtc-p"}, "N",
|
||||||
|
format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_p),
|
||||||
|
[](gpt_params & params, const std::string & value) {
|
||||||
|
params.sparams.xtc_p = std::stof(value);
|
||||||
|
}
|
||||||
|
).set_sparam());
|
||||||
|
add_opt(llama_arg(
|
||||||
|
{"--xtc-t"}, "N",
|
||||||
|
format("xtc threshold (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_t),
|
||||||
|
[](gpt_params & params, const std::string & value) {
|
||||||
|
params.sparams.xtc_t = std::stof(value);
|
||||||
|
}
|
||||||
|
).set_sparam());
|
||||||
|
add_opt(llama_arg(
|
||||||
|
{"--xtc-t-max"}, "N",
|
||||||
|
format("xtc upper threshold (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_t_max),
|
||||||
|
[](gpt_params & params, const std::string & value) {
|
||||||
|
params.sparams.xtc_t_max = std::stof(value);
|
||||||
|
}
|
||||||
|
).set_sparam());
|
||||||
add_opt(llama_arg(
|
add_opt(llama_arg(
|
||||||
{"--tfs"}, "N",
|
{"--tfs"}, "N",
|
||||||
format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z),
|
format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z),
|
||||||
|
|
4
common/build-info.cpp
Normal file
4
common/build-info.cpp
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
// Build metadata exposed as global variables.
// NOTE(review): the compiler and target strings below look specific to one
// build machine - confirm whether this file is meant to be checked in or is
// presumably produced by the build system.

// Build number; 0 here.
int LLAMA_BUILD_NUMBER = 0;

// Source revision identifier; "unknown" here.
const char * LLAMA_COMMIT = "unknown";

// Compiler identification string.
const char * LLAMA_COMPILER = "cc (GCC) 14.1.0";

// Target triple the binary was built for.
const char * LLAMA_BUILD_TARGET = "x86_64-w64-mingw32";
|
|
@ -2060,6 +2060,9 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
||||||
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
||||||
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
||||||
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
|
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
|
||||||
|
fprintf(stream, "xtc_p: %f # default: 0.0\n", sparams.xtc_p);
|
||||||
|
fprintf(stream, "xtc_t: %f # default: 0.0\n", sparams.xtc_t);
|
||||||
|
fprintf(stream, "xtc_t_max: %f # default: 0.0\n", sparams.xtc_t_max);
|
||||||
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
|
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
|
||||||
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
||||||
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
|
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
|
||||||
|
|
|
@ -90,6 +90,7 @@ enum gpt_sampler_type {
|
||||||
GPT_SAMPLER_TYPE_TFS_Z = 4,
|
GPT_SAMPLER_TYPE_TFS_Z = 4,
|
||||||
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
|
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
|
||||||
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
|
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
|
||||||
|
GPT_SAMPLER_TYPE_XTC = 7,
|
||||||
};
|
};
|
||||||
|
|
||||||
// dimensionality reduction methods, used by cvector-generator
|
// dimensionality reduction methods, used by cvector-generator
|
||||||
|
@ -108,6 +109,9 @@ struct gpt_sampler_params {
|
||||||
int32_t top_k = 40; // <= 0 to use vocab size
|
int32_t top_k = 40; // <= 0 to use vocab size
|
||||||
float top_p = 0.95f; // 1.0 = disabled
|
float top_p = 0.95f; // 1.0 = disabled
|
||||||
float min_p = 0.05f; // 0.0 = disabled
|
float min_p = 0.05f; // 0.0 = disabled
|
||||||
|
float xtc_p = 0.50f; // 0.0 = disabled
|
||||||
|
float xtc_t = 0.10f; // 1.0 = disabled
|
||||||
|
float xtc_t_max = 1.00f; // 0.0 = disabled
|
||||||
float tfs_z = 1.00f; // 1.0 = disabled
|
float tfs_z = 1.00f; // 1.0 = disabled
|
||||||
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
||||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||||
|
|
|
@ -130,10 +130,10 @@ std::string gpt_sampler_params::print() const {
|
||||||
|
|
||||||
snprintf(result, sizeof(result),
|
snprintf(result, sizeof(result),
|
||||||
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
||||||
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
|
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_p = %.3f, xtc_t = %.3f, xtc_t_max = %.3f, typical_p = %.3f, temp = %.3f\n"
|
||||||
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
|
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
|
||||||
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
|
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
|
||||||
top_k, tfs_z, top_p, min_p, typ_p, temp,
|
top_k, tfs_z, top_p, min_p, xtc_p, xtc_t, xtc_t_max, typ_p, temp,
|
||||||
mirostat, mirostat_eta, mirostat_tau);
|
mirostat, mirostat_eta, mirostat_tau);
|
||||||
|
|
||||||
return std::string(result);
|
return std::string(result);
|
||||||
|
@ -184,6 +184,9 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
|
||||||
case GPT_SAMPLER_TYPE_MIN_P:
|
case GPT_SAMPLER_TYPE_MIN_P:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||||
break;
|
break;
|
||||||
|
case GPT_SAMPLER_TYPE_XTC:
|
||||||
|
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_p, params.xtc_t, params.xtc_t_max, params.min_keep));
|
||||||
|
break;
|
||||||
case GPT_SAMPLER_TYPE_TFS_Z:
|
case GPT_SAMPLER_TYPE_TFS_Z:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
|
||||||
break;
|
break;
|
||||||
|
@ -372,6 +375,7 @@ char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
|
||||||
case GPT_SAMPLER_TYPE_TOP_P: return 'p';
|
case GPT_SAMPLER_TYPE_TOP_P: return 'p';
|
||||||
case GPT_SAMPLER_TYPE_MIN_P: return 'm';
|
case GPT_SAMPLER_TYPE_MIN_P: return 'm';
|
||||||
case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
|
case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
|
||||||
|
case GPT_SAMPLER_TYPE_XTC: return 'x';
|
||||||
default : return '?';
|
default : return '?';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -384,6 +388,7 @@ std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
|
||||||
case GPT_SAMPLER_TYPE_TOP_P: return "top_p";
|
case GPT_SAMPLER_TYPE_TOP_P: return "top_p";
|
||||||
case GPT_SAMPLER_TYPE_MIN_P: return "min_p";
|
case GPT_SAMPLER_TYPE_MIN_P: return "min_p";
|
||||||
case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
|
case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
|
||||||
|
case GPT_SAMPLER_TYPE_XTC: return "xtc";
|
||||||
default : return "";
|
default : return "";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -396,6 +401,7 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
|
||||||
{ "min_p", GPT_SAMPLER_TYPE_MIN_P },
|
{ "min_p", GPT_SAMPLER_TYPE_MIN_P },
|
||||||
{ "tfs_z", GPT_SAMPLER_TYPE_TFS_Z },
|
{ "tfs_z", GPT_SAMPLER_TYPE_TFS_Z },
|
||||||
{ "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
|
{ "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
|
||||||
|
{ "xtc", GPT_SAMPLER_TYPE_XTC },
|
||||||
};
|
};
|
||||||
|
|
||||||
// since samplers names are written multiple ways
|
// since samplers names are written multiple ways
|
||||||
|
@ -441,7 +447,8 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & c
|
||||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
|
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
|
||||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P), GPT_SAMPLER_TYPE_TOP_P },
|
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P), GPT_SAMPLER_TYPE_TOP_P },
|
||||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P), GPT_SAMPLER_TYPE_MIN_P },
|
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P), GPT_SAMPLER_TYPE_MIN_P },
|
||||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
|
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE },
|
||||||
|
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_XTC), GPT_SAMPLER_TYPE_XTC }
|
||||||
};
|
};
|
||||||
|
|
||||||
std::vector<gpt_sampler_type> samplers;
|
std::vector<gpt_sampler_type> samplers;
|
||||||
|
|
|
@ -1093,6 +1093,9 @@ extern "C" {
|
||||||
/// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
|
/// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
|
||||||
LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);
|
LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);
|
||||||
|
|
||||||
|
/// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
|
||||||
|
LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, float t_max, size_t min_keep);
|
||||||
|
|
||||||
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
||||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
||||||
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
||||||
|
|
|
@ -1059,6 +1059,89 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, floa
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// xtc
|
||||||
|
|
||||||
|
// Settings of one XTC sampler instance. Every member is const, so the values
// are fixed when the object is created.
struct llama_sampler_xtc {
    // compared against a random roll in the apply step; <= 0 makes the sampler a no-op
    const float  probability;
    // lower bound on a candidate's probability for it to be considered; <= 0 makes the sampler a no-op
    const float  threshold;
    // upper bound on a candidate's probability for it to be considered
    const float  threshold_max;
    // candidate-count setting consulted by the apply step
    const size_t min_keep;
};
|
||||||
|
|
||||||
|
// Returns the fixed name string of the XTC sampler; the sampler argument is unused.
static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
    static const char * const k_xtc_name = "xtc";
    return k_xtc_name;
}
|
||||||
|
|
||||||
|
static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||||
|
const auto * ctx = (llama_sampler_xtc *) smpl->ctx;
|
||||||
|
|
||||||
|
if (ctx->probability <= 0.0f || ctx->threshold <= 0.0f || cur_p->size <= 1 || ctx->min_keep <= 2) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::random_device rd;
|
||||||
|
float chance = (float)(rd()%100)/100;
|
||||||
|
if (chance > ctx->probability) return;
|
||||||
|
// in case it's not sorted/recalculated yet
|
||||||
|
llama_sampler_softmax_impl(cur_p);
|
||||||
|
|
||||||
|
int removed = 0;
|
||||||
|
// going through all candidates from back to front, easier to keep the last of probables
|
||||||
|
for (int i = (cur_p->size - 1); i >= 0; --i) {
|
||||||
|
if (cur_p->data[i].p >= ctx->threshold && cur_p->data[i].p <= ctx->threshold_max) {
|
||||||
|
if (removed == 0 || chance <= ctx->probability) {
|
||||||
|
++removed;
|
||||||
|
if (removed >= 2) {
|
||||||
|
// .logits are used for sorting and calculating .p in llama_sample_softmax_impl
|
||||||
|
cur_p->data[i].logit = -999.0f;
|
||||||
|
chance = (float)(rd()%100)/100;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (removed >= 2) {
|
||||||
|
// sorting with new logits, ex-last probable will be the first anyway
|
||||||
|
std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
|
||||||
|
return a.logit > b.logit;
|
||||||
|
});
|
||||||
|
cur_p->sorted = true;
|
||||||
|
|
||||||
|
// resizing now that penalized tokens are at the back
|
||||||
|
cur_p->size = cur_p->size - removed + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) {
|
||||||
|
const auto * ctx = (const llama_sampler_xtc *) smpl->ctx;
|
||||||
|
return llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->threshold_max, ctx->min_keep);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void llama_sampler_xtc_free(struct llama_sampler * smpl) {
|
||||||
|
delete (llama_sampler_xtc *) smpl->ctx;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Callback table of the XTC sampler. The accept and reset slots are left null.
static struct llama_sampler_i llama_sampler_xtc_i = {
    /* .name   = */ llama_sampler_xtc_name,
    /* .accept = */ nullptr,
    /* .apply  = */ llama_sample_xtc_apply,
    /* .reset  = */ nullptr,
    /* .clone  = */ llama_sampler_xtc_clone,
    /* .free   = */ llama_sampler_xtc_free,
};
|
||||||
|
|
||||||
|
struct llama_sampler * llama_sampler_init_xtc(float p, float t, float t_max, size_t min_keep) {
|
||||||
|
return new llama_sampler {
|
||||||
|
/* .iface = */ &llama_sampler_xtc_i,
|
||||||
|
/* .ctx = */ new llama_sampler_xtc {
|
||||||
|
/* .probability = */ p,
|
||||||
|
/* .threshold = */ t,
|
||||||
|
/* .threshold_max = */ t_max,
|
||||||
|
/* .min_keep = */ min_keep,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
// mirostat
|
// mirostat
|
||||||
|
|
||||||
struct llama_sampler_mirostat {
|
struct llama_sampler_mirostat {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue