diff --git a/common/arg.cpp b/common/arg.cpp
index 2a85ad845..ffe2518e0 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -966,6 +966,27 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, std::function<void(int)> print_usage) {
             params.sparams.min_p = std::stof(value);
         }
     ).set_sparam());
+    add_opt(llama_arg(
+        {"--xtc-p"}, "N",
+        format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_p),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.xtc_p = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(llama_arg(
+        {"--xtc-t"}, "N",
+        format("xtc threshold (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_t),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.xtc_t = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(llama_arg(
+        {"--xtc-t-max"}, "N",
+        format("xtc upper threshold (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_t_max),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.xtc_t_max = std::stof(value);
+        }
+    ).set_sparam());
     add_opt(llama_arg(
         {"--tfs"}, "N",
         format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z),
diff --git a/common/build-info.cpp b/common/build-info.cpp
new file mode 100644
index 000000000..d839c9bab
--- /dev/null
+++ b/common/build-info.cpp
@@ -0,0 +1,4 @@
+int LLAMA_BUILD_NUMBER = 0;
+char const *LLAMA_COMMIT = "unknown";
+char const *LLAMA_COMPILER = "cc (GCC) 14.1.0";
+char const *LLAMA_BUILD_TARGET = "x86_64-w64-mingw32";
diff --git a/common/common.cpp b/common/common.cpp
index a0611f3d1..33355fd0a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2060,6 +2060,9 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
+    fprintf(stream, "xtc_p: %f # default: 0.0\n", sparams.xtc_p);
+    fprintf(stream, "xtc_t: %f # default: 0.0\n", sparams.xtc_t);
+    fprintf(stream, "xtc_t_max: %f # default: 0.0\n", sparams.xtc_t_max);
     fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
     fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
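The three flags above expose XTC's parameters to every tool that goes through the common argument parser. A hypothetical invocation (binary name and model path are placeholders, not taken from this patch):

    llama-cli -m model.gguf --xtc-p 0.5 --xtc-t 0.1 --xtc-t-max 1.0

The defaults echoed by the format() help strings come from gpt_sampler_params, shown in the common/common.h hunk that follows.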
"true" : "false"); diff --git a/common/common.h b/common/common.h index 8b84cf9ad..a4bb13afd 100644 --- a/common/common.h +++ b/common/common.h @@ -90,6 +90,7 @@ enum gpt_sampler_type { GPT_SAMPLER_TYPE_TFS_Z = 4, GPT_SAMPLER_TYPE_TYPICAL_P = 5, GPT_SAMPLER_TYPE_TEMPERATURE = 6, + GPT_SAMPLER_TYPE_XTC = 7, }; // dimensionality reduction methods, used by cvector-generator @@ -108,6 +109,9 @@ struct gpt_sampler_params { int32_t top_k = 40; // <= 0 to use vocab size float top_p = 0.95f; // 1.0 = disabled float min_p = 0.05f; // 0.0 = disabled + float xtc_p = 0.50f; // 0.0 = disabled + float xtc_t = 0.10f; // 1.0 = disabled + float xtc_t_max = 1.00f; // 0.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled float typ_p = 1.00f; // typical_p, 1.0 = disabled float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities diff --git a/common/sampling.cpp b/common/sampling.cpp index 3dc7f1120..fd77e7bf6 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -130,10 +130,10 @@ std::string gpt_sampler_params::print() const { snprintf(result, sizeof(result), "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" - "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n" + "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_p = %.3f, xtc_t = %.3f, xtc_t_max = %.3f, typical_p = %.3f, temp = %.3f\n" "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", penalty_last_n, penalty_repeat, penalty_freq, penalty_present, - top_k, tfs_z, top_p, min_p, typ_p, temp, + top_k, tfs_z, top_p, min_p, xtc_p, xtc_t, xtc_t_max, typ_p, temp, mirostat, mirostat_eta, mirostat_tau); return std::string(result); @@ -184,6 +184,9 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st case GPT_SAMPLER_TYPE_MIN_P: llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); break; + case GPT_SAMPLER_TYPE_XTC: + llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_p, params.xtc_t, params.xtc_t_max, params.min_keep)); + break; case GPT_SAMPLER_TYPE_TFS_Z: llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); break; @@ -372,6 +375,7 @@ char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) { case GPT_SAMPLER_TYPE_TOP_P: return 'p'; case GPT_SAMPLER_TYPE_MIN_P: return 'm'; case GPT_SAMPLER_TYPE_TEMPERATURE: return 't'; + case GPT_SAMPLER_TYPE_XTC: return 'x'; default : return '?'; } } @@ -384,6 +388,7 @@ std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) { case GPT_SAMPLER_TYPE_TOP_P: return "top_p"; case GPT_SAMPLER_TYPE_MIN_P: return "min_p"; case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature"; + case GPT_SAMPLER_TYPE_XTC: return "xtc"; default : return ""; } } @@ -396,6 +401,7 @@ std::vector gpt_sampler_types_from_names(const std::vector gpt_sampler_types_from_chars(const std::string & c { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P }, { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P), GPT_SAMPLER_TYPE_TOP_P }, { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P), GPT_SAMPLER_TYPE_MIN_P }, - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE } + { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }, + { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_XTC), GPT_SAMPLER_TYPE_XTC } }; std::vector samplers; diff --git a/include/llama.h b/include/llama.h index 
diff --git a/include/llama.h b/include/llama.h
index 7cae1bbe2..ae8e0960d 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1093,6 +1093,9 @@ extern "C" {
     /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
     LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext   (float   t, float   delta, float exponent);

+    /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
+    LLAMA_API struct llama_sampler * llama_sampler_init_xtc        (float   p, float   t, float t_max, size_t min_keep);
+
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
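The new public entry point can also be driven directly, without the gpt_sampler layer. A minimal sketch, assuming the existing llama_sampler_chain_* helpers and llama_sampler_init_dist declared elsewhere in llama.h (parameter values are illustrative, not prescribed by the patch):

    // build a chain that applies XTC, then samples from what remains
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    // p = 0.5, t = 0.1, t_max = 1.0; min_keep = 3 because the apply
    // function below is a no-op for min_keep <= 2 in this version
    llama_sampler_chain_add(chain, llama_sampler_init_xtc(0.50f, 0.10f, 1.00f, 3));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
    // per token: llama_token id = llama_sampler_sample(chain, ctx, -1);
    // when done: llama_sampler_free(chain);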
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index e255a8fc4..416a973f6 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1059,6 +1059,89 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
     };
 }

+// xtc
+
+struct llama_sampler_xtc {
+    const float  probability;
+    const float  threshold;
+    const float  threshold_max;
+    const size_t min_keep;
+};
+
+static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
+    return "xtc";
+}
+
+static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    const auto * ctx = (llama_sampler_xtc *) smpl->ctx;
+
+    if (ctx->probability <= 0.0f || ctx->threshold <= 0.0f || cur_p->size <= 1 || ctx->min_keep <= 2) {
+        return;
+    }
+
+    std::random_device rd;
+    float chance = (float)(rd()%100)/100;
+    if (chance > ctx->probability) return;
+
+    // in case it's not sorted/recalculated yet
+    llama_sampler_softmax_impl(cur_p);
+
+    int removed = 0;
+    // going through all candidates from back to front, easier to keep the last of probables
+    for (int i = (cur_p->size - 1); i >= 0; --i) {
+        if (cur_p->data[i].p >= ctx->threshold && cur_p->data[i].p <= ctx->threshold_max) {
+            if (removed == 0 || chance <= ctx->probability) {
+                ++removed;
+                if (removed >= 2) {
+                    // .logits are used for sorting and calculating .p in llama_sampler_softmax_impl
+                    cur_p->data[i].logit = -999.0f;
+                    chance = (float)(rd()%100)/100;
+                }
+            }
+        }
+    }
+
+    if (removed >= 2) {
+        // sorting with new logits, ex-last probable will be the first anyway
+        std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
+            return a.logit > b.logit;
+        });
+        cur_p->sorted = true;
+
+        // resizing now that penalized tokens are at the back
+        cur_p->size = cur_p->size - removed + 1;
+    }
+}
+
+static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_xtc *) smpl->ctx;
+    return llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->threshold_max, ctx->min_keep);
+}
+
+static void llama_sampler_xtc_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_xtc *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_xtc_i = {
+    /* .name   = */ llama_sampler_xtc_name,
+    /* .accept = */ nullptr,
+    /* .apply  = */ llama_sample_xtc_apply,
+    /* .reset  = */ nullptr,
+    /* .clone  = */ llama_sampler_xtc_clone,
+    /* .free   = */ llama_sampler_xtc_free,
+};
+
+struct llama_sampler * llama_sampler_init_xtc(float p, float t, float t_max, size_t min_keep) {
+    return new llama_sampler {
+        /* .iface = */ &llama_sampler_xtc_i,
+        /* .ctx   = */ new llama_sampler_xtc {
+            /* .probability   = */ p,
+            /* .threshold     = */ t,
+            /* .threshold_max = */ t_max,
+            /* .min_keep      = */ min_keep,
+        },
+    };
+}
+
 // mirostat

 struct llama_sampler_mirostat {
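To make the selection rule concrete, here is a standalone toy program (not part of the patch) that reproduces only the deterministic core of llama_sample_xtc_apply: every candidate whose probability falls inside [threshold, threshold_max] is cut except the least probable one. The stochastic gate (chance vs. probability) is ignored here; also note that the implementation above re-rolls chance after each removal, so in the real sampler each candidate beyond the first is only cut with probability xtc_p.

    #include <cstdio>
    #include <vector>

    int main() {
        // toy distribution: sorted and normalized, as cur_p is after softmax
        std::vector<float> p = {0.40f, 0.25f, 0.15f, 0.12f, 0.08f};
        const float t = 0.10f, t_max = 1.00f;

        // the least probable candidate inside [t, t_max] is the one that survives
        int last = -1;
        for (int i = 0; i < (int) p.size(); ++i) {
            if (p[i] >= t && p[i] <= t_max) last = i;
        }
        for (int i = 0; i < (int) p.size(); ++i) {
            const bool cut = p[i] >= t && p[i] <= t_max && i != last;
            std::printf("p = %.2f -> %s\n", p[i], cut ? "cut" : "kept");
        }
        return 0;
    }

With t = 0.1 this prints "cut" for 0.40, 0.25 and 0.15, and "kept" for 0.12 and 0.08: the most probable tokens are eliminated and sampling proceeds among the remainder, which is the effect described in the linked text-generation-webui PR.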