Merge 8d0033ad63
into 4e9a7f7f7f
This commit is contained in:
commit
07dd8e02b8
7 changed files with 41 additions and 5 deletions
|
@ -376,6 +376,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
sparams.min_p = std::stof(argv[i]);
|
sparams.min_p = std::stof(argv[i]);
|
||||||
|
} else if (arg == "--top-a") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
sparams.top_a = std::stof(argv[i]);
|
||||||
} else if (arg == "--temp") {
|
} else if (arg == "--temp") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -992,6 +998,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
|
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
|
||||||
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
|
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
|
||||||
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
|
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
|
||||||
|
printf(" --top-a N top-a sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.top_a);
|
||||||
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
|
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
|
||||||
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
|
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
|
||||||
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
|
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
|
||||||
|
@ -1165,6 +1172,7 @@ std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::
|
||||||
{"top_p", llama_sampler_type::TOP_P},
|
{"top_p", llama_sampler_type::TOP_P},
|
||||||
{"typical_p", llama_sampler_type::TYPICAL_P},
|
{"typical_p", llama_sampler_type::TYPICAL_P},
|
||||||
{"min_p", llama_sampler_type::MIN_P},
|
{"min_p", llama_sampler_type::MIN_P},
|
||||||
|
{"top_a", llama_sampler_type::TOP_A},
|
||||||
{"tfs_z", llama_sampler_type::TFS_Z},
|
{"tfs_z", llama_sampler_type::TFS_Z},
|
||||||
{"temperature", llama_sampler_type::TEMPERATURE}
|
{"temperature", llama_sampler_type::TEMPERATURE}
|
||||||
};
|
};
|
||||||
|
@ -1178,6 +1186,7 @@ std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::
|
||||||
{"typical-p", llama_sampler_type::TYPICAL_P},
|
{"typical-p", llama_sampler_type::TYPICAL_P},
|
||||||
{"typical", llama_sampler_type::TYPICAL_P},
|
{"typical", llama_sampler_type::TYPICAL_P},
|
||||||
{"min-p", llama_sampler_type::MIN_P},
|
{"min-p", llama_sampler_type::MIN_P},
|
||||||
|
{"top-a", llama_sampler_type::TOP_A},
|
||||||
{"tfs-z", llama_sampler_type::TFS_Z},
|
{"tfs-z", llama_sampler_type::TFS_Z},
|
||||||
{"tfs", llama_sampler_type::TFS_Z},
|
{"tfs", llama_sampler_type::TFS_Z},
|
||||||
{"temp", llama_sampler_type::TEMPERATURE}
|
{"temp", llama_sampler_type::TEMPERATURE}
|
||||||
|
@ -1213,6 +1222,7 @@ std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & nam
|
||||||
{'p', llama_sampler_type::TOP_P},
|
{'p', llama_sampler_type::TOP_P},
|
||||||
{'y', llama_sampler_type::TYPICAL_P},
|
{'y', llama_sampler_type::TYPICAL_P},
|
||||||
{'m', llama_sampler_type::MIN_P},
|
{'m', llama_sampler_type::MIN_P},
|
||||||
|
{'a', llama_sampler_type::TOP_A},
|
||||||
{'f', llama_sampler_type::TFS_Z},
|
{'f', llama_sampler_type::TFS_Z},
|
||||||
{'t', llama_sampler_type::TEMPERATURE}
|
{'t', llama_sampler_type::TEMPERATURE}
|
||||||
};
|
};
|
||||||
|
@ -1235,6 +1245,7 @@ std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
|
||||||
case llama_sampler_type::TYPICAL_P: return "typical_p";
|
case llama_sampler_type::TYPICAL_P: return "typical_p";
|
||||||
case llama_sampler_type::TOP_P: return "top_p";
|
case llama_sampler_type::TOP_P: return "top_p";
|
||||||
case llama_sampler_type::MIN_P: return "min_p";
|
case llama_sampler_type::MIN_P: return "min_p";
|
||||||
|
case llama_sampler_type::TOP_A: return "top_a";
|
||||||
case llama_sampler_type::TEMPERATURE: return "temperature";
|
case llama_sampler_type::TEMPERATURE: return "temperature";
|
||||||
default : return "";
|
default : return "";
|
||||||
}
|
}
|
||||||
|
@ -1783,6 +1794,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
||||||
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
||||||
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
|
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
|
||||||
|
fprintf(stream, "top_a: %f # default: 0.0\n", sparams.top_a);
|
||||||
fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
|
fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
|
||||||
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
||||||
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
|
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
|
||||||
|
|
|
@ -98,10 +98,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
|
||||||
|
|
||||||
snprintf(result, sizeof(result),
|
snprintf(result, sizeof(result),
|
||||||
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
||||||
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
|
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, top_a = %.3f, typical_p = %.3f, temp = %.3f\n"
|
||||||
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
|
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
|
||||||
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
|
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
|
||||||
params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
|
params.top_k, params.tfs_z, params.top_p, params.min_p, params.top_a, params.typical_p, params.temp,
|
||||||
params.mirostat, params.mirostat_eta, params.mirostat_tau);
|
params.mirostat, params.mirostat_eta, params.mirostat_tau);
|
||||||
|
|
||||||
return std::string(result);
|
return std::string(result);
|
||||||
|
@ -135,6 +135,7 @@ static void sampler_queue(
|
||||||
const int32_t top_k = params.top_k;
|
const int32_t top_k = params.top_k;
|
||||||
const float top_p = params.top_p;
|
const float top_p = params.top_p;
|
||||||
const float min_p = params.min_p;
|
const float min_p = params.min_p;
|
||||||
|
const float top_a = params.top_a;
|
||||||
const float tfs_z = params.tfs_z;
|
const float tfs_z = params.tfs_z;
|
||||||
const float typical_p = params.typical_p;
|
const float typical_p = params.typical_p;
|
||||||
const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
|
const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
|
||||||
|
@ -146,6 +147,7 @@ static void sampler_queue(
|
||||||
case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
||||||
case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
||||||
case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
||||||
|
case llama_sampler_type::TOP_A : llama_sample_top_a (ctx_main, &cur_p, top_a, min_keep); break;
|
||||||
case llama_sampler_type::TEMPERATURE:
|
case llama_sampler_type::TEMPERATURE:
|
||||||
if (dynatemp_range > 0) {
|
if (dynatemp_range > 0) {
|
||||||
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
||||||
|
|
|
@ -13,6 +13,7 @@ enum class llama_sampler_type : char {
|
||||||
TOP_K = 'k',
|
TOP_K = 'k',
|
||||||
TOP_P = 'p',
|
TOP_P = 'p',
|
||||||
MIN_P = 'm',
|
MIN_P = 'm',
|
||||||
|
TOP_A = 'a',
|
||||||
TFS_Z = 'f',
|
TFS_Z = 'f',
|
||||||
TYPICAL_P = 'y',
|
TYPICAL_P = 'y',
|
||||||
TEMPERATURE = 't'
|
TEMPERATURE = 't'
|
||||||
|
@ -26,6 +27,7 @@ typedef struct llama_sampling_params {
|
||||||
int32_t top_k = 40; // <= 0 to use vocab size
|
int32_t top_k = 40; // <= 0 to use vocab size
|
||||||
float top_p = 0.95f; // 1.0 = disabled
|
float top_p = 0.95f; // 1.0 = disabled
|
||||||
float min_p = 0.05f; // 0.0 = disabled
|
float min_p = 0.05f; // 0.0 = disabled
|
||||||
|
float top_a = 0.00f; // 0.0 = disabled
|
||||||
float tfs_z = 1.00f; // 1.0 = disabled
|
float tfs_z = 1.00f; // 1.0 = disabled
|
||||||
float typical_p = 1.00f; // 1.0 = disabled
|
float typical_p = 1.00f; // 1.0 = disabled
|
||||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||||
|
@ -46,6 +48,7 @@ typedef struct llama_sampling_params {
|
||||||
llama_sampler_type::TYPICAL_P,
|
llama_sampler_type::TYPICAL_P,
|
||||||
llama_sampler_type::TOP_P,
|
llama_sampler_type::TOP_P,
|
||||||
llama_sampler_type::MIN_P,
|
llama_sampler_type::MIN_P,
|
||||||
|
llama_sampler_type::TOP_A,
|
||||||
llama_sampler_type::TEMPERATURE
|
llama_sampler_type::TEMPERATURE
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -213,6 +213,8 @@ node index.js
|
||||||
|
|
||||||
`min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token (default: 0.05).
|
`min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token (default: 0.05).
|
||||||
|
|
||||||
|
`top_a`: Limit the next token selection to a subset of tokens with a probability above a*P^2, where P is the most probable token (default: 0.0, 0.0 = disabled).
|
||||||
|
|
||||||
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
|
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
|
||||||
|
|
||||||
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
|
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
|
||||||
|
|
|
@ -830,6 +830,7 @@ struct server_context {
|
||||||
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
||||||
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
||||||
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
|
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
|
||||||
|
slot.sparams.top_a = json_value(data, "top_a", default_sparams.top_a);
|
||||||
slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
|
slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
|
||||||
slot.sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
|
slot.sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
|
||||||
slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
||||||
|
@ -1221,6 +1222,7 @@ struct server_context {
|
||||||
{"top_k", slot.sparams.top_k},
|
{"top_k", slot.sparams.top_k},
|
||||||
{"top_p", slot.sparams.top_p},
|
{"top_p", slot.sparams.top_p},
|
||||||
{"min_p", slot.sparams.min_p},
|
{"min_p", slot.sparams.min_p},
|
||||||
|
{"top_a", slot.sparams.top_a},
|
||||||
{"tfs_z", slot.sparams.tfs_z},
|
{"tfs_z", slot.sparams.tfs_z},
|
||||||
{"typical_p", slot.sparams.typical_p},
|
{"typical_p", slot.sparams.typical_p},
|
||||||
{"repeat_last_n", slot.sparams.penalty_last_n},
|
{"repeat_last_n", slot.sparams.penalty_last_n},
|
||||||
|
|
14
llama.cpp
14
llama.cpp
|
@ -10822,7 +10822,7 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
|
static void llama_sample_min_p_pow(struct llama_context * ctx, llama_token_data_array * candidates, float p, float pow, size_t min_keep) {
|
||||||
if (p <= 0.0f || !candidates->size) {
|
if (p <= 0.0f || !candidates->size) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -10839,7 +10839,7 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
|
||||||
for (size_t i = 0; i < candidates->size; ++i) {
|
for (size_t i = 0; i < candidates->size; ++i) {
|
||||||
max_logit = std::max(max_logit, candidates->data[i].logit);
|
max_logit = std::max(max_logit, candidates->data[i].logit);
|
||||||
}
|
}
|
||||||
const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
|
const float min_logit = max_logit + logf(p) * pow; // min logit for p_i >= p * p_max^pow
|
||||||
|
|
||||||
for (size_t i = 0; i < candidates->size; ++i) {
|
for (size_t i = 0; i < candidates->size; ++i) {
|
||||||
if (candidates->data[i].logit >= min_logit) {
|
if (candidates->data[i].logit >= min_logit) {
|
||||||
|
@ -10865,7 +10865,7 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
|
||||||
candidates->sorted = true;
|
candidates->sorted = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
|
const float min_logit = candidates->data[0].logit + logf(p) * pow; // min logit for p_i >= p * p_max^pow
|
||||||
size_t i = 1; // first token always matches
|
size_t i = 1; // first token always matches
|
||||||
|
|
||||||
for (; i < candidates->size; ++i) {
|
for (; i < candidates->size; ++i) {
|
||||||
|
@ -10883,6 +10883,14 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
|
||||||
|
llama_sample_min_p_pow(ctx, candidates, p, 1.f, min_keep);
|
||||||
|
}
|
||||||
|
|
||||||
|
void llama_sample_top_a(struct llama_context * ctx, llama_token_data_array * candidates, float a, size_t min_keep) {
|
||||||
|
llama_sample_min_p_pow(ctx, candidates, a, 2.f, min_keep);
|
||||||
|
}
|
||||||
|
|
||||||
void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
|
void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
|
||||||
if (z >= 1.0f || candidates->size <= 2) {
|
if (z >= 1.0f || candidates->size <= 2) {
|
||||||
return;
|
return;
|
||||||
|
|
7
llama.h
7
llama.h
|
@ -825,6 +825,13 @@ extern "C" {
|
||||||
float p,
|
float p,
|
||||||
size_t min_keep);
|
size_t min_keep);
|
||||||
|
|
||||||
|
/// @details Top-A sampling as described in https://github.com/BlinkDL/RWKV-LM/tree/4cb363e5aa31978d801a47bc89d28e927ab6912e#the-top-a-sampling-method
|
||||||
|
LLAMA_API void llama_sample_top_a(
|
||||||
|
struct llama_context * ctx,
|
||||||
|
llama_token_data_array * candidates,
|
||||||
|
float a,
|
||||||
|
size_t min_keep);
|
||||||
|
|
||||||
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
|
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
|
||||||
LLAMA_API void llama_sample_tail_free(
|
LLAMA_API void llama_sample_tail_free(
|
||||||
struct llama_context * ctx,
|
struct llama_context * ctx,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue