sampling : refactor + optimize penalties sampler (#10803)
* sampling : refactor + optimize penalties sampler ggml-ci * common : apply ignore_eos as logit bias ggml-ci * batched : remove penalties sampler * params : allow penalty_last_n == -1 to be equal to context size ggml-ci * common : by default, move the penalties at the end of the sampling chain ggml-ci * common : ignore all EOG tokens Co-authored-by: Diego Devesa <slarengh@gmail.com> * common : move back the penalties at the front of the sampling chain ggml-ci * readme : restore hint about --ignore-eos flag [no ci] * llama : minor ggml-ci * webui : update --------- Co-authored-by: Diego Devesa <slarengh@gmail.com>
This commit is contained in:
parent
4ddd199f6f
commit
644fd71b44
17 changed files with 111 additions and 152 deletions
|
@ -135,7 +135,6 @@ struct slot_params {
|
|||
{"mirostat", sampling.mirostat},
|
||||
{"mirostat_tau", sampling.mirostat_tau},
|
||||
{"mirostat_eta", sampling.mirostat_eta},
|
||||
{"penalize_nl", sampling.penalize_nl},
|
||||
{"stop", antiprompt},
|
||||
{"max_tokens", n_predict}, // User configured n_predict
|
||||
{"n_keep", n_keep},
|
||||
|
@ -184,6 +183,7 @@ struct server_task {
|
|||
|
||||
static slot_params params_from_json_cmpl(
|
||||
const llama_model * model,
|
||||
const llama_context * ctx,
|
||||
const common_params & params_base,
|
||||
const json & data) {
|
||||
slot_params params;
|
||||
|
@ -226,7 +226,6 @@ struct server_task {
|
|||
params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat);
|
||||
params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau);
|
||||
params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta);
|
||||
params.sampling.penalize_nl = json_value(data, "penalize_nl", defaults.sampling.penalize_nl);
|
||||
params.sampling.seed = json_value(data, "seed", defaults.sampling.seed);
|
||||
params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs);
|
||||
params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);
|
||||
|
@ -239,8 +238,27 @@ struct server_task {
|
|||
params.speculative.n_min = std::max(params.speculative.n_min, 2);
|
||||
params.speculative.n_max = std::max(params.speculative.n_max, 0);
|
||||
|
||||
// TODO: add more sanity checks for the input parameters
|
||||
|
||||
if (params.sampling.penalty_last_n < -1) {
|
||||
throw std::runtime_error("Error: repeat_last_n must be >= -1");
|
||||
}
|
||||
|
||||
if (params.sampling.dry_penalty_last_n < -1) {
|
||||
throw std::runtime_error("Error: dry_penalty_last_n must be >= -1");
|
||||
}
|
||||
|
||||
if (params.sampling.penalty_last_n == -1) {
|
||||
// note: should be the slot's context and not the full context, but it's ok
|
||||
params.sampling.penalty_last_n = llama_n_ctx(ctx);
|
||||
}
|
||||
|
||||
if (params.sampling.dry_penalty_last_n == -1) {
|
||||
params.sampling.dry_penalty_last_n = llama_n_ctx(ctx);
|
||||
}
|
||||
|
||||
if (params.sampling.dry_base < 1.0f) {
|
||||
params.sampling.dry_base = defaults.sampling.dry_base;
|
||||
params.sampling.dry_base = defaults.sampling.dry_base;
|
||||
}
|
||||
|
||||
// sequence breakers for DRY
|
||||
|
@ -1469,7 +1487,7 @@ struct server_context {
|
|||
n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
add_bos_token = llama_add_bos_token(model);
|
||||
has_eos_token = !llama_add_eos_token(model);
|
||||
has_eos_token = llama_token_eos(model) != LLAMA_TOKEN_NULL;
|
||||
|
||||
if (!params_base.speculative.model.empty()) {
|
||||
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
|
||||
|
@ -3381,7 +3399,7 @@ int main(int argc, char ** argv) {
|
|||
task.index = i;
|
||||
|
||||
task.prompt_tokens = std::move(tokenized_prompts[i]);
|
||||
task.params = server_task::params_from_json_cmpl(ctx_server.model, ctx_server.params_base, data);
|
||||
task.params = server_task::params_from_json_cmpl(ctx_server.model, ctx_server.ctx, ctx_server.params_base, data);
|
||||
task.id_selected_slot = json_value(data, "id_slot", -1);
|
||||
|
||||
// OAI-compat
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue