diff --git a/examples/server/README.md b/examples/server/README.md
index 249368749..fe5cd8d5d 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -39,6 +39,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
 - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
+- `-n N, --n-predict N`: Set the maximum number of tokens to predict (default: -1)
 
 ## Build
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index a0b46970b..5cf1044d9 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1920,7 +1920,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  -gan N, --grp-attn-n N  set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
     printf("  -gaw N, --grp-attn-w N  set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
     printf("  --chat-template FORMAT_NAME");
-    printf("                          set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str());
+    printf("                          set chat template, possible values are: llama2, chatml (default: %s)", sparams.chat_template.c_str());
+    printf("  -n N, --n-predict N     maximum tokens to predict (default: %d)\n", params.n_predict);
     printf("\n");
 }
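
For context, a minimal usage sketch of the flag this patch adds. The model path, port, and prompt below are illustrative placeholders, not part of the patch:

```sh
# Start the server with generation capped at 128 tokens by default
./server -m models/7B/ggml-model.gguf --port 8080 -n 128

# The /completion endpoint's own n_predict field still applies per request
curl --request POST --url http://localhost:8080/completion \
     --header "Content-Type: application/json" \
     --data '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 64}'
```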