From cf7137e8d6d087202b245461b67f3ebd706c5c14 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Sat, 17 Feb 2024 13:18:00 +0100
Subject: [PATCH] server: document --n-predict

---
 examples/server/README.md  | 1 +
 examples/server/server.cpp | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index 249368749..fe5cd8d5d 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -39,6 +39,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
 - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
+- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)

 ## Build

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index a0b46970b..5cf1044d9 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1920,7 +1920,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
     printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
     printf(" --chat-template FORMAT_NAME");
-    printf(" set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str());
+    printf(" set chat template, possible values is: llama2, chatml (default %s)", sparams.chat_template.c_str());
+    printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
     printf("\n");
 }