From eb082012278036b020f117b69545b85b5fa12870 Mon Sep 17 00:00:00 2001
From: FSSRepo
Date: Fri, 13 Oct 2023 14:28:06 -0400
Subject: [PATCH] add changes to README.md

---
 examples/server/README.md  | 8 ++++++++
 examples/server/server.cpp | 8 ++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index 8a079ae26..e28e5845b 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -24,6 +24,8 @@ Command line options:
 - `--port`: Set the port to listen. Default: `8080`.
 - `--path`: path from which to serve static files (default examples/server/public)
 - `--embedding`: Enable embedding extraction, Default: disabled.
+- `-np N`, `--parallel N`: Set the number of slots for processing requests (default: 1)
+- `-cb`, `--cont-batching`: enable continuous batching (a.k.a. dynamic batching) (default: disabled)
 
 ## Build
 
@@ -158,6 +160,12 @@ node index.js
 
     `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
 
+    `slot_id`: Assign the completion task to a specific slot. If set to -1, the task will be assigned to an idle slot (default: -1)
+
+    `cache_prompt`: Save the prompt and generation state to avoid reprocessing the entire prompt if a part of it has not changed (default: false)
+
+    `system_prompt`: Change the system prompt (initial prompt of all slots); this is useful for chat applications.
+
 - **POST** `/tokenize`: Tokenize a given text.
 
     *Options:*
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index d839d6ce5..0741ebf11 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -78,8 +78,8 @@ enum slot_command {
 
 struct slot_params {
     bool stream = true;
-    uint32_t seed = -1; // RNG seed
-    int32_t n_predict = -1; // new tokens to predict
+    uint32_t seed      = -1; // RNG seed
+    int32_t n_predict  = -1; // new tokens to predict
     std::string grammar = ""; // optional BNF-like grammar to constrain sampling
     bool cache_prompt = false; // remember a the prompt to avoid reprocessing all prompt
     std::vector<std::string> antiprompt;
@@ -563,7 +563,7 @@ struct llama_server_context
     }
 
     void processSystemPromptData(json sys_props) {
-        system_prompt = sys_props.value("system_prompt", "");
+        system_prompt = sys_props.value("prompt", "");
         user_name = sys_props.value("anti_prompt", "");
         assistant_name = sys_props.value("assistant_name", "");
         notifySystemPromptChanged();
@@ -872,7 +872,7 @@ struct llama_server_context
             return true;
         }
 
-        // context shift
+        // context shift takes effect only when there is a single slot
         if(slots.size() == 1) {
            llama_client_slot slot = slots[0];
            if (slot.cache_tokens.size() >= (size_t)n_ctx)
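
The options documented above can be exercised together. The following is a minimal sketch, not part of the patch: it assumes the `server` binary and `/completion` endpoint that this README already documents, the default port `8080`, and a placeholder model path and context size; `-np`, `-cb`, `slot_id`, and `cache_prompt` are the additions described in the diff.

```sh
# Start the server with 4 slots and continuous batching enabled
# (model path and context size are placeholders)
./server -m models/7B/ggml-model.gguf -c 4096 -np 4 -cb

# Pin a completion to slot 0 and cache the prompt, so a follow-up request
# that extends the same prompt only reprocesses the newly added part
curl --request POST \
    --url http://localhost:8080/completion \
    --header "Content-Type: application/json" \
    --data '{
        "prompt": "Building a website can be done in 10 simple steps:",
        "n_predict": 64,
        "slot_id": 0,
        "cache_prompt": true
    }'
```

Per the `processSystemPromptData` change in the server.cpp hunk, the `system_prompt` field is read as an object whose `prompt`, `anti_prompt`, and `assistant_name` keys set the shared initial prompt, user name, and assistant name for all slots, which is how a chat front end would reset the system prompt at runtime.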