From 9c99ef43d7009ccefba05e8116b9b6b5318ac8a2 Mon Sep 17 00:00:00 2001
From: pudepiedj
Date: Thu, 22 Feb 2024 16:45:05 +0000
Subject: [PATCH] small alterations

---
 Llamaserver.py             | 16 +++-------------
 examples/server/README.md  |  2 +-
 examples/server/server.cpp | 14 ++++++++------
 3 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/Llamaserver.py b/Llamaserver.py
index d9646dd95..1eddca92f 100644
--- a/Llamaserver.py
+++ b/Llamaserver.py
@@ -47,13 +47,13 @@ def make_progress_bar(bar, count, num_requests):
         print(f"Bar is now {bar}\n")
     return bar
 
-def send_request(q, system, question, event, count, num_requests):
+def send_request(q, question, event, count, num_requests):
     delay = 0.1
     global bar
 
-    data = {'system prompt': system, 'prompt': question}
+    data = {'prompt': question}
 
     try:
         response = requests.post(url, headers=headers, json=data)
@@ -106,11 +106,6 @@ if __name__ == "__main__":
                'User-Agent': 'Llamaserver.py'
                }
 
-    system = "You are a helpful and cheerful \
-assistant who answers questions briefly, \
-clearly and without undue repetition \
-paying very close attention to the requirements of the task set."
-
     country_list = ["France", "Germany", "China", "USA", "Italy", "India", "Ukraine", "Japan", "Australia",
                     "New Zealand", "Indonesia", "Nigeria", "Saudi Arabia", "Israel", "Egypt", "Kenya",
                     "Chile", "Mexico", "Canada",
@@ -123,7 +118,7 @@ paying very close attention to the requirements of the task set."
         # NOTE: don't pass the parameter as a function call; pass in args
         print(f"Processing request {i} / {num_requests}: {question}\n")
         event = threading.Event()
-        t = threading.Thread(target=send_request, args=(q, system, question, event, i, num_requests))
+        t = threading.Thread(target=send_request, args=(q, question, event, i, num_requests))
         t.start()
         threads.append(t)
 
@@ -134,8 +129,3 @@
     while not q.empty():
         text = q.get()
         print_dict(json.loads(text))
-
-
-
-
-
diff --git a/examples/server/README.md b/examples/server/README.md
index f6b9c7402..db4942439 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -504,7 +504,7 @@ After running the API server, you can use it in Python by setting the API base U
 openai.api_base = "http://:port"
 ```
 
-Then you can utilize llama.cpp as an OpenAI's **chat.completion** or **text_completion** API
+Then you can utilize llama.cpp as an OpenAI **chat.completion** or **text_completion** API
 
 ### Extending or building alternative Web Front End
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index c9aa4e68e..de1ae75ed 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -500,8 +500,8 @@ struct llama_server_context
             const int ga_w = params.grp_attn_w;
 
             if (ga_n != 1) {
-                GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
-                GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
+                GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
+                GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
                 //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
                 //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
                 LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
@@ -589,7 +589,7 @@ struct llama_server_context
 
             if (slot.id == id && slot.available())
             {
-                LOG_TEE("Using id-based available slot called by id: %d", slot.id);
+                LOG_TEE("Using id-based available slot called by id: %d\n", slot.id);
                 return &slot;
             }
 
@@ -2014,7 +2014,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
     printf("  --log-disable             disables logging to a file.\n");
     printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
-    printf("  -skvg, --show-graphics    enable graphics displaying kvcache occupancy (default: false)");
+    printf("  -skvg, --show-graphics    enable graphics displaying kvcache occupancy (default: false)\n");
     printf("  -skvi, --show-interactive-graphics\n");
     printf("                            enable graphics displaying kvcache occupancy with user pause (default: false)\n");
     printf("\n");
@@ -2022,8 +2022,10 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("                            advanced option to override model metadata by key. may be specified multiple times.\n");
     printf("                            types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
-    printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
-    printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
+    printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled),\n");
+    printf("                            used together with group attention width `--grp-attn-w`\n");
+    printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512),\n");
+    printf("                            used together with group attention factor `--grp-attn-n`\n");
     printf("  --chat-template JINJA_TEMPLATE\n");
     printf("                            set custom jinja chat template (default: template taken from model's metadata)\n");
     printf("                            Note: only commonly used templates are accepted, since we don't have jinja parser\n");
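
For reference, below is a minimal sketch of the request that the patched `Llamaserver.py` now sends, with the separate `'system prompt'` field dropped. The host, port, endpoint path, prompt text and `n_predict` value are illustrative assumptions (a llama.cpp server running locally on its default port), not taken from the patch itself.

```python
import requests

# Assumed local llama.cpp server address and completion endpoint; adjust to your setup.
url = "http://localhost:8080/completion"
headers = {"Content-Type": "application/json", "User-Agent": "Llamaserver.py"}

# After the patch, only 'prompt' is sent; the old 'system prompt' field is gone.
data = {"prompt": "Briefly describe the geography of France.", "n_predict": 128}

response = requests.post(url, headers=headers, json=data)
response.raise_for_status()
print(response.json().get("content", ""))
```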