small alterations

pudepiedj 2024-02-22 16:45:05 +00:00
parent 298207185d
commit 9c99ef43d7
3 changed files with 12 additions and 20 deletions

View file

@@ -47,13 +47,13 @@ def make_progress_bar(bar, count, num_requests):
     print(f"Bar is now {bar}\n")
     return bar
-def send_request(q, system, question, event, count, num_requests):
+def send_request(q, question, event, count, num_requests):
     delay = 0.1
     global bar
-    data = {'system prompt': system, 'prompt': question}
+    data = {'prompt': question}
     try:
         response = requests.post(url, headers=headers, json=data)
@@ -106,11 +106,6 @@ if __name__ == "__main__":
         'User-Agent': 'Llamaserver.py'
     }
-    system = "You are a helpful and cheerful \
-assistant who answers questions briefly, \
-clearly and without undue repetition \
-paying very close attention to the requirements of the task set."
     country_list = ["France", "Germany", "China", "USA", "Italy", "India",
         "Ukraine", "Japan", "Australia", "New Zealand", "Indonesia", "Nigeria", "Saudi Arabia",
         "Israel", "Egypt", "Kenya", "Chile", "Mexico", "Canada",
@@ -123,7 +118,7 @@ paying very close attention to the requirements of the task set."
         # NOTE: don't pass the parameter as a function call; pass in args
         print(f"Processing request {i} / {num_requests}: {question}\n")
         event = threading.Event()
-        t = threading.Thread(target=send_request, args=(q, system, question, event, i, num_requests))
+        t = threading.Thread(target=send_request, args=(q, question, event, i, num_requests))
         t.start()
         threads.append(t)
@@ -134,8 +129,3 @@ paying very close attention to the requirements of the task set."
     while not q.empty():
         text = q.get()
         print_dict(json.loads(text))
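The `# NOTE` in the loop above concerns how `threading.Thread` receives its worker's arguments: the callable goes in `target=` and its arguments go separately in `args=`; writing `target=send_request(...)` would call the function immediately in the main thread and hand `Thread` its return value instead of a callable. A minimal, self-contained sketch of that pattern (the worker body is a stand-in for the script's actual HTTP request):

```python
import queue
import threading

def send_request(q, question, event, count, num_requests):
    # Stand-in worker: the real script POSTs `question` to the llama.cpp server here.
    q.put(f"({count}/{num_requests}) answered: {question}")
    event.set()

q = queue.Queue()
questions = ["Describe France briefly.", "Describe Japan briefly."]
threads = []
for i, question in enumerate(questions, start=1):
    event = threading.Event()
    # Pass the callable and its args separately; do NOT write target=send_request(...).
    t = threading.Thread(target=send_request, args=(q, question, event, i, len(questions)))
    t.start()
    threads.append(t)

for t in threads:
    t.join()
while not q.empty():
    print(q.get())
```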

View file

@@ -504,7 +504,7 @@ After running the API server, you can use it in Python by setting the API base URL
 openai.api_base = "http://<Your api-server IP>:port"
 ```
-Then you can utilize llama.cpp as an OpenAI's **chat.completion** or **text_completion** API
+Then you can utilize llama.cpp as an OpenAI **chat.completion** or **text_completion** API
 ### Extending or building alternative Web Front End
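For context, the usage these documentation lines describe looks roughly like this with the legacy `openai` Python package (pre-1.0 interface); the base URL follows the placeholder shown above, and the model name and key are placeholders too, not values taken from this commit:

```python
import openai

# Point the client at the llama.cpp API server instead of api.openai.com
# (replace with your server's IP and port).
openai.api_base = "http://<Your api-server IP>:port"
openai.api_key = "sk-no-key-required"  # placeholder

response = openai.ChatCompletion.create(
    model="local-model",  # placeholder model name
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a one-sentence greeting."},
    ],
)
print(response["choices"][0]["message"]["content"])
```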

View file

@@ -500,8 +500,8 @@ struct llama_server_context
     const int ga_w = params.grp_attn_w;
     if (ga_n != 1) {
-        GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
-        GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
+        GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
+        GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
         //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
         //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
         LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
@@ -589,7 +589,7 @@ struct llama_server_context
     if (slot.id == id && slot.available())
     {
-        LOG_TEE("Using id-based available slot called by id: %d", slot.id);
+        LOG_TEE("Using id-based available slot called by id: %d\n", slot.id);
         return &slot;
     }
@@ -2014,7 +2014,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
     printf(" --log-disable disables logging to a file.\n");
     printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
-    printf(" -skvg, --show-graphics enable graphics displaying kvcache occupancy (default: false)");
+    printf(" -skvg, --show-graphics enable graphics displaying kvcache occupancy (default: false)\n");
     printf(" -skvi, --show-interactive-graphics\n");
     printf(" enable graphics displaying kvcache occupancy with user pause (default: false)\n");
     printf("\n");
@@ -2022,8 +2022,10 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf(" --override-kv KEY=TYPE:VALUE\n");
     printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
     printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
-    printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
-    printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
+    printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled),\n");
+    printf(" used together with group attention width `--grp-attn-w`\n");
+    printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512),\n");
+    printf(" used together with group attention factor `--grp-attn-n`\n");
     printf(" --chat-template JINJA_TEMPLATE\n");
     printf(" set custom jinja chat template (default: template taken from model's metadata)\n");
     printf(" Note: only commonly used templates are accepted, since we don't have jinja parser\n");