From 9c99ef43d7009ccefba05e8116b9b6b5318ac8a2 Mon Sep 17 00:00:00 2001
From: pudepiedj
Date: Thu, 22 Feb 2024 16:45:05 +0000
Subject: [PATCH] small alterations

---
 Llamaserver.py             | 16 +++-------------
 examples/server/README.md  |  2 +-
 examples/server/server.cpp | 14 ++++++++------
 3 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/Llamaserver.py b/Llamaserver.py
index d9646dd95..1eddca92f 100644
--- a/Llamaserver.py
+++ b/Llamaserver.py
@@ -47,13 +47,13 @@ def make_progress_bar(bar, count, num_requests):
         print(f"Bar is now {bar}\n")
     return bar
 
-def send_request(q, system, question, event, count, num_requests):
+def send_request(q, question, event, count, num_requests):
     delay = 0.1
     global bar
 
-    data = {'system prompt': system, 'prompt': question}
+    data = {'prompt': question}
 
     try:
         response = requests.post(url, headers=headers, json=data)
@@ -106,11 +106,6 @@ if __name__ == "__main__":
                'User-Agent': 'Llamaserver.py'
                }
 
-    system = "You are a helpful and cheerful \
-assistant who answers questions briefly, \
-clearly and without undue repetition \
-paying very close attention to the requirements of the task set."
-
     country_list = ["France", "Germany", "China", "USA", "Italy", "India", "Ukraine", "Japan", "Australia",
                     "New Zealand", "Indonesia", "Nigeria", "Saudi Arabia", "Israel", "Egypt", "Kenya",
                     "Chile", "Mexico", "Canada",
@@ -123,7 +118,7 @@ paying very close attention to the requirements of the task set."
         # NOTE: don't pass the parameter as a function call; pass in args
         print(f"Processing request {i} / {num_requests}: {question}\n")
         event = threading.Event()
-        t = threading.Thread(target=send_request, args=(q, system, question, event, i, num_requests))
+        t = threading.Thread(target=send_request, args=(q, question, event, i, num_requests))
         t.start()
         threads.append(t)
 
@@ -134,8 +129,3 @@
     while not q.empty():
         text = q.get()
         print_dict(json.loads(text))
-
-
-
-
-
diff --git a/examples/server/README.md b/examples/server/README.md
index f6b9c7402..db4942439 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -504,7 +504,7 @@ After running the API server, you can use it in Python by setting the API base U
 openai.api_base = "http://:port"
 ```
 
-Then you can utilize llama.cpp as an OpenAI's **chat.completion** or **text_completion** API
+Then you can utilize llama.cpp as an OpenAI **chat.completion** or **text_completion** API
 
 ### Extending or building alternative Web Front End
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index c9aa4e68e..de1ae75ed 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -500,8 +500,8 @@ struct llama_server_context
             const int ga_w = params.grp_attn_w;
 
             if (ga_n != 1) {
-                GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
-                GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
+                GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
+                GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
                 //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
                 //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
                 LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
@@ -589,7 +589,7 @@ struct llama_server_context
 
             if (slot.id == id && slot.available())
             {
-                LOG_TEE("Using id-based available slot called by id: %d", slot.id);
+                LOG_TEE("Using id-based available slot called by id: %d\n", slot.id);
                 return &slot;
             }
 
@@ -2014,7 +2014,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
     printf("  --log-disable             disables logging to a file.\n");
     printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
-    printf("  -skvg, --show-graphics    enable graphics displaying kvcache occupancy (default: false)");
+    printf("  -skvg, --show-graphics    enable graphics displaying kvcache occupancy (default: false)\n");
     printf("  -skvi, --show-interactive-graphics\n");
     printf("                            enable graphics displaying kvcache occupancy with user pause (default: false)\n");
     printf("\n");
@@ -2022,8 +2022,10 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("                            advanced option to override model metadata by key. may be specified multiple times.\n");
     printf("                            types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
-    printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
-    printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
+    printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled),\n");
+    printf("                            used together with group attention width `--grp-attn-w`\n");
+    printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512),\n");
+    printf("                            used together with group attention factor `--grp-attn-n`\n");
     printf("  --chat-template JINJA_TEMPLATE\n");
     printf("                            set custom jinja chat template (default: template taken from model's metadata)\n");
     printf("                            Note: only commonly used templates are accepted, since we don't have jinja parser\n");
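
For reference, below is a minimal sketch of the request that the patched `Llamaserver.py` now sends, with the separate `'system prompt'` field dropped. The host, port, endpoint path, prompt text and `n_predict` value are illustrative assumptions (a llama.cpp server running locally on its default port), not taken from the patch itself.

```python
import requests

# Assumed local llama.cpp server address and completion endpoint; adjust to your setup.
url = "http://localhost:8080/completion"
headers = {"Content-Type": "application/json", "User-Agent": "Llamaserver.py"}

# After the patch, only 'prompt' is sent; the old 'system prompt' field is gone.
data = {"prompt": "Briefly describe the geography of France.", "n_predict": 128}

response = requests.post(url, headers=headers, json=data)
response.raise_for_status()
print(response.json().get("content", ""))
```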