small alterations

pudepiedj 2024-02-22 16:45:05 +00:00
parent 298207185d
commit 9c99ef43d7
3 changed files with 12 additions and 20 deletions

View file

@@ -47,13 +47,13 @@ def make_progress_bar(bar, count, num_requests):
     print(f"Bar is now {bar}\n")
     return bar
-def send_request(q, system, question, event, count, num_requests):
+def send_request(q, question, event, count, num_requests):
     delay = 0.1
     global bar
-    data = {'system prompt': system, 'prompt': question}
+    data = {'prompt': question}
     try:
         response = requests.post(url, headers=headers, json=data)
@@ -106,11 +106,6 @@ if __name__ == "__main__":
         'User-Agent': 'Llamaserver.py'
     }
-    system = "You are a helpful and cheerful \
-assistant who answers questions briefly, \
-clearly and without undue repetition \
-paying very close attention to the requirements of the task set."
     country_list = ["France", "Germany", "China", "USA", "Italy", "India",
         "Ukraine", "Japan", "Australia", "New Zealand", "Indonesia", "Nigeria", "Saudi Arabia",
         "Israel", "Egypt", "Kenya", "Chile", "Mexico", "Canada",
@@ -123,7 +118,7 @@ paying very close attention to the requirements of the task set."
         # NOTE: don't pass the parameter as a function call; pass in args
         print(f"Processing request {i} / {num_requests}: {question}\n")
         event = threading.Event()
-        t = threading.Thread(target=send_request, args=(q, system, question, event, i, num_requests))
+        t = threading.Thread(target=send_request, args=(q, question, event, i, num_requests))
         t.start()
         threads.append(t)
@@ -134,8 +129,3 @@ paying very close attention to the requirements of the task set."
     while not q.empty():
         text = q.get()
         print_dict(json.loads(text))
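The `# NOTE` in the loop above concerns how `threading.Thread` receives its worker's arguments: the callable goes in `target=` and its arguments go separately in `args=`; writing `target=send_request(...)` would call the function immediately in the main thread and hand `Thread` its return value instead of a callable. A minimal, self-contained sketch of that pattern (the worker body is a stand-in for the script's actual HTTP request):

```python
import queue
import threading

def send_request(q, question, event, count, num_requests):
    # Stand-in worker: the real script POSTs `question` to the llama.cpp server here.
    q.put(f"({count}/{num_requests}) answered: {question}")
    event.set()

q = queue.Queue()
questions = ["Describe France briefly.", "Describe Japan briefly."]
threads = []
for i, question in enumerate(questions, start=1):
    event = threading.Event()
    # Pass the callable and its args separately; do NOT write target=send_request(...).
    t = threading.Thread(target=send_request, args=(q, question, event, i, len(questions)))
    t.start()
    threads.append(t)

for t in threads:
    t.join()
while not q.empty():
    print(q.get())
```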

View file

@@ -504,7 +504,7 @@ After running the API server, you can use it in Python by setting the API base URL
 openai.api_base = "http://<Your api-server IP>:port"
 ```
-Then you can utilize llama.cpp as an OpenAI's **chat.completion** or **text_completion** API
+Then you can utilize llama.cpp as an OpenAI **chat.completion** or **text_completion** API
 ### Extending or building alternative Web Front End
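For context, the usage these documentation lines describe looks roughly like this with the legacy `openai` Python package (pre-1.0 interface); the base URL follows the placeholder shown above, and the model name and key are placeholders too, not values taken from this commit:

```python
import openai

# Point the client at the llama.cpp API server instead of api.openai.com
# (replace with your server's IP and port).
openai.api_base = "http://<Your api-server IP>:port"
openai.api_key = "sk-no-key-required"  # placeholder

response = openai.ChatCompletion.create(
    model="local-model",  # placeholder model name
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a one-sentence greeting."},
    ],
)
print(response["choices"][0]["message"]["content"])
```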

View file

@@ -500,8 +500,8 @@ struct llama_server_context
     const int ga_w = params.grp_attn_w;
     if (ga_n != 1) {
-        GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
-        GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
+        GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
+        GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
         //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
         //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
         LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
@@ -589,7 +589,7 @@ struct llama_server_context
     if (slot.id == id && slot.available())
     {
-        LOG_TEE("Using id-based available slot called by id: %d", slot.id);
+        LOG_TEE("Using id-based available slot called by id: %d\n", slot.id);
         return &slot;
     }
@@ -2014,7 +2014,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
     printf(" --log-disable disables logging to a file.\n");
     printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
-    printf(" -skvg, --show-graphics enable graphics displaying kvcache occupancy (default: false)");
+    printf(" -skvg, --show-graphics enable graphics displaying kvcache occupancy (default: false)\n");
     printf(" -skvi, --show-interactive-graphics\n");
     printf(" enable graphics displaying kvcache occupancy with user pause (default: false)\n");
     printf("\n");
@@ -2022,8 +2022,10 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf(" --override-kv KEY=TYPE:VALUE\n");
     printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
     printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
-    printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
-    printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
+    printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled),\n");
+    printf(" used together with group attention width `--grp-attn-w`\n");
+    printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512),\n");
+    printf(" used together with group attention factor `--grp-attn-n`\n");
     printf(" --chat-template JINJA_TEMPLATE\n");
     printf(" set custom jinja chat template (default: template taken from model's metadata)\n");
     printf(" Note: only commonly used templates are accepted, since we don't have jinja parser\n");