small alterations
This commit is contained in:
parent
298207185d
commit
9c99ef43d7
3 changed files with 12 additions and 20 deletions
|
@ -47,13 +47,13 @@ def make_progress_bar(bar, count, num_requests):
|
||||||
print(f"Bar is now {bar}\n")
|
print(f"Bar is now {bar}\n")
|
||||||
return bar
|
return bar
|
||||||
|
|
||||||
def send_request(q, system, question, event, count, num_requests):
|
def send_request(q, question, event, count, num_requests):
|
||||||
|
|
||||||
delay = 0.1
|
delay = 0.1
|
||||||
|
|
||||||
global bar
|
global bar
|
||||||
|
|
||||||
data = {'system prompt': system, 'prompt': question}
|
data = {'prompt': question}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = requests.post(url, headers=headers, json=data)
|
response = requests.post(url, headers=headers, json=data)
|
||||||
|
@ -106,11 +106,6 @@ if __name__ == "__main__":
|
||||||
'User-Agent': 'Llamaserver.py'
|
'User-Agent': 'Llamaserver.py'
|
||||||
}
|
}
|
||||||
|
|
||||||
system = "You are a helpful and cheerful \
|
|
||||||
assistant who answers questions briefly, \
|
|
||||||
clearly and without undue repetition \
|
|
||||||
paying very close attention to the requirements of the task set."
|
|
||||||
|
|
||||||
country_list = ["France", "Germany", "China", "USA", "Italy", "India",
|
country_list = ["France", "Germany", "China", "USA", "Italy", "India",
|
||||||
"Ukraine", "Japan", "Australia", "New Zealand", "Indonesia", "Nigeria", "Saudi Arabia",
|
"Ukraine", "Japan", "Australia", "New Zealand", "Indonesia", "Nigeria", "Saudi Arabia",
|
||||||
"Israel", "Egypt", "Kenya", "Chile", "Mexico", "Canada",
|
"Israel", "Egypt", "Kenya", "Chile", "Mexico", "Canada",
|
||||||
|
@ -123,7 +118,7 @@ paying very close attention to the requirements of the task set."
|
||||||
# NOTE: don't pass the parameter as a function call; pass in args
|
# NOTE: don't pass the parameter as a function call; pass in args
|
||||||
print(f"Processing request {i} / {num_requests}: {question}\n")
|
print(f"Processing request {i} / {num_requests}: {question}\n")
|
||||||
event = threading.Event()
|
event = threading.Event()
|
||||||
t = threading.Thread(target=send_request, args=(q, system, question, event, i, num_requests))
|
t = threading.Thread(target=send_request, args=(q, question, event, i, num_requests))
|
||||||
t.start()
|
t.start()
|
||||||
threads.append(t)
|
threads.append(t)
|
||||||
|
|
||||||
|
@ -134,8 +129,3 @@ paying very close attention to the requirements of the task set."
|
||||||
while not q.empty():
|
while not q.empty():
|
||||||
text = q.get()
|
text = q.get()
|
||||||
print_dict(json.loads(text))
|
print_dict(json.loads(text))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -504,7 +504,7 @@ After running the API server, you can use it in Python by setting the API base U
|
||||||
openai.api_base = "http://<Your api-server IP>:port"
|
openai.api_base = "http://<Your api-server IP>:port"
|
||||||
```
|
```
|
||||||
|
|
||||||
Then you can utilize llama.cpp as an OpenAI's **chat.completion** or **text_completion** API
|
Then you can utilize llama.cpp as an OpenAI **chat.completion** or **text_completion** API
|
||||||
|
|
||||||
### Extending or building alternative Web Front End
|
### Extending or building alternative Web Front End
|
||||||
|
|
||||||
|
|
|
@ -500,8 +500,8 @@ struct llama_server_context
|
||||||
const int ga_w = params.grp_attn_w;
|
const int ga_w = params.grp_attn_w;
|
||||||
|
|
||||||
if (ga_n != 1) {
|
if (ga_n != 1) {
|
||||||
GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
|
GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
|
||||||
GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
|
GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
|
||||||
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
|
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
|
||||||
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
|
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
|
||||||
LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
|
LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
|
||||||
|
@ -589,7 +589,7 @@ struct llama_server_context
|
||||||
|
|
||||||
if (slot.id == id && slot.available())
|
if (slot.id == id && slot.available())
|
||||||
{
|
{
|
||||||
LOG_TEE("Using id-based available slot called by id: %d", slot.id);
|
LOG_TEE("Using id-based available slot called by id: %d\n", slot.id);
|
||||||
return &slot;
|
return &slot;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2014,7 +2014,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
|
||||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
||||||
printf(" --log-disable disables logging to a file.\n");
|
printf(" --log-disable disables logging to a file.\n");
|
||||||
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
|
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
|
||||||
printf(" -skvg, --show-graphics enable graphics displaying kvcache occupancy (default: false)");
|
printf(" -skvg, --show-graphics enable graphics displaying kvcache occupancy (default: false)\n");
|
||||||
printf(" -skvi, --show-interactive-graphics\n");
|
printf(" -skvi, --show-interactive-graphics\n");
|
||||||
printf(" enable graphics displaying kvcache occupancy with user pause (default: false)\n");
|
printf(" enable graphics displaying kvcache occupancy with user pause (default: false)\n");
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
@ -2022,8 +2022,10 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
|
||||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||||
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
|
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
|
||||||
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||||
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
|
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled),\n");
|
||||||
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
|
printf(" used together with group attention width `--grp-attn-w`\n");
|
||||||
|
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512),\n");
|
||||||
|
printf(" used together with group attention factor `--grp-attn-n`\n");
|
||||||
printf(" --chat-template JINJA_TEMPLATE\n");
|
printf(" --chat-template JINJA_TEMPLATE\n");
|
||||||
printf(" set custom jinja chat template (default: template taken from model's metadata)\n");
|
printf(" set custom jinja chat template (default: template taken from model's metadata)\n");
|
||||||
printf(" Note: only commonly used templates are accepted, since we don't have jinja parser\n");
|
printf(" Note: only commonly used templates are accepted, since we don't have jinja parser\n");
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue