small alterations
parent 298207185d
commit 9c99ef43d7
3 changed files with 12 additions and 20 deletions
@@ -47,13 +47,13 @@ def make_progress_bar(bar, count, num_requests):
     print(f"Bar is now {bar}\n")
     return bar
 
-def send_request(q, system, question, event, count, num_requests):
+def send_request(q, question, event, count, num_requests):
 
     delay = 0.1
 
     global bar
 
-    data = {'system prompt': system, 'prompt': question}
+    data = {'prompt': question}
 
     try:
         response = requests.post(url, headers=headers, json=data)
@@ -106,11 +106,6 @@ if __name__ == "__main__":
         'User-Agent': 'Llamaserver.py'
     }
 
-    system = "You are a helpful and cheerful \
-assistant who answers questions briefly, \
-clearly and without undue repetition \
-paying very close attention to the requirements of the task set."
-
     country_list = ["France", "Germany", "China", "USA", "Italy", "India",
                     "Ukraine", "Japan", "Australia", "New Zealand", "Indonesia", "Nigeria", "Saudi Arabia",
                     "Israel", "Egypt", "Kenya", "Chile", "Mexico", "Canada",
@@ -123,7 +118,7 @@ paying very close attention to the requirements of the task set.
         # NOTE: don't pass the parameter as a function call; pass in args
         print(f"Processing request {i} / {num_requests}: {question}\n")
         event = threading.Event()
-        t = threading.Thread(target=send_request, args=(q, system, question, event, i, num_requests))
+        t = threading.Thread(target=send_request, args=(q, question, event, i, num_requests))
         t.start()
         threads.append(t)
 
@@ -134,8 +129,3 @@ paying very close attention to the requirements of the task set.
     while not q.empty():
         text = q.get()
         print_dict(json.loads(text))
-
-
-
-
-
@@ -504,7 +504,7 @@ After running the API server, you can use it in Python by setting the API base URL
 openai.api_base = "http://<Your api-server IP>:port"
 ```
 
-Then you can utilize llama.cpp as an OpenAI's **chat.completion** or **text_completion** API
+Then you can utilize llama.cpp as an OpenAI **chat.completion** or **text_completion** API
 
 ### Extending or building alternative Web Front End
 
@@ -500,8 +500,8 @@ struct llama_server_context
             const int ga_w = params.grp_attn_w;
 
             if (ga_n != 1) {
-                GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
-                GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
+                GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
+                GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
                 //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
                 //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
                 LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
@@ -589,7 +589,7 @@ struct llama_server_context
 
             if (slot.id == id && slot.available())
             {
-                LOG_TEE("Using id-based available slot called by id: %d", slot.id);
+                LOG_TEE("Using id-based available slot called by id: %d\n", slot.id);
                 return &slot;
             }
 
@@ -2014,7 +2014,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
     printf(" --log-disable disables logging to a file.\n");
     printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
-    printf(" -skvg, --show-graphics enable graphics displaying kvcache occupancy (default: false)");
+    printf(" -skvg, --show-graphics enable graphics displaying kvcache occupancy (default: false)\n");
     printf(" -skvi, --show-interactive-graphics\n");
     printf(" enable graphics displaying kvcache occupancy with user pause (default: false)\n");
     printf("\n");
@@ -2022,8 +2022,10 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf(" --override-kv KEY=TYPE:VALUE\n");
     printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
     printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
-    printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
-    printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
+    printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled),\n");
+    printf(" used together with group attention width `--grp-attn-w`\n");
+    printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512),\n");
+    printf(" used together with group attention factor `--grp-attn-n`\n");
     printf(" --chat-template JINJA_TEMPLATE\n");
     printf(" set custom jinja chat template (default: template taken from model's metadata)\n");
     printf(" Note: only commonly used templates are accepted, since we don't have jinja parser\n");