improved python client lower threadpool
This commit is contained in:
parent
f7e29e5248
commit
1b04d5907b
4 changed files with 49 additions and 32 deletions
|
@ -17,10 +17,16 @@ def print_dict(data):
|
||||||
print_dict(entry)
|
print_dict(entry)
|
||||||
elif isinstance(data, str):
|
elif isinstance(data, str):
|
||||||
print(f"Incoming string is {data}.\n")
|
print(f"Incoming string is {data}.\n")
|
||||||
|
else:
|
||||||
|
print("No intelligible data received.\n")
|
||||||
return
|
return
|
||||||
|
|
||||||
def print_response(text):
|
def title_print(text):
|
||||||
|
|
||||||
|
length = len(text)
|
||||||
|
print("\n" + "*" * length)
|
||||||
print(text)
|
print(text)
|
||||||
|
print("*" * length + "\n")
|
||||||
|
|
||||||
def make_empty_bar(num_requests):
|
def make_empty_bar(num_requests):
|
||||||
bar = []
|
bar = []
|
||||||
|
@ -28,7 +34,7 @@ def make_empty_bar(num_requests):
|
||||||
bar.append("\u2589")
|
bar.append("\u2589")
|
||||||
bar = ' '.join(bar)
|
bar = ' '.join(bar)
|
||||||
bar = bar.replace(' ','')
|
bar = bar.replace(' ','')
|
||||||
print(f"Bar is now {bar}.\n")
|
# print(f"Bar is now {bar}.\n")
|
||||||
return bar
|
return bar
|
||||||
|
|
||||||
def make_progress_bar(bar, count, num_requests):
|
def make_progress_bar(bar, count, num_requests):
|
||||||
|
@ -38,39 +44,41 @@ def make_progress_bar(bar, count, num_requests):
|
||||||
if i == count:
|
if i == count:
|
||||||
# print(f"Bar position {i} is {bar[i]}\n")
|
# print(f"Bar position {i} is {bar[i]}\n")
|
||||||
bar = bar[:i*stride1] + "\u23F1" + bar[i*stride1 + stride2:]
|
bar = bar[:i*stride1] + "\u23F1" + bar[i*stride1 + stride2:]
|
||||||
print(f"Bar is now {bar}\n")
|
print(f"Bar is now {bar}\n")
|
||||||
return bar
|
return bar
|
||||||
|
|
||||||
def send_request(q, question, event, count, num_requests):
|
def send_request(q, system, question, event, count, num_requests):
|
||||||
|
|
||||||
delay = 0.1
|
delay = 0.1
|
||||||
|
|
||||||
global bar
|
global bar
|
||||||
|
|
||||||
data = {'prompt': question}
|
data = {'system prompt': system, 'prompt': question}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = requests.post(url, headers=headers, json=data)
|
response = requests.post(url, headers=headers, json=data)
|
||||||
if response.status_code in [200,300]:
|
if response.status_code in [200,300]:
|
||||||
print(f"Current Queue Size: {q.qsize()}; processing request {count} / {num_requests}\n")
|
with lockbar:
|
||||||
print(f"Status Code for {question}: {response.status_code}\n")
|
print(f"Current Client Queue Size: {q.qsize()}; processing request {count} / {num_requests}\n")
|
||||||
print(f"Response to {question}:\n")
|
print(f"Status Code for {question}: {response.status_code}\n")
|
||||||
if isinstance(response.text, str):
|
print(f"Response to {question}:\n")
|
||||||
data = json.loads(response.text)
|
if isinstance(response.text, str):
|
||||||
if isinstance(data, dict):
|
data = json.loads(response.text)
|
||||||
print_dict(data)
|
if isinstance(data, dict):
|
||||||
elif isinstance(data, str):
|
print_dict(data)
|
||||||
print(data)
|
elif isinstance(data, str):
|
||||||
else:
|
print(data)
|
||||||
print("\nServer returned data of wrong type.\n")
|
else:
|
||||||
# put the response text in the queue
|
print("\nServer returned data of wrong type.\n")
|
||||||
q.put(response.text)
|
# put the response text in the queue
|
||||||
if not q.empty():
|
q.put(response.text)
|
||||||
print(f"Completed task {count} / {num_requests}\n")
|
if not q.empty():
|
||||||
bar = make_progress_bar(bar, count, num_requests)
|
#with lockbar: # lock automatically releases when the update is done
|
||||||
q.task_done()
|
title_print(f"Completed task {count} / {num_requests}")
|
||||||
|
bar = make_progress_bar(bar, count, num_requests)
|
||||||
|
q.task_done()
|
||||||
elif response.status_code == 429 and not q.empty():
|
elif response.status_code == 429 and not q.empty():
|
||||||
# event.set()
|
event.set()
|
||||||
print("Server return too many requests; back off!! Reset event.")
|
print("Server return too many requests; back off!! Reset event.")
|
||||||
else:
|
else:
|
||||||
print(f"Server responded with code {response.status_code}\n")
|
print(f"Server responded with code {response.status_code}\n")
|
||||||
|
@ -82,11 +90,12 @@ def send_request(q, question, event, count, num_requests):
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
global bar
|
global bar
|
||||||
|
lockbar = threading.Lock()
|
||||||
|
|
||||||
url = "http://localhost:8080/completion"
|
url = "http://localhost:8080/completion"
|
||||||
|
|
||||||
num_requests = 256
|
num_requests = 40
|
||||||
q = Queue(maxsize = 256)
|
q = Queue(maxsize = 64)
|
||||||
threads = []
|
threads = []
|
||||||
|
|
||||||
bar = make_empty_bar(num_requests)
|
bar = make_empty_bar(num_requests)
|
||||||
|
@ -97,6 +106,11 @@ if __name__ == "__main__":
|
||||||
'User-Agent': 'Llamaserver.py'
|
'User-Agent': 'Llamaserver.py'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
system = "You are a helpful and cheerful \
|
||||||
|
assistant who answers questions briefly, \
|
||||||
|
clearly and without undue repetition \
|
||||||
|
paying very close attention to the requirements of the task set."
|
||||||
|
|
||||||
country_list = ["France", "Germany", "China", "USA", "Italy", "India",
|
country_list = ["France", "Germany", "China", "USA", "Italy", "India",
|
||||||
"Ukraine", "Japan", "Australia", "New Zealand", "Indonesia", "Nigeria", "Saudi Arabia",
|
"Ukraine", "Japan", "Australia", "New Zealand", "Indonesia", "Nigeria", "Saudi Arabia",
|
||||||
"Israel", "Egypt", "Kenya", "Chile", "Mexico", "Canada",
|
"Israel", "Egypt", "Kenya", "Chile", "Mexico", "Canada",
|
||||||
|
@ -109,7 +123,7 @@ if __name__ == "__main__":
|
||||||
# NOTE: don't pass the parameter as a function call; pass in args
|
# NOTE: don't pass the parameter as a function call; pass in args
|
||||||
print(f"Processing request {i} / {num_requests}: {question}\n")
|
print(f"Processing request {i} / {num_requests}: {question}\n")
|
||||||
event = threading.Event()
|
event = threading.Event()
|
||||||
t = threading.Thread(target=send_request, args=(q, question, event, i, num_requests))
|
t = threading.Thread(target=send_request, args=(q, system, question, event, i, num_requests))
|
||||||
t.start()
|
t.start()
|
||||||
threads.append(t)
|
threads.append(t)
|
||||||
|
|
||||||
|
|
|
@ -96,7 +96,7 @@
|
||||||
// the value here (8u, 16u, 32u, etc) is what governs max threads at 5126
|
// the value here (8u, 16u, 32u, etc) is what governs max threads at 5126
|
||||||
#ifndef CPPHTTPLIB_THREAD_POOL_COUNT
|
#ifndef CPPHTTPLIB_THREAD_POOL_COUNT
|
||||||
#define CPPHTTPLIB_THREAD_POOL_COUNT \
|
#define CPPHTTPLIB_THREAD_POOL_COUNT \
|
||||||
((std::max)(128u, std::thread::hardware_concurrency() > 0 \
|
((std::max)(32u, std::thread::hardware_concurrency() > 0 \
|
||||||
? std::thread::hardware_concurrency() - 1 \
|
? std::thread::hardware_concurrency() - 1 \
|
||||||
: 0))
|
: 0))
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -30,6 +30,7 @@
|
||||||
#include <condition_variable>
|
#include <condition_variable>
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
#include <iostream> // do we still need this?
|
#include <iostream> // do we still need this?
|
||||||
|
|
||||||
|
@ -350,7 +351,7 @@ static void kvgraphics(std::vector<llama_client_slot>& slots, int cache_size) {
|
||||||
printf("\033[1;0H\033[K**************************\n\033[KKVcache occupancy by slot:\n\033[K**************************\n");
|
printf("\033[1;0H\033[K**************************\n\033[KKVcache occupancy by slot:\n\033[K**************************\n");
|
||||||
|
|
||||||
for(int i=0; i<num_blocks; i++) {
|
for(int i=0; i<num_blocks; i++) {
|
||||||
//printf("\033[K"); // clear the current line
|
printf("\033[K"); // clear the current line
|
||||||
for(int j=0; j < max_length; j++) {
|
for(int j=0; j < max_length; j++) {
|
||||||
int used = slots[i].cache_tokens.size() * max_length / slot_cache_size;
|
int used = slots[i].cache_tokens.size() * max_length / slot_cache_size;
|
||||||
if((j < max_length / 2) && (j < used)) {
|
if((j < max_length / 2) && (j < used)) {
|
||||||
|
@ -382,7 +383,7 @@ static void kvgraphics(std::vector<llama_client_slot>& slots, int cache_size) {
|
||||||
}
|
}
|
||||||
printf(" %4zu/%5zu %2d %s %s %s\n", slots[i].cache_tokens.size(), slot_cache_size, slots[i].id, slot_symbol1.c_str(), slot_symbol2.c_str(), slot_symbol3.c_str());
|
printf(" %4zu/%5zu %2d %s %s %s\n", slots[i].cache_tokens.size(), slot_cache_size, slots[i].id, slot_symbol1.c_str(), slot_symbol2.c_str(), slot_symbol3.c_str());
|
||||||
}
|
}
|
||||||
printf("\n\033[%dJ", num_blocks+5); // move cursor to end of cache display
|
printf("\n\033[%dJ", num_blocks+5); // move cursor to end of cache display
|
||||||
}
|
}
|
||||||
|
|
||||||
struct llama_server_context
|
struct llama_server_context
|
||||||
|
@ -2664,6 +2665,7 @@ int main(int argc, char **argv)
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
ggml_time_init();
|
||||||
|
|
||||||
LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER},
|
LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER},
|
||||||
{"commit", LLAMA_COMMIT}});
|
{"commit", LLAMA_COMMIT}});
|
||||||
|
|
|
@ -291,9 +291,10 @@ struct llama_server_queue {
|
||||||
// Start the main loop. Called from the very end of server.cpp
|
// Start the main loop. Called from the very end of server.cpp
|
||||||
void start_loop() {
|
void start_loop() {
|
||||||
running = true;
|
running = true;
|
||||||
|
//LOG_TEE("In start_loop have new task number %d.\n", id);
|
||||||
while (true) {
|
while (true) {
|
||||||
// new task arrived
|
// new task arrived
|
||||||
LOG_TEE("In start_loop have new task number %d.\n", id);
|
// LOG_TEE("In start_loop have new task number %d.\n", id);
|
||||||
{
|
{
|
||||||
while (true)
|
while (true)
|
||||||
{
|
{
|
||||||
|
@ -377,7 +378,7 @@ struct llama_server_response {
|
||||||
typedef std::function<void(int, int, task_result&)> callback_multitask_t;
|
typedef std::function<void(int, int, task_result&)> callback_multitask_t;
|
||||||
callback_multitask_t callback_update_multitask;
|
callback_multitask_t callback_update_multitask;
|
||||||
// for keeping track of all tasks waiting for the result
|
// for keeping track of all tasks waiting for the result
|
||||||
std::set<int> waiting_task_ids; // so this stores waiting tasks with no obvious limit
|
std::set<int> waiting_task_ids; // this stores waiting tasks with no obvious limit
|
||||||
// the main result queue
|
// the main result queue
|
||||||
std::vector<task_result> queue_results;
|
std::vector<task_result> queue_results;
|
||||||
std::mutex mutex_results;
|
std::mutex mutex_results;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue