diff --git a/Llamaserver.py b/Llamaserver.py
index 629987ac9..d9646dd95 100644
--- a/Llamaserver.py
+++ b/Llamaserver.py
@@ -17,10 +17,16 @@ def print_dict(data):
             print_dict(entry)
     elif isinstance(data, str):
         print(f"Incoming string is {data}.\n")
+    else:
+        print("No intelligible data received.\n")
     return
 
-def print_response(text):
+def title_print(text):
+
+    length = len(text)
+    print("\n" + "*" * length)
     print(text)
+    print("*" * length + "\n")
 
 def make_empty_bar(num_requests):
     bar = []
@@ -28,7 +34,7 @@ def make_empty_bar(num_requests):
         bar.append("\u2589")
     bar = ' '.join(bar)
     bar = bar.replace(' ','')
-    print(f"Bar is now {bar}.\n")
+    # print(f"Bar is now {bar}.\n")
     return bar
 
 def make_progress_bar(bar, count, num_requests):
@@ -38,39 +44,41 @@
         if i == count:
             # print(f"Bar position {i} is {bar[i]}\n")
             bar = bar[:i*stride1] + "\u23F1" + bar[i*stride1 + stride2:]
-            print(f"Bar is now {bar}\n")
-            return bar
+    print(f"Bar is now {bar}\n")
+    return bar
 
-def send_request(q, question, event, count, num_requests):
+def send_request(q, system, question, event, count, num_requests):
     delay = 0.1
     global bar
-    data = {'prompt': question}
+    data = {'system prompt': system, 'prompt': question}
     try:
         response = requests.post(url, headers=headers, json=data)
         if response.status_code in [200,300]:
-            print(f"Current Queue Size: {q.qsize()}; processing request {count} / {num_requests}\n")
-            print(f"Status Code for {question}: {response.status_code}\n")
-            print(f"Response to {question}:\n")
-            if isinstance(response.text, str):
-                data = json.loads(response.text)
-            if isinstance(data, dict):
-                print_dict(data)
-            elif isinstance(data, str):
-                print(data)
-            else:
-                print("\nServer returned data of wrong type.\n")
-            # put the response text in the queue
-            q.put(response.text)
-            if not q.empty():
-                print(f"Completed task {count} / {num_requests}\n")
-                bar = make_progress_bar(bar, count, num_requests)
-                q.task_done()
+            with lockbar:
+                print(f"Current Client Queue Size: {q.qsize()}; processing request {count} / {num_requests}\n")
+                print(f"Status Code for {question}: {response.status_code}\n")
+                print(f"Response to {question}:\n")
+                if isinstance(response.text, str):
+                    data = json.loads(response.text)
+                if isinstance(data, dict):
+                    print_dict(data)
+                elif isinstance(data, str):
+                    print(data)
+                else:
+                    print("\nServer returned data of wrong type.\n")
+                # put the response text in the queue
+                q.put(response.text)
+                if not q.empty():
+                    #with lockbar: # lock automatically releases when the update is done
+                    title_print(f"Completed task {count} / {num_requests}")
+                    bar = make_progress_bar(bar, count, num_requests)
+                    q.task_done()
         elif response.status_code == 429 and not q.empty():
-            # event.set()
+            event.set()
             print("Server return too many requests; back off!! Reset event.")
         else:
             print(f"Server responded with code {response.status_code}\n")
@@ -82,11 +90,12 @@
 
 if __name__ == "__main__":
 
     global bar
+    lockbar = threading.Lock()
 
     url = "http://localhost:8080/completion"
-    num_requests = 256
-    q = Queue(maxsize = 256)
+    num_requests = 40
+    q = Queue(maxsize = 64)
     threads = []
     bar = make_empty_bar(num_requests)
@@ -97,6 +106,11 @@ if __name__ == "__main__":
         'User-Agent': 'Llamaserver.py'
     }
 
+    system = "You are a helpful and cheerful \
+assistant who answers questions briefly, \
+clearly and without undue repetition \
+paying very close attention to the requirements of the task set."
+
     country_list = ["France", "Germany", "China", "USA",
         "Italy", "India", "Ukraine", "Japan", "Australia",
         "New Zealand", "Indonesia", "Nigeria", "Saudi Arabia",
         "Israel", "Egypt", "Kenya", "Chile", "Mexico", "Canada",
@@ -109,7 +123,7 @@ if __name__ == "__main__":
         # NOTE: don't pass the parameter as a function call; pass in args
         print(f"Processing request {i} / {num_requests}: {question}\n")
         event = threading.Event()
-        t = threading.Thread(target=send_request, args=(q, question, event, i, num_requests))
+        t = threading.Thread(target=send_request, args=(q, system, question, event, i, num_requests))
         t.start()
         threads.append(t)
 
diff --git a/examples/server/httplib.h b/examples/server/httplib.h
index 12cdf97eb..2d763fa40 100644
--- a/examples/server/httplib.h
+++ b/examples/server/httplib.h
@@ -96,7 +96,7 @@
 // the value here (8u, 16u, 32u, etc) is what governs max threads at 5126
 #ifndef CPPHTTPLIB_THREAD_POOL_COUNT
 #define CPPHTTPLIB_THREAD_POOL_COUNT \
-  ((std::max)(128u, std::thread::hardware_concurrency() > 0 \
+  ((std::max)(32u, std::thread::hardware_concurrency() > 0 \
                         ? std::thread::hardware_concurrency() - 1 \
                         : 0))
 #endif
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index ea7d649ca..00df17353 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include  // do we still need this?
@@ -350,7 +351,7 @@ static void kvgraphics(std::vector& slots, int cache_size) {
     printf("\033[1;0H\033[K**************************\n\033[KKVcache occupancy by slot:\n\033[K**************************\n");
     for(int i=0; i& slots, int cache_size) {
         }
         printf("    %4zu/%5zu  %2d  %s %s %s\n", slots[i].cache_tokens.size(), slot_cache_size, slots[i].id, slot_symbol1.c_str(), slot_symbol2.c_str(), slot_symbol3.c_str());
     }
-    printf("\n\033[%dJ", num_blocks+5);    // move cursor to end of cache display
+    printf("\n\033[%dJ", num_blocks+5);  // move cursor to end of cache display
 }
 
 struct llama_server_context
@@ -2664,6 +2665,7 @@ int main(int argc, char **argv)
 
     llama_backend_init();
     llama_numa_init(params.numa);
+    ggml_time_init();
 
     LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER}, {"commit", LLAMA_COMMIT}});
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index ca844b2ba..c5a4329e7 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -291,9 +291,10 @@ struct llama_server_queue {
     // Start the main loop. Called from the very end of server.cpp
     void start_loop() {
         running = true;
+        //LOG_TEE("In start_loop have new task number %d.\n", id);
         while (true) {
             // new task arrived
-            LOG_TEE("In start_loop have new task number %d.\n", id);
+            // LOG_TEE("In start_loop have new task number %d.\n", id);
             {
                 while (true) {
@@ -377,7 +378,7 @@ struct llama_server_response {
     typedef std::function callback_multitask_t;
     callback_multitask_t callback_update_multitask;
     // for keeping track of all tasks waiting for the result
-    std::set waiting_task_ids; // so this stores waiting tasks with no obvious limit
+    std::set waiting_task_ids; // this stores waiting tasks with no obvious limit
     // the main result queue
     std::vector queue_results;
     std::mutex mutex_results;
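
Note on the client-side pattern introduced above, with a minimal sketch. The Llamaserver.py changes combine three pieces: a shared threading.Lock (lockbar) held around a whole group of prints so one task's multi-line output is not interleaved with another thread's, a threading.Event set when the server answers HTTP 429 so the sender can back off, and a Queue collecting response bodies. The sketch below is illustrative only, not part of the patch: the endpoint URL and the "prompt" payload key follow the patch, but the single shared backoff event and the fixed one-second delay are simplifications (the patch creates one Event per request and leaves the back-off policy open).

import threading
import time
from queue import Queue

import requests

URL = "http://localhost:8080/completion"   # assumed llama.cpp server endpoint

lockbar = threading.Lock()    # serializes multi-line console output
backoff = threading.Event()   # set by a worker when the server returns 429

def send_request(q, question, count, total):
    response = requests.post(URL, json={"prompt": question})
    if response.status_code == 200:
        q.put(response.text)
        with lockbar:   # hold the lock so this task's lines print contiguously
            print(f"Completed task {count} / {total}")
            print(response.text)
    elif response.status_code == 429:
        backoff.set()   # ask the dispatch loop to pause before sending more

if __name__ == "__main__":
    questions = [f"Question {i}" for i in range(1, 9)]
    results = Queue(maxsize=64)
    threads = []
    for i, question in enumerate(questions, start=1):
        if backoff.is_set():
            time.sleep(1.0)   # fixed back-off; an exponential policy also works
            backoff.clear()
        t = threading.Thread(target=send_request,
                             args=(results, question, i, len(questions)))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    print(f"Collected {results.qsize()} responses.")

Holding the lock around the whole print sequence, rather than around each individual print, is what keeps a task's status lines contiguous; the with statement releases the lock even if a print raises.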