From 011ea9852ab76a96c5c728ffbfc662071585b5e4 Mon Sep 17 00:00:00 2001 From: pudepiedj Date: Fri, 23 Feb 2024 07:48:00 +0000 Subject: [PATCH] Minor updates --- Llamaserver.py | 2 +- examples/server/server.cpp | 16 +++------------- examples/server/utils.hpp | 7 ++++++- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/Llamaserver.py b/Llamaserver.py index 1eddca92f..ea163a605 100644 --- a/Llamaserver.py +++ b/Llamaserver.py @@ -94,7 +94,7 @@ if __name__ == "__main__": url = "http://localhost:8080/completion" - num_requests = 40 + num_requests = 20 q = Queue(maxsize = 64) threads = [] diff --git a/examples/server/server.cpp b/examples/server/server.cpp index de1ae75ed..ada0cd182 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1589,7 +1589,7 @@ struct llama_server_context slot.t_last_used = ggml_time_us(); LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size()); - queue_tasks.notify_slot_changed(); + queue_tasks.notify_slot_changed(); // why don't we immediately reallocate the released slot without waiting? Is this what -cb does? continue; } @@ -2254,22 +2254,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } else if (arg == "-skvg" || arg == "--show-graphics") { - if (i >= argc) - { - invalid_param = true; - break; - } - llama.skvgraphics = true; + llama.skvgraphics = true; // -skvg takes no parameter so we don't test ++i >= argc llama.skvinteract = false; } else if (arg == "-skvi" || arg == "--show-interactive-graphics") { - if (i >= argc) - { - invalid_param = true; - break; - } - llama.skvgraphics = true; + llama.skvgraphics = true; // -skvi takes no parameter so we don't test ++i >= argc llama.skvinteract = true; } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index c5a4329e7..28dbfe970 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -272,11 +272,14 @@ struct llama_server_queue { // Call when the state of one slot is changed void notify_slot_changed() { // move deferred tasks back to main loop + // does this mean when ONE slot finished we move ALL deferred tasks back to the main queue? Why? + // it seems that we move everything back to the main queue but we don't allocate a task to the slot just released + // lock so nothing gets added while we are clearing the deferred queue std::unique_lock lock(mutex_tasks); for (auto & task : queue_tasks_deferred) { queue_tasks.push_back(std::move(task)); } - queue_tasks_deferred.clear(); + queue_tasks_deferred.clear(); // and clear the deferred tasks completely? } // end the start_loop routine @@ -387,12 +390,14 @@ struct llama_server_response { void add_waiting_task_id(int task_id) { std::unique_lock lock(mutex_results); waiting_task_ids.insert(task_id); + printf("\033[21;0H"); LOG_TEE("Waiting task list size after addition: %zu.\n", waiting_task_ids.size()); } void remove_waiting_task_id(int task_id) { std::unique_lock lock(mutex_results); waiting_task_ids.erase(task_id); + printf("\033[21;0H"); LOG_TEE("Waiting task list size after removal: %zu.\n", waiting_task_ids.size()); }