From 011ea9852ab76a96c5c728ffbfc662071585b5e4 Mon Sep 17 00:00:00 2001
From: pudepiedj <pudepiedj@gmail.com>
Date: Fri, 23 Feb 2024 07:48:00 +0000
Subject: [PATCH] Minor updates

---
 Llamaserver.py             |  2 +-
 examples/server/server.cpp | 16 +++-------------
 examples/server/utils.hpp  |  7 ++++++-
 3 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/Llamaserver.py b/Llamaserver.py
index 1eddca92f..ea163a605 100644
--- a/Llamaserver.py
+++ b/Llamaserver.py
@@ -94,7 +94,7 @@ if __name__ == "__main__":
     
     url = "http://localhost:8080/completion"
 
-    num_requests = 40
+    num_requests = 20
     q = Queue(maxsize = 64)
     threads = []
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index de1ae75ed..ada0cd182 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1589,7 +1589,7 @@ struct llama_server_context
                 slot.t_last_used = ggml_time_us();
 
                 LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
-                queue_tasks.notify_slot_changed();
+                queue_tasks.notify_slot_changed();  // QUESTION: why isn't the released slot handed a new task right here instead of waiting? Is that what -cb (continuous batching) does?
 
                 continue;
             }
@@ -2254,22 +2254,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         }
         else if (arg == "-skvg" || arg == "--show-graphics")
         {
-            if (i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            llama.skvgraphics = true;
+            llama.skvgraphics = true;       // -skvg takes no parameter so we don't test ++i >= argc
             llama.skvinteract = false;
         }
         else if (arg == "-skvi" || arg == "--show-interactive-graphics")
         {
-            if (i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            llama.skvgraphics = true;
+            llama.skvgraphics = true;       // -skvi takes no parameter so we don't test ++i >= argc
             llama.skvinteract = true;
         }
         else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index c5a4329e7..28dbfe970 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -272,11 +272,14 @@ struct llama_server_queue {
     // Call when the state of one slot is changed
     void notify_slot_changed() {
         // move deferred tasks back to main loop
+        // QUESTION: when ONE slot finishes, ALL deferred tasks are moved back to the main queue - why all of them?
+        // NOTE: tasks are only re-queued here; nothing in this function assigns a task to the slot that was just released
+        // take the lock so that nothing gets added while the deferred queue is being drained
         std::unique_lock<std::mutex> lock(mutex_tasks);
         for (auto & task : queue_tasks_deferred) {
             queue_tasks.push_back(std::move(task));
         }
-        queue_tasks_deferred.clear();
+        queue_tasks_deferred.clear();   // every deferred task was moved into queue_tasks above, so drop the moved-from entries
     }
 
     // end the start_loop routine
@@ -387,12 +390,14 @@ struct llama_server_response {
     void add_waiting_task_id(int task_id) {
         std::unique_lock<std::mutex> lock(mutex_results);
         waiting_task_ids.insert(task_id);
+        printf("\033[21;0H");
         LOG_TEE("Waiting task list size after addition: %zu.\n", waiting_task_ids.size());
     }
 
     void remove_waiting_task_id(int task_id) {
         std::unique_lock<std::mutex> lock(mutex_results);
         waiting_task_ids.erase(task_id);
+        printf("\033[21;0H");
         LOG_TEE("Waiting task list size after removal: %zu.\n", waiting_task_ids.size());
     }