Minor updates

2024-02-23 07:48:00 +00:00 · 2024-02-23 07:48:00 +00:00 · 011ea9852a
commit 011ea9852a
parent 9c99ef43d7
3 changed files with 10 additions and 15 deletions
--- a/Llamaserver.py
+++ b/Llamaserver.py
@@ -94,7 +94,7 @@ if __name__ == "__main__":
    url = "http://localhost:8080/completion"
-    num_requests = 40
+    num_requests = 20
    q = Queue(maxsize = 64)
    threads = []
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1589,7 +1589,7 @@ struct llama_server_context
                slot.t_last_used = ggml_time_us();
                LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
-                queue_tasks.notify_slot_changed();
+                queue_tasks.notify_slot_changed();  // why don't we immediately reallocate the released slot without waiting? Is this what -cb does?
                continue;
            }
@@ -2254,22 +2254,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        }
        else if (arg == "-skvg" || arg == "--show-graphics")
        {
-            if (i >= argc)
+            llama.skvgraphics = true;       // -skvg takes no parameter so we don't test ++i >= argc
-            {
-                invalid_param = true;
-                break;
-            }
-            llama.skvgraphics = true;
            llama.skvinteract = false;
        }
        else if (arg == "-skvi" || arg == "--show-interactive-graphics")
        {
-            if (i >= argc)
+            llama.skvgraphics = true;       // -skvi takes no parameter so we don't test ++i >= argc
-            {
-                invalid_param = true;
-                break;
-            }
-            llama.skvgraphics = true;
            llama.skvinteract = true;
        }
        else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -272,11 +272,14 @@ struct llama_server_queue {
    // Call when the state of one slot is changed
    // Takes mutex_tasks, appends every entry of queue_tasks_deferred to the back
    // of queue_tasks (preserving their order), then empties queue_tasks_deferred.
    void notify_slot_changed() {
        // move deferred tasks back to main loop
        // does this mean when ONE slot finished we move ALL deferred tasks back to the main queue? Why?
        //   -> as written, yes: the loop below re-queues the whole deferred queue on every call, not a single task
        // it seems that we move everything back to the main queue but we don't allocate a task to the slot just released
        //   -> correct for this function: nothing here assigns a task to a slot, it only re-queues
        // lock so nothing gets added while we are clearing the deferred queue
        std::unique_lock<std::mutex> lock(mutex_tasks);
        for (auto & task : queue_tasks_deferred) {
            // each task is moved (not copied) into queue_tasks, leaving a moved-from entry behind
            queue_tasks.push_back(std::move(task));
        }
        // the entries left in queue_tasks_deferred are moved-from, so clear() only discards those leftovers
-        queue_tasks_deferred.clear();
+        queue_tasks_deferred.clear();   // and clear the deferred tasks completely?
    }
    // end the start_loop routine
@@ -387,12 +390,14 @@ struct llama_server_response {
    // Inserts task_id into waiting_task_ids while holding mutex_results, then
    // logs the container's size after the insertion.
    void add_waiting_task_id(int task_id) {
        // the lock is held until the function returns, so the size logged below is read under the lock
        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.insert(task_id);
        // "\033[21;0H" is the ANSI cursor-position escape (row 21); presumably used to pin the
        // following message to a fixed line of the terminal display -- TODO confirm
        printf("\033[21;0H");
        LOG_TEE("Waiting task list size after addition: %zu.\n", waiting_task_ids.size());
    }
    // Erases task_id from waiting_task_ids while holding mutex_results, then
    // logs the container's size after the removal.
    void remove_waiting_task_id(int task_id) {
        // the lock is held until the function returns, so the size logged below is read under the lock
        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.erase(task_id);
        // "\033[21;0H" is the ANSI cursor-position escape (row 21); presumably used to pin the
        // following message to a fixed line of the terminal display -- TODO confirm
        printf("\033[21;0H");
        LOG_TEE("Waiting task list size after removal: %zu.\n", waiting_task_ids.size());
    }