server : fix slot reuse

2023-10-22 21:57:23 +03:00 · 2023-10-22 21:57:23 +03:00 · 83e1490187
commit 83e1490187
parent 8fe7ca4875
1 changed files with 23 additions and 17 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -161,7 +161,6 @@ struct task_result {
 enum slot_state
 {
    IDLE,
-    SLEEPING,
    PROCESSING,
 };

@ -347,6 +346,9 @@ struct llama_client_slot
    slot_state state = IDLE;
    slot_command command = NONE;

+    // used to determine the slot that has been used the longest
+    int64_t t_last_used = -1;
+
    // generation props
    int32_t n_ctx       = 0;  // context size per slot
    int32_t n_past      = 0;
@ -435,7 +437,7 @@ struct llama_client_slot
    }

    bool is_processing() const {
-        return ((state == IDLE || state == SLEEPING) && command == LOAD_PROMPT) || state == PROCESSING;
+        return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING;
    }

    void add_token_string(const completion_token_output &token) {
@ -643,14 +645,24 @@ struct llama_server_context
    }

    llama_client_slot* get_slot(int id) {
+        int64_t t_last = ggml_time_us();
+        llama_client_slot *last_used = nullptr;
+
        for (llama_client_slot & slot : slots)
        {
-            if ((id == -1 && slot.available()) || slot.id == id)
+            if (slot.id == id && slot.available())
            {
                return &slot;
            }
+
+            if (slot.available() && slot.t_last_used < t_last)
+            {
+                last_used = &slot;
+                t_last = slot.t_last_used;
+            }
        }
-        return nullptr;
+
+        return last_used;
    }

    bool launch_slot_with_data(llama_client_slot* &slot, json data) {
@ -1484,22 +1496,16 @@ struct llama_server_context
            // release the slot
            if (slot.state == PROCESSING && slot.command == RELEASE)
            {
-                slot.state = slot.params.cache_prompt ? SLEEPING : IDLE;
-                if (slot.state == SLEEPING) {
-                    LOG_TEE("slot %i has %i tokens in cache.\n", slot.id, (int) slot.cache_tokens.size());
-                }
-                else
-                {
-                    LOG_TEE("slot %i released\n", slot.id);
-                }
+                slot.state = IDLE;
                slot.command = NONE;
+                slot.t_last_used = ggml_time_us();
+
+                LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+
                continue;
            }

-            if (
-                slot.state == IDLE ||
-                slot.state == SLEEPING ||
-                slot.command == RELEASE)
+            if (slot.state == IDLE || slot.command == RELEASE)
            {
                continue;
            }
@ -1521,7 +1527,7 @@ struct llama_server_context
            for (auto & slot : slots)
            {
                // need process the prompt
-                if ((slot.state == IDLE || slot.state == SLEEPING) && slot.command == LOAD_PROMPT)
+                if (slot.state == IDLE && slot.command == LOAD_PROMPT)
                {
                    slot.state = PROCESSING;
                    slot.command = NONE;