server : update state machine logic to process system prompts
This commit is contained in:
parent 01be4169bf
commit ee201791a1

1 changed file with 16 additions and 23 deletions
@@ -454,7 +454,7 @@ struct llama_client_slot
     }

     void release() {
-        if (state == PROCESSING)
+        if (state == IDLE || state == PROCESSING)
         {
             t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
             command = RELEASE;
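For orientation, a minimal sketch of the slot state machine this hunk touches. The enum values and member names (state, command, release(), available()) appear in the diff; the struct layout and the body of available() are assumptions consistent with how the removed wait loop uses it, not the real llama_client_slot:

// Sketch only: enum values and member names are taken from the diff,
// everything else is a simplified stand-in for llama_client_slot.
enum slot_state   { IDLE, PROCESSING };
enum slot_command { NONE, LOAD_PROMPT, RELEASE };

struct slot_sketch {
    slot_state   state   = IDLE;
    slot_command command = NONE;

    // After this commit an IDLE slot can be released too; release()
    // only *requests* the transition (command = RELEASE), and the
    // main update loop later performs it (state = IDLE, command = NONE).
    void release() {
        if (state == IDLE || state == PROCESSING) {
            command = RELEASE;
        }
    }

    // Assumed: a slot is free for new work only when fully settled.
    bool available() const {
        return state == IDLE && command == NONE;
    }
};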
@@ -894,8 +894,6 @@ struct llama_server_context
         {
             slot.release();
         }
-        wait_all_are_idle();
-        all_slots_are_idle = true;

         system_need_update = true;
     }
@@ -911,22 +909,6 @@ struct llama_server_context
         }
     }

-    void wait_all_are_idle() {
-        bool wait = true;
-        while (wait)
-        {
-            wait = false;
-            for (auto &slot : slots)
-            {
-                if (!slot.available())
-                {
-                    wait = true;
-                    break;
-                }
-            }
-        }
-    }
-
     static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
                                         const stop_type type, llama_client_slot &slot)
     {
@@ -1433,7 +1415,7 @@ struct llama_server_context
         process_tasks();

         // update the system prompt wait until all slots are idle state
-        if (system_need_update)
+        if (system_need_update && all_slots_are_idle)
         {
            LOG_TEE("updating system prompt\n");
            update_system_prompt();
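The removed wait_all_are_idle() busy-waited on the caller's thread. This hunk replaces that with a flag hand-off: the request side just releases the slots and raises system_need_update, and the check above applies the update on a later main-loop iteration, once all_slots_are_idle holds. A compressed sketch of that hand-off; the flag names and update_system_prompt() come from the diff, while the surrounding structure and the request_system_prompt_update name are illustrative:

// Sketch of the flag hand-off replacing the busy-wait (flag names and
// update_system_prompt() from the diff; the rest is illustrative).
struct server_sketch {
    bool system_need_update = false;
    bool all_slots_are_idle = false;

    void update_system_prompt() {
        // rewrite the shared system-prompt region of the KV cache;
        // safe here because no slot is mid-generation
        system_need_update = false;
    }

    // request side: no blocking wait, just raise the flag
    void request_system_prompt_update() {   // hypothetical name
        // for (auto & slot : slots) slot.release();
        system_need_update = true;
    }

    // main loop side, run every iteration
    void update_slots() {
        // ... all_slots_are_idle recomputed from the slot states ...
        if (system_need_update && all_slots_are_idle) {
            update_system_prompt();
        }
        // ... batching and decoding for busy slots ...
    }
};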
@@ -1487,7 +1469,7 @@ struct llama_server_context
         for (auto & slot : slots)
         {
             // release the slot
-            if (slot.state == PROCESSING && slot.command == RELEASE)
+            if (slot.command == RELEASE)
             {
                 slot.state = IDLE;
                 slot.command = NONE;
@@ -1498,7 +1480,7 @@ struct llama_server_context
                 continue;
             }

-            if (slot.state == IDLE || slot.command == RELEASE)
+            if (slot.state == IDLE)
             {
                 continue;
             }
@@ -1519,6 +1501,17 @@ struct llama_server_context
     {
         for (auto & slot : slots)
         {
+            const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
+
+            // empty prompt passed -> release the slot and send empty response
+            if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
+            {
+                slot.release();
+                slot.print_timings();
+                send_final_response(slot);
+                continue;
+            }
+
             // need process the prompt
             if (slot.state == IDLE && slot.command == LOAD_PROMPT)
             {
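The new has_prompt guard treats a prompt as present when it is a token array or a non-empty string; an empty or missing prompt now releases the slot and sends an empty response instead of entering the prompt-processing path. A standalone check of that predicate, assuming the prompt field is a nlohmann::json value as elsewhere in server.cpp:

// Standalone check of the has_prompt predicate added above, assuming
// the prompt field is a nlohmann::json value (string, array, or null).
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

static bool has_prompt(const json & prompt) {
    return prompt.is_array() ||
           (prompt.is_string() && !prompt.get<std::string>().empty());
}

int main() {
    std::cout << has_prompt(json::array({12, 34})) << '\n'; // 1: token ids
    std::cout << has_prompt("Hello world")         << '\n'; // 1: text prompt
    std::cout << has_prompt("")                    << '\n'; // 0: slot released,
    std::cout << has_prompt(nullptr)               << '\n'; // 0: empty response
    return 0;
}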
@@ -1738,8 +1731,8 @@ struct llama_server_context
                 if (!process_token(result, slot))
                 {
                     slot.release();
-                    send_final_response(slot);
                     slot.print_timings();
+                    send_final_response(slot);
                 }

                 slot.i_batch = -1;