diff --git a/examples/server/server.cpp b/examples/server/server.cpp index daf8d4243..9aef21dd3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -161,7 +161,6 @@ struct task_result { enum slot_state { IDLE, - SLEEPING, PROCESSING, }; @@ -347,6 +346,9 @@ struct llama_client_slot slot_state state = IDLE; slot_command command = NONE; + // used to determine the slot that has been used the longest + int64_t t_last_used = -1; + // generation props int32_t n_ctx = 0; // context size per slot int32_t n_past = 0; @@ -435,7 +437,7 @@ struct llama_client_slot } bool is_processing() const { - return ((state == IDLE || state == SLEEPING) && command == LOAD_PROMPT) || state == PROCESSING; + return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING; } void add_token_string(const completion_token_output &token) { @@ -643,14 +645,24 @@ struct llama_server_context } llama_client_slot* get_slot(int id) { + int64_t t_last = ggml_time_us(); + llama_client_slot *last_used = nullptr; + for (llama_client_slot & slot : slots) { - if ((id == -1 && slot.available()) || slot.id == id) + if (slot.id == id && slot.available()) { return &slot; } + + if (slot.available() && slot.t_last_used < t_last) + { + last_used = &slot; + t_last = slot.t_last_used; + } } - return nullptr; + + return last_used; } bool launch_slot_with_data(llama_client_slot* &slot, json data) { @@ -1484,22 +1496,16 @@ struct llama_server_context // release the slot if (slot.state == PROCESSING && slot.command == RELEASE) { - slot.state = slot.params.cache_prompt ? SLEEPING : IDLE; - if (slot.state == SLEEPING) { - LOG_TEE("slot %i has %i tokens in cache.\n", slot.id, (int) slot.cache_tokens.size()); - } - else - { - LOG_TEE("slot %i released\n", slot.id); - } + slot.state = IDLE; slot.command = NONE; + slot.t_last_used = ggml_time_us(); + + LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size()); + continue; } - if ( - slot.state == IDLE || - slot.state == SLEEPING || - slot.command == RELEASE) + if (slot.state == IDLE || slot.command == RELEASE) { continue; } @@ -1521,7 +1527,7 @@ struct llama_server_context for (auto & slot : slots) { // need process the prompt - if ((slot.state == IDLE || slot.state == SLEEPING) && slot.command == LOAD_PROMPT) + if (slot.state == IDLE && slot.command == LOAD_PROMPT) { slot.state = PROCESSING; slot.command = NONE;