server : fix slot reuse
This commit is contained in:
parent
8fe7ca4875
commit
83e1490187
1 changed file with 23 additions and 17 deletions
|
@ -161,7 +161,6 @@ struct task_result {
|
||||||
enum slot_state
|
enum slot_state
|
||||||
{
|
{
|
||||||
IDLE,
|
IDLE,
|
||||||
SLEEPING,
|
|
||||||
PROCESSING,
|
PROCESSING,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -347,6 +346,9 @@ struct llama_client_slot
|
||||||
slot_state state = IDLE;
|
slot_state state = IDLE;
|
||||||
slot_command command = NONE;
|
slot_command command = NONE;
|
||||||
|
|
||||||
|
// used to determine the slot that has been used the longest
|
||||||
|
int64_t t_last_used = -1;
|
||||||
|
|
||||||
// generation props
|
// generation props
|
||||||
int32_t n_ctx = 0; // context size per slot
|
int32_t n_ctx = 0; // context size per slot
|
||||||
int32_t n_past = 0;
|
int32_t n_past = 0;
|
||||||
|
@ -435,7 +437,7 @@ struct llama_client_slot
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_processing() const {
|
bool is_processing() const {
|
||||||
return ((state == IDLE || state == SLEEPING) && command == LOAD_PROMPT) || state == PROCESSING;
|
return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING;
|
||||||
}
|
}
|
||||||
|
|
||||||
void add_token_string(const completion_token_output &token) {
|
void add_token_string(const completion_token_output &token) {
|
||||||
|
@ -643,14 +645,24 @@ struct llama_server_context
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_client_slot* get_slot(int id) {
|
llama_client_slot* get_slot(int id) {
|
||||||
|
int64_t t_last = ggml_time_us();
|
||||||
|
llama_client_slot *last_used = nullptr;
|
||||||
|
|
||||||
for (llama_client_slot & slot : slots)
|
for (llama_client_slot & slot : slots)
|
||||||
{
|
{
|
||||||
if ((id == -1 && slot.available()) || slot.id == id)
|
if (slot.id == id && slot.available())
|
||||||
{
|
{
|
||||||
return &slot;
|
return &slot;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (slot.available() && slot.t_last_used < t_last)
|
||||||
|
{
|
||||||
|
last_used = &slot;
|
||||||
|
t_last = slot.t_last_used;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return nullptr;
|
|
||||||
|
return last_used;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool launch_slot_with_data(llama_client_slot* &slot, json data) {
|
bool launch_slot_with_data(llama_client_slot* &slot, json data) {
|
||||||
|
@ -1484,22 +1496,16 @@ struct llama_server_context
|
||||||
// release the slot
|
// release the slot
|
||||||
if (slot.state == PROCESSING && slot.command == RELEASE)
|
if (slot.state == PROCESSING && slot.command == RELEASE)
|
||||||
{
|
{
|
||||||
slot.state = slot.params.cache_prompt ? SLEEPING : IDLE;
|
slot.state = IDLE;
|
||||||
if (slot.state == SLEEPING) {
|
|
||||||
LOG_TEE("slot %i has %i tokens in cache.\n", slot.id, (int) slot.cache_tokens.size());
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
LOG_TEE("slot %i released\n", slot.id);
|
|
||||||
}
|
|
||||||
slot.command = NONE;
|
slot.command = NONE;
|
||||||
|
slot.t_last_used = ggml_time_us();
|
||||||
|
|
||||||
|
LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (
|
if (slot.state == IDLE || slot.command == RELEASE)
|
||||||
slot.state == IDLE ||
|
|
||||||
slot.state == SLEEPING ||
|
|
||||||
slot.command == RELEASE)
|
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -1521,7 +1527,7 @@ struct llama_server_context
|
||||||
for (auto & slot : slots)
|
for (auto & slot : slots)
|
||||||
{
|
{
|
||||||
// need process the prompt
|
// need process the prompt
|
||||||
if ((slot.state == IDLE || slot.state == SLEEPING) && slot.command == LOAD_PROMPT)
|
if (slot.state == IDLE && slot.command == LOAD_PROMPT)
|
||||||
{
|
{
|
||||||
slot.state = PROCESSING;
|
slot.state = PROCESSING;
|
||||||
slot.command = NONE;
|
slot.command = NONE;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue