server : update state machine logic to process system prompts

This commit is contained in:
Georgi Gerganov 2023-10-24 22:36:47 +03:00
parent 01be4169bf
commit ee201791a1
No known key found for this signature in database
GPG key ID: 449E073F9DC10735

View file

@ -454,7 +454,7 @@ struct llama_client_slot
}
void release() {
if (state == PROCESSING)
if (state == IDLE || state == PROCESSING)
{
t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
command = RELEASE;
@ -894,8 +894,6 @@ struct llama_server_context
{
slot.release();
}
wait_all_are_idle();
all_slots_are_idle = true;
system_need_update = true;
}
@ -911,22 +909,6 @@ struct llama_server_context
}
}
// Blocks until every slot in `slots` reports available().
// Each pass scans the slots in order and stops at the first one that is not
// available, then starts a new pass; the function returns only after a full
// pass in which no unavailable slot was found (immediately if `slots` is empty).
// NOTE(review): the loop spins with no sleep or yield, and nothing in this
// function changes slot state - it presumably relies on another thread to
// make the slots available; confirm, otherwise this never returns.
void wait_all_are_idle() {
bool wait = true;
while (wait)
{
wait = false; // assume all slots are available until one proves otherwise
for (auto &slot : slots)
{
if (!slot.available())
{
wait = true; // found a busy slot: abandon this pass and rescan
break;
}
}
}
}
static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
const stop_type type, llama_client_slot &slot)
{
@ -1433,7 +1415,7 @@ struct llama_server_context
process_tasks();
// update the system prompt, but only once all slots are idle
if (system_need_update)
if (system_need_update && all_slots_are_idle)
{
LOG_TEE("updating system prompt\n");
update_system_prompt();
@ -1487,7 +1469,7 @@ struct llama_server_context
for (auto & slot : slots)
{
// release the slot
if (slot.state == PROCESSING && slot.command == RELEASE)
if (slot.command == RELEASE)
{
slot.state = IDLE;
slot.command = NONE;
@ -1498,7 +1480,7 @@ struct llama_server_context
continue;
}
if (slot.state == IDLE || slot.command == RELEASE)
if (slot.state == IDLE)
{
continue;
}
@ -1519,6 +1501,17 @@ struct llama_server_context
{
for (auto & slot : slots)
{
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
// empty prompt passed -> release the slot and send empty response
if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
{
slot.release();
slot.print_timings();
send_final_response(slot);
continue;
}
// need to process the prompt
if (slot.state == IDLE && slot.command == LOAD_PROMPT)
{
@ -1738,8 +1731,8 @@ struct llama_server_context
if (!process_token(result, slot))
{
slot.release();
send_final_response(slot);
slot.print_timings();
send_final_response(slot);
}
slot.i_batch = -1;