server : update state machine logic to process system prompts
This commit is contained in:
parent
01be4169bf
commit
ee201791a1
1 changed files with 16 additions and 23 deletions
|
@ -454,7 +454,7 @@ struct llama_client_slot
|
||||||
}
|
}
|
||||||
|
|
||||||
void release() {
|
void release() {
|
||||||
if (state == PROCESSING)
|
if (state == IDLE || state == PROCESSING)
|
||||||
{
|
{
|
||||||
t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
|
t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
|
||||||
command = RELEASE;
|
command = RELEASE;
|
||||||
|
@ -894,8 +894,6 @@ struct llama_server_context
|
||||||
{
|
{
|
||||||
slot.release();
|
slot.release();
|
||||||
}
|
}
|
||||||
wait_all_are_idle();
|
|
||||||
all_slots_are_idle = true;
|
|
||||||
|
|
||||||
system_need_update = true;
|
system_need_update = true;
|
||||||
}
|
}
|
||||||
|
@ -911,22 +909,6 @@ struct llama_server_context
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void wait_all_are_idle() {
|
|
||||||
bool wait = true;
|
|
||||||
while (wait)
|
|
||||||
{
|
|
||||||
wait = false;
|
|
||||||
for (auto &slot : slots)
|
|
||||||
{
|
|
||||||
if (!slot.available())
|
|
||||||
{
|
|
||||||
wait = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
|
static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
|
||||||
const stop_type type, llama_client_slot &slot)
|
const stop_type type, llama_client_slot &slot)
|
||||||
{
|
{
|
||||||
|
@ -1433,7 +1415,7 @@ struct llama_server_context
|
||||||
process_tasks();
|
process_tasks();
|
||||||
|
|
||||||
// update the system prompt wait until all slots are idle state
|
// update the system prompt wait until all slots are idle state
|
||||||
if (system_need_update)
|
if (system_need_update && all_slots_are_idle)
|
||||||
{
|
{
|
||||||
LOG_TEE("updating system prompt\n");
|
LOG_TEE("updating system prompt\n");
|
||||||
update_system_prompt();
|
update_system_prompt();
|
||||||
|
@ -1487,7 +1469,7 @@ struct llama_server_context
|
||||||
for (auto & slot : slots)
|
for (auto & slot : slots)
|
||||||
{
|
{
|
||||||
// release the slot
|
// release the slot
|
||||||
if (slot.state == PROCESSING && slot.command == RELEASE)
|
if (slot.command == RELEASE)
|
||||||
{
|
{
|
||||||
slot.state = IDLE;
|
slot.state = IDLE;
|
||||||
slot.command = NONE;
|
slot.command = NONE;
|
||||||
|
@ -1498,7 +1480,7 @@ struct llama_server_context
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (slot.state == IDLE || slot.command == RELEASE)
|
if (slot.state == IDLE)
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -1519,6 +1501,17 @@ struct llama_server_context
|
||||||
{
|
{
|
||||||
for (auto & slot : slots)
|
for (auto & slot : slots)
|
||||||
{
|
{
|
||||||
|
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
|
||||||
|
|
||||||
|
// empty prompt passed -> release the slot and send empty response
|
||||||
|
if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
|
||||||
|
{
|
||||||
|
slot.release();
|
||||||
|
slot.print_timings();
|
||||||
|
send_final_response(slot);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// need process the prompt
|
// need process the prompt
|
||||||
if (slot.state == IDLE && slot.command == LOAD_PROMPT)
|
if (slot.state == IDLE && slot.command == LOAD_PROMPT)
|
||||||
{
|
{
|
||||||
|
@ -1738,8 +1731,8 @@ struct llama_server_context
|
||||||
if (!process_token(result, slot))
|
if (!process_token(result, slot))
|
||||||
{
|
{
|
||||||
slot.release();
|
slot.release();
|
||||||
send_final_response(slot);
|
|
||||||
slot.print_timings();
|
slot.print_timings();
|
||||||
|
send_final_response(slot);
|
||||||
}
|
}
|
||||||
|
|
||||||
slot.i_batch = -1;
|
slot.i_batch = -1;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue