server: only add back deferred tasks when one slot is available

This commit is contained in:
ngxson 2024-01-22 23:06:12 +01:00
parent 1bd867894d
commit d083c81761
2 changed files with 12 additions and 10 deletions

View file

@ -1383,6 +1383,7 @@ struct llama_server_context
slot.t_last_used = ggml_time_us(); slot.t_last_used = ggml_time_us();
LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size()); LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
queue_tasks.notify_slot_changed();
continue; continue;
} }

View file

@ -209,7 +209,7 @@ struct llama_server_queue {
return task.id; return task.id;
} }
// Add a new task, but defer until the next loop // Add a new task, but defer until one slot is available
void defer(task_server task) { void defer(task_server task) {
std::unique_lock<std::mutex> lock(mutex_tasks); std::unique_lock<std::mutex> lock(mutex_tasks);
queue_tasks_deferred.push_back(std::move(task)); queue_tasks_deferred.push_back(std::move(task));
@ -236,6 +236,16 @@ struct llama_server_queue {
callback_all_task_finished = callback; callback_all_task_finished = callback;
} }
// Call this whenever the state of a slot changes.
// While holding mutex_tasks, every task parked in queue_tasks_deferred is
// moved, in order, onto the back of queue_tasks, and the deferred queue is
// then emptied.
void notify_slot_changed() {
    // The guard keeps mutex_tasks locked for the whole hand-over and
    // releases it when the function returns.
    std::lock_guard<std::mutex> guard(mutex_tasks);
    for (auto it = queue_tasks_deferred.begin(); it != queue_tasks_deferred.end(); ++it) {
        queue_tasks.push_back(std::move(*it));
    }
    // Drop the moved-from entries so they are not re-queued next time.
    queue_tasks_deferred.clear();
}
// Start the main loop. This call is blocking // Start the main loop. This call is blocking
void start_loop() { void start_loop() {
while (true) { while (true) {
@ -255,15 +265,6 @@ struct llama_server_queue {
LOG_VERBOSE("callback_new_task", {}); LOG_VERBOSE("callback_new_task", {});
callback_new_task(task); callback_new_task(task);
} }
// move deferred tasks back to main loop
{
std::unique_lock<std::mutex> lock(mutex_tasks);
for (auto & task : queue_tasks_deferred) {
queue_tasks.push_back(std::move(task));
}
queue_tasks_deferred.clear();
lock.unlock();
}
LOG_VERBOSE("callback_all_task_finished", {}); LOG_VERBOSE("callback_all_task_finished", {});
// process and update all the multitasks // process and update all the multitasks
auto queue_iterator = queue_multitasks.begin(); auto queue_iterator = queue_multitasks.begin();