Minor updates

This commit is contained in:
pudepiedj 2024-02-23 07:48:00 +00:00
parent 9c99ef43d7
commit 011ea9852a
3 changed files with 10 additions and 15 deletions

View file

@ -94,7 +94,7 @@ if __name__ == "__main__":
url = "http://localhost:8080/completion"
num_requests = 40
num_requests = 20
q = Queue(maxsize = 64)
threads = []

View file

@ -1589,7 +1589,7 @@ struct llama_server_context
slot.t_last_used = ggml_time_us();
LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
queue_tasks.notify_slot_changed();
queue_tasks.notify_slot_changed(); // why don't we immediately reallocate the released slot without waiting? Is this what -cb does?
continue;
}
@ -2254,22 +2254,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
else if (arg == "-skvg" || arg == "--show-graphics")
{
if (i >= argc)
{
invalid_param = true;
break;
}
llama.skvgraphics = true;
llama.skvgraphics = true; // -skvg takes no parameter so we don't test ++i >= argc
llama.skvinteract = false;
}
else if (arg == "-skvi" || arg == "--show-interactive-graphics")
{
if (i >= argc)
{
invalid_param = true;
break;
}
llama.skvgraphics = true;
llama.skvgraphics = true; // -skvi takes no parameter so we don't test ++i >= argc
llama.skvinteract = true;
}
else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")

View file

@ -272,11 +272,14 @@ struct llama_server_queue {
// Call when the state of one slot is changed
void notify_slot_changed() {
// Re-queue the deferred work: every entry of queue_tasks_deferred is moved onto
// the back of queue_tasks, and the deferred list is then emptied.
// Open question kept from the original author: ALL deferred tasks are moved back
// as soon as ONE slot changes state, and nothing in this function assigns a task
// to the slot that was just released -- TODO confirm that this is intended.
// mutex_tasks is held until the function returns; presumably this keeps other
// threads from adding tasks while the deferred list is drained -- verify that
// every other writer takes the same mutex.
std::unique_lock<std::mutex> lock(mutex_tasks);
for (auto & task : queue_tasks_deferred) {
// each task is moved, not copied, so the deferred list is left holding moved-from entries
queue_tasks.push_back(std::move(task));
}
// discard the moved-from entries so the deferred list is empty again
queue_tasks_deferred.clear();
queue_tasks_deferred.clear(); // NOTE(review): second consecutive clear() has nothing left to remove; it looks like the before/after pair of one diff line -- confirm and keep a single call
}
// end the start_loop routine
@ -387,12 +390,14 @@ struct llama_server_response {
// Registers task_id in waiting_task_ids and reports the resulting set size.
void add_waiting_task_id(int task_id) {
    // mutex_results stays locked until return, covering the insert, the size read and the output
    const std::lock_guard<std::mutex> guard(mutex_results);
    waiting_task_ids.insert(task_id);
    const auto n_waiting = waiting_task_ids.size();
    // ANSI cursor-position escape: jump to row 21 before the status line is written
    // NOTE(review): printf goes to stdout; if LOG_TEE writes to another stream the ordering is not guaranteed -- confirm
    printf("\033[21;0H");
    LOG_TEE("Waiting task list size after addition: %zu.\n", n_waiting);
}
// Drops task_id from waiting_task_ids (no effect if it is absent) and reports the resulting set size.
void remove_waiting_task_id(int task_id) {
    // mutex_results stays locked until return, covering the erase, the size read and the output
    const std::lock_guard<std::mutex> guard(mutex_results);
    waiting_task_ids.erase(task_id);
    const auto n_waiting = waiting_task_ids.size();
    // ANSI cursor-position escape: jump to row 21 before the status line is written
    // NOTE(review): printf goes to stdout; if LOG_TEE writes to another stream the ordering is not guaranteed -- confirm
    printf("\033[21;0H");
    LOG_TEE("Waiting task list size after removal: %zu.\n", n_waiting);
}