Minor updates
This commit is contained in:
parent
9c99ef43d7
commit
011ea9852a
3 changed files with 10 additions and 15 deletions
|
@ -94,7 +94,7 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
url = "http://localhost:8080/completion"
|
url = "http://localhost:8080/completion"
|
||||||
|
|
||||||
num_requests = 40
|
num_requests = 20
|
||||||
q = Queue(maxsize = 64)
|
q = Queue(maxsize = 64)
|
||||||
threads = []
|
threads = []
|
||||||
|
|
||||||
|
|
|
@ -1589,7 +1589,7 @@ struct llama_server_context
|
||||||
slot.t_last_used = ggml_time_us();
|
slot.t_last_used = ggml_time_us();
|
||||||
|
|
||||||
LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
|
LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
|
||||||
queue_tasks.notify_slot_changed();
|
queue_tasks.notify_slot_changed(); // why don't we immediately reallocate the released slot without waiting? Is this what -cb does?
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -2254,22 +2254,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||||
}
|
}
|
||||||
else if (arg == "-skvg" || arg == "--show-graphics")
|
else if (arg == "-skvg" || arg == "--show-graphics")
|
||||||
{
|
{
|
||||||
if (i >= argc)
|
llama.skvgraphics = true; // -skvg takes no parameter so we don't test ++i >= argc
|
||||||
{
|
|
||||||
invalid_param = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
llama.skvgraphics = true;
|
|
||||||
llama.skvinteract = false;
|
llama.skvinteract = false;
|
||||||
}
|
}
|
||||||
else if (arg == "-skvi" || arg == "--show-interactive-graphics")
|
else if (arg == "-skvi" || arg == "--show-interactive-graphics")
|
||||||
{
|
{
|
||||||
if (i >= argc)
|
llama.skvgraphics = true; // -skvi takes no parameter so we don't test ++i >= argc
|
||||||
{
|
|
||||||
invalid_param = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
llama.skvgraphics = true;
|
|
||||||
llama.skvinteract = true;
|
llama.skvinteract = true;
|
||||||
}
|
}
|
||||||
else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
|
else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
|
||||||
|
|
|
@ -272,11 +272,14 @@ struct llama_server_queue {
|
||||||
// Call when the state of one slot is changed
|
// Call when the state of one slot is changed
|
||||||
void notify_slot_changed() {
|
void notify_slot_changed() {
|
||||||
// move deferred tasks back to main loop
|
// move deferred tasks back to main loop
|
||||||
|
// does this mean when ONE slot finished we move ALL deferred tasks back to the main queue? Why?
|
||||||
|
// it seems that we move everything back to the main queue but we don't allocate a task to the slot just released
|
||||||
|
// lock so nothing gets added while we are clearing the deferred queue
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||||
for (auto & task : queue_tasks_deferred) {
|
for (auto & task : queue_tasks_deferred) {
|
||||||
queue_tasks.push_back(std::move(task));
|
queue_tasks.push_back(std::move(task));
|
||||||
}
|
}
|
||||||
queue_tasks_deferred.clear();
|
queue_tasks_deferred.clear(); // and clear the deferred tasks completely?
|
||||||
}
|
}
|
||||||
|
|
||||||
// end the start_loop routine
|
// end the start_loop routine
|
||||||
|
@ -387,12 +390,14 @@ struct llama_server_response {
|
||||||
void add_waiting_task_id(int task_id) {
|
void add_waiting_task_id(int task_id) {
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
std::unique_lock<std::mutex> lock(mutex_results);
|
||||||
waiting_task_ids.insert(task_id);
|
waiting_task_ids.insert(task_id);
|
||||||
|
printf("\033[21;0H");
|
||||||
LOG_TEE("Waiting task list size after addition: %zu.\n", waiting_task_ids.size());
|
LOG_TEE("Waiting task list size after addition: %zu.\n", waiting_task_ids.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
void remove_waiting_task_id(int task_id) {
|
void remove_waiting_task_id(int task_id) {
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
std::unique_lock<std::mutex> lock(mutex_results);
|
||||||
waiting_task_ids.erase(task_id);
|
waiting_task_ids.erase(task_id);
|
||||||
|
printf("\033[21;0H");
|
||||||
LOG_TEE("Waiting task list size after removal: %zu.\n", waiting_task_ids.size());
|
LOG_TEE("Waiting task list size after removal: %zu.\n", waiting_task_ids.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue