From c53d84ec16e4f6e826e6fac2b92fb382ea8525a1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 6 Mar 2024 23:23:17 +0200
Subject: [PATCH] server : avoid n_available var

ggml-ci
---
 examples/server/server.cpp | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index ac5e76406..6ca4e9ba0 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1583,16 +1583,16 @@ struct llama_server_context {

         // check if all slots are idle
         {
-            bool all_slots_are_idle = true;
+            bool all_idle = true;

             for (auto & slot : slots) {
                 if (slot.state != IDLE || slot.command != NONE) {
-                    all_slots_are_idle = false;
+                    all_idle = false;
                     break;
                 }
             }

-            if (all_slots_are_idle) {
+            if (all_idle) {
                 LOG_INFO("all slots are idle", {});
                 if (system_prompt.empty() && clean_kv_cache) {
                     kv_cache_clear();
@@ -1688,8 +1688,6 @@ struct llama_server_context {

         // assign workload to the slots
         if (params.cont_batching || batch.n_tokens == 0) {
-            int n_available = n_batch;
-
             for (auto & slot : slots) {
                 const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());

@@ -1830,7 +1828,7 @@ struct llama_server_context {

                if (slot.embedding) {
                    // cannot fit the prompt in the current batch - will try next iter
-                   if (slot.n_prompt_tokens > n_available) {
+                   if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
                        continue;
                    }
                }
@@ -1850,7 +1848,7 @@ struct llama_server_context {
                int32_t ga_n = slot.ga_n;
                int32_t ga_w = slot.ga_w;

-               for (; slot.n_past < slot.n_prompt_tokens && n_available > 0; ++slot.n_past, --n_available) {
+               for (; slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch; ++slot.n_past) {
                    if (slot.ga_n != 1) {
                        while (slot_npast >= ga_i + ga_w) {
                            const int bd = (ga_w/ga_n)*(ga_n - 1);
@@ -1869,7 +1867,7 @@ struct llama_server_context {
                    slot_npast++;
                }

-               // entire prompt has been processed
+               // entire prompt has been processed - start decoding new tokens
                if (slot.n_past == slot.n_prompt_tokens) {
                    slot.state = PROCESSING;
                    slot.command = NONE;
@@ -1898,7 +1896,7 @@ struct llama_server_context {
                    }
                }

-               if (n_available == 0) {
+               if (batch.n_tokens >= n_batch) {
                    break;
                }
            }
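Note (not part of the patch): the change drops the decrementing n_available counter and derives the remaining batch capacity directly from batch.n_tokens and n_batch. The C++ sketch below only illustrates that accounting pattern in isolation; dummy_batch, prompt_sizes, and the constants are simplified stand-ins, not llama.cpp types.

    // Standalone sketch of the capacity checks used by the patch:
    // skip a slot whose prompt does not fit, fill while there is room,
    // and stop assigning work once the batch is full.
    #include <cstdio>
    #include <vector>

    struct dummy_batch {
        int n_tokens = 0;   // tokens already queued in the current batch
    };

    int main() {
        const int   n_batch = 8;    // assumed batch capacity
        dummy_batch batch;

        const std::vector<int> prompt_sizes = {3, 4, 2};   // hypothetical per-slot prompt lengths

        for (int n_prompt_tokens : prompt_sizes) {
            // same shape as the embedding check in the patch: skip the slot if its
            // whole prompt does not fit into the remaining space of the batch
            if (batch.n_tokens + n_prompt_tokens > n_batch) {
                std::printf("slot skipped: %d tokens do not fit (%d/%d used)\n",
                            n_prompt_tokens, batch.n_tokens, n_batch);
                continue;
            }

            // queue prompt tokens while there is room, mirroring the
            // batch.n_tokens < n_batch loop condition in the patch
            for (int i = 0; i < n_prompt_tokens && batch.n_tokens < n_batch; ++i) {
                batch.n_tokens++;
            }

            // batch is full - same early exit as the patch
            if (batch.n_tokens >= n_batch) {
                break;
            }
        }

        std::printf("batch filled with %d/%d tokens\n", batch.n_tokens, n_batch);
        return 0;
    }

With the assumed capacity of 8, the sketch queues the 3- and 4-token prompts (7/8 used) and skips the final 2-token prompt, matching the "will try next iter" behavior in the patched code.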