server : avoid n_available var

ggml-ci
Georgi Gerganov 2024-03-06 23:23:17 +02:00
parent c50a510092
commit c53d84ec16


@@ -1583,16 +1583,16 @@ struct llama_server_context {
 
         // check if all slots are idle
         {
-            bool all_slots_are_idle = true;
+            bool all_idle = true;
 
             for (auto & slot : slots) {
                 if (slot.state != IDLE || slot.command != NONE) {
-                    all_slots_are_idle = false;
+                    all_idle = false;
                     break;
                 }
             }
 
-            if (all_slots_are_idle) {
+            if (all_idle) {
                 LOG_INFO("all slots are idle", {});
                 if (system_prompt.empty() && clean_kv_cache) {
                     kv_cache_clear();
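
The first hunk only renames the flag; the loop itself is an all-of scan over the slots. For reference, the same check can be written with std::all_of. This is a minimal sketch with stand-in types, not the server's actual definitions:

    #include <algorithm>
    #include <vector>

    // stand-ins for the server's slot enums/struct (assumed for illustration)
    enum slot_state   { IDLE, PROCESSING };
    enum slot_command { NONE, LOAD_PROMPT, RELEASE };

    struct server_slot {
        slot_state   state   = IDLE;
        slot_command command = NONE;
    };

    // equivalent to the manual loop above: true iff no slot has pending work
    bool all_slots_idle(const std::vector<server_slot> & slots) {
        return std::all_of(slots.begin(), slots.end(), [](const server_slot & s) {
            return s.state == IDLE && s.command == NONE;
        });
    }
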
@@ -1688,8 +1688,6 @@ struct llama_server_context {
 
         // assign workload to the slots
         if (params.cont_batching || batch.n_tokens == 0) {
-            int n_available = n_batch;
-
             for (auto & slot : slots) {
                 const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
 
@@ -1830,7 +1828,7 @@ struct llama_server_context {
                     if (slot.embedding) {
                         // cannot fit the prompt in the current batch - will try next iter
-                        if (slot.n_prompt_tokens > n_available) {
+                        if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
                             continue;
                         }
                     }
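
Why the n_available variable can go: the old code kept n_available equal to n_batch - batch.n_tokens at all times, so the removed check slot.n_prompt_tokens > n_available is the same predicate as batch.n_tokens + slot.n_prompt_tokens > n_batch. A small sketch of that equivalence (names are illustrative, not the server's code):

    #include <cassert>
    #include <cstdint>

    // the old code maintained the invariant: n_available == n_batch - batch_n_tokens
    void check_equivalence(int32_t n_batch, int32_t batch_n_tokens, int32_t n_prompt_tokens) {
        const int32_t n_available = n_batch - batch_n_tokens;

        const bool old_check = n_prompt_tokens > n_available;              // removed form
        const bool new_check = batch_n_tokens + n_prompt_tokens > n_batch; // new form

        assert(old_check == new_check); // holds for all inputs, so the counter is redundant
    }
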
@@ -1850,7 +1848,7 @@ struct llama_server_context {
                     int32_t ga_n = slot.ga_n;
                     int32_t ga_w = slot.ga_w;
 
-                    for (; slot.n_past < slot.n_prompt_tokens && n_available > 0; ++slot.n_past, --n_available) {
+                    for (; slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch; ++slot.n_past) {
                         if (slot.ga_n != 1) {
                             while (slot_npast >= ga_i + ga_w) {
                                 const int bd = (ga_w/ga_n)*(ga_n - 1);
@@ -1869,7 +1867,7 @@ struct llama_server_context {
                         slot_npast++;
                     }
 
-                    // entire prompt has been processed
+                    // entire prompt has been processed - start decoding new tokens
                     if (slot.n_past == slot.n_prompt_tokens) {
                         slot.state = PROCESSING;
                         slot.command = NONE;
@@ -1898,7 +1896,7 @@ struct llama_server_context {
                     }
                 }
 
-                if (n_available == 0) {
+                if (batch.n_tokens >= n_batch) {
                     break;
                 }
             }
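
Taken together, the remaining hunks swap the decrementing budget for direct comparisons against the batch capacity: the prompt loop stops once batch.n_tokens reaches n_batch, and slot assignment breaks out when the batch is full. A condensed sketch of the resulting control flow, with simplified stand-in types rather than the server's actual llama_batch plumbing:

    #include <cstdint>
    #include <vector>

    // simplified stand-ins (assumed for illustration)
    struct batch_t { int32_t n_tokens = 0; };
    struct slot_t  { int32_t n_past = 0; int32_t n_prompt_tokens = 0; };

    void fill_batch(batch_t & batch, std::vector<slot_t> & slots, const int32_t n_batch) {
        for (auto & slot : slots) {
            // push prompt tokens while the batch still has room
            for (; slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch; ++slot.n_past) {
                batch.n_tokens++; // stands in for llama_batch_add(...)
            }

            // batch is full - stop assigning work to the remaining slots
            if (batch.n_tokens >= n_batch) {
                break;
            }
        }
    }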