server : avoid n_available var

ggml-ci

parent c50a510092
commit c53d84ec16

1 changed file with 7 additions and 9 deletions
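Besides shortening all_slots_are_idle to all_idle, the change drops the per-iteration countdown n_available from the batching loop: instead of initializing it to n_batch and decrementing it for every token queued, each condition is rewritten against batch.n_tokens, the fill level the batch already tracks. A minimal sketch of the pattern, with simplified stand-in names rather than the server code itself:

    #include <cstdio>
    #include <vector>

    // Sketch of the refactor pattern, with simplified stand-in names.
    // Before: remaining capacity lived in a separate countdown variable.
    // After:  it is derived from the batch fill level, which already exists.
    int main() {
        const int n_batch = 8;                           // batch capacity
        std::vector<int> batch;                          // stand-in for llama_batch
        const std::vector<int> prompt = {1, 2, 3, 4, 5};

        // old: int n_available = n_batch;  ...  --n_available per queued token
        // new: stop when the batch is full; batch.size() plays batch.n_tokens
        for (size_t i = 0; i < prompt.size() && (int) batch.size() < n_batch; ++i) {
            batch.push_back(prompt[i]);
        }

        printf("queued %zu of %d possible tokens\n", batch.size(), n_batch);
        return 0;
    }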
@@ -1583,16 +1583,16 @@ struct llama_server_context {

         // check if all slots are idle
         {
-            bool all_slots_are_idle = true;
+            bool all_idle = true;

             for (auto & slot : slots) {
                 if (slot.state != IDLE || slot.command != NONE) {
-                    all_slots_are_idle = false;
+                    all_idle = false;
                     break;
                 }
             }

-            if (all_slots_are_idle) {
+            if (all_idle) {
                 LOG_INFO("all slots are idle", {});
                 if (system_prompt.empty() && clean_kv_cache) {
                     kv_cache_clear();
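The first hunk is a pure rename. For reference, the same check could also be phrased with std::all_of; a sketch assuming a simplified slot type, not the server's actual struct:

    #include <algorithm>
    #include <vector>

    // Simplified stand-ins for the server's slot state machine.
    enum slot_state   { IDLE, PROCESSING };
    enum slot_command { NONE, LOAD_PROMPT };

    struct server_slot {
        slot_state   state   = IDLE;
        slot_command command = NONE;
    };

    // Equivalent to the flag-and-break loop in the hunk above.
    bool all_idle(const std::vector<server_slot> & slots) {
        return std::all_of(slots.begin(), slots.end(), [](const server_slot & s) {
            return s.state == IDLE && s.command == NONE;
        });
    }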
@@ -1688,8 +1688,6 @@ struct llama_server_context {

         // assign workload to the slots
         if (params.cont_batching || batch.n_tokens == 0) {
-            int n_available = n_batch;
-
             for (auto & slot : slots) {
                 const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());

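With the initialization gone, the remaining hunks rely on the invariant that n_available always equaled n_batch - batch.n_tokens, since the two moved in lockstep, one step per queued token. Each rewritten condition is then a mechanical substitution; an illustrative self-check of the three equivalences used below:

    #include <cassert>

    // Illustrative only: verifies that the rewritten conditions agree with
    // the old forms under the invariant n_available == n_batch - n_tokens.
    int main() {
        const int n_batch  = 512;
        const int n_prompt = 200;  // arbitrary prompt size

        for (int n_tokens = 0; n_tokens <= n_batch; ++n_tokens) {
            const int n_available = n_batch - n_tokens;  // the removed variable

            assert((n_prompt > n_available) == (n_tokens + n_prompt > n_batch));
            assert((n_available > 0)        == (n_tokens < n_batch));
            assert((n_available == 0)       == (n_tokens >= n_batch));
        }
        return 0;
    }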
@@ -1830,7 +1828,7 @@ struct llama_server_context {

                 if (slot.embedding) {
                     // cannot fit the prompt in the current batch - will try next iter
-                    if (slot.n_prompt_tokens > n_available) {
+                    if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
                         continue;
                     }
                 }
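The embedding check now asks directly whether the whole prompt still fits in what is left of the batch. A worked example with assumed values:

    #include <cstdio>

    // Assumed values for illustration; names mirror the condition above.
    int main() {
        const int n_batch         = 512;  // batch capacity
        const int n_tokens        = 400;  // already queued this iteration
        const int n_prompt_tokens = 200;  // this slot's prompt

        // 400 + 200 > 512, same verdict the old 200 > n_available (112) gave
        if (n_tokens + n_prompt_tokens > n_batch) {
            printf("prompt does not fit (%d + %d > %d) - try next iter\n",
                   n_tokens, n_prompt_tokens, n_batch);
        }
        return 0;
    }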
@@ -1850,7 +1848,7 @@ struct llama_server_context {
                     int32_t ga_n = slot.ga_n;
                     int32_t ga_w = slot.ga_w;

-                    for (; slot.n_past < slot.n_prompt_tokens && n_available > 0; ++slot.n_past, --n_available) {
+                    for (; slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch; ++slot.n_past) {
                         if (slot.ga_n != 1) {
                             while (slot_npast >= ga_i + ga_w) {
                                 const int bd = (ga_w/ga_n)*(ga_n - 1);
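In the prompt-processing loop the capacity check moves into the loop condition. This works because the loop body, elided in the hunk, appends one token to the batch per iteration, so batch.n_tokens advances exactly where --n_available used to run. A reduced sketch of the new loop shape, with stand-in names:

    #include <cstdio>

    // Reduced sketch of the new loop shape; all names are stand-ins.
    int main() {
        const int n_batch         = 4;   // batch capacity
        const int n_prompt_tokens = 10;  // prompt longer than one batch
        int n_past   = 0;                // prompt tokens already scheduled
        int n_tokens = 0;                // plays the role of batch.n_tokens

        for (; n_past < n_prompt_tokens && n_tokens < n_batch; ++n_past) {
            n_tokens++;  // stands in for appending one token to the batch
        }

        printf("scheduled %d/%d prompt tokens this iteration\n",
               n_past, n_prompt_tokens);
        return 0;
    }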
@@ -1869,7 +1867,7 @@ struct llama_server_context {
                         slot_npast++;
                     }

-                    // entire prompt has been processed
+                    // entire prompt has been processed - start decoding new tokens
                     if (slot.n_past == slot.n_prompt_tokens) {
                         slot.state = PROCESSING;
                         slot.command = NONE;
@@ -1898,7 +1896,7 @@ struct llama_server_context {
                    }
                }

                if (n_available == 0) {
-                if (n_available == 0) {
+                if (batch.n_tokens >= n_batch) {
                    break;
                }
            }