server : fix context limit check to use slot.n_past

ggml-ci
Georgi Gerganov 2024-10-12 15:33:47 +03:00
parent 8a1f4393ee
commit b75afe34c2
2 changed files with 10 additions and 3 deletions

@@ -1102,12 +1102,13 @@ struct server_context {
         }
 
         // if context shift is disabled, we stop when it reaches the context limit
-        if (slot.n_decoded >= slot.n_ctx) {
+        if (slot.n_past >= slot.n_ctx) {
             slot.truncated      = true;
             slot.stopped_limit  = true;
             slot.has_next_token = false;
 
-            SLT_DBG(slot, "stopped due to running out of context capacity, n_decoded = %d, n_ctx = %d\n", slot.n_decoded, slot.n_ctx);
+            SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
+                    slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx);
         }
 
         if (llama_token_is_eog(model, result.tok)) {
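
The check above now compares slot.n_past (the number of tokens currently held in the slot's KV cache, i.e. prompt plus generated tokens) against the slot's context size, instead of slot.n_decoded (generated tokens only), so a long prompt can no longer push the cache past n_ctx before the stop condition fires. Below is a minimal, self-contained sketch of the difference; the field names only mirror the slot state and the values are illustrative:

// Illustrative sketch (not the server code): why n_past is the right bound.
#include <cstdio>

struct slot_state {
    int n_ctx;           // context size available to this slot
    int n_prompt_tokens; // tokens consumed by the prompt
    int n_decoded;       // tokens generated so far
    int n_past;          // tokens currently in the KV cache = prompt + generated
};

int main() {
    slot_state slot = { 128, 100, 30, 130 };

    // old check: only fires after 128 *generated* tokens, even though the
    // cache already holds 100 + 30 = 130 tokens and has overflowed n_ctx
    bool old_stop = slot.n_decoded >= slot.n_ctx; // false
    // new check: fires as soon as the cache itself reaches the limit
    bool new_stop = slot.n_past    >= slot.n_ctx; // true

    printf("old_stop = %d, new_stop = %d\n", old_stop, new_stop);
    return 0;
}
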
@@ -1797,7 +1798,7 @@ struct server_context {
         // apply context-shift if needed
         // TODO: simplify and improve
         for (server_slot & slot : slots) {
-            if (slot.is_processing() && slot.n_past >= slot.n_ctx - 1) {
+            if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) {
                 if (!params.ctx_shift) {
                     // this check is redundant (for good)
                     // we should never get here, because generation should already stopped in process_token()
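
The new condition slot.n_past + 1 >= slot.n_ctx asks whether the next decode step would still fit in the slot's context. When it would not and context shift is enabled, the server frees room by discarding the oldest non-kept cache entries. The following is a simplified, self-contained sketch of that bookkeeping; n_keep and n_discard echo the server's naming, but the numbers and the plain-integer arithmetic are illustrative, not the actual KV-cache calls:

// Simplified sketch of context-shift bookkeeping (not the server code).
#include <cstdio>

int main() {
    int n_ctx  = 128; // context cells available to the slot
    int n_keep = 10;  // leading tokens that are never evicted
    int n_past = 127; // cache is full: the next decode step would not fit

    if (n_past + 1 >= n_ctx) {
        int n_left    = n_past - n_keep;
        int n_discard = n_left / 2; // drop the oldest half after the kept prefix

        // the discarded cells are removed and the remaining ones slide down,
        // leaving n_discard free positions for further generation
        n_past -= n_discard;

        printf("discarded %d tokens, n_past is now %d\n", n_discard, n_past);
    }
    return 0;
}
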
@@ -1960,6 +1961,8 @@ struct server_context {
             } else {
                 if (!params.ctx_shift) {
                     // if context shift is disabled, we make sure prompt size is smaller than KV size
+                    // TODO: there should be a separate parameter that control prompt truncation
+                    // context shift should be applied only during the generation phase
                     if (slot.n_prompt_tokens >= slot.n_ctx) {
                         slot.release();
                         send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);

@@ -13,6 +13,10 @@ Feature: llama.cpp server
     And   32 as batch size
     And   2 slots
 
+  # the prompt is 301 tokens
+  # the slot context is 256/2 = 128 tokens
+  # the prompt is truncated to keep the last 109 tokens
+  # 64 tokens are generated thanks to shifting the context when it gets full
   Scenario: Inference with context shift
     And   64 server max tokens to predict
     Then  the server is starting
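
The comments added to this scenario can be checked with a short calculation. The sketch below assumes a keep-first, drop-oldest-blocks truncation scheme and an n_keep of 0; those assumptions and all variable names are illustrative rather than taken from the test configuration:

// Back-of-the-envelope check of the numbers in the scenario comments
// (assumed truncation scheme; names and n_keep = 0 are illustrative).
#include <cstdio>

int main() {
    int n_ctx_total     = 256;
    int n_slots         = 2;
    int n_ctx           = n_ctx_total / n_slots; // 128 tokens per slot
    int n_prompt_tokens = 301;
    int n_keep          = 0;                     // no pinned prefix

    if (n_prompt_tokens >= n_ctx) {
        int n_block_size  = (n_ctx - n_keep) / 2;                                      // 64
        int erased_blocks = (n_prompt_tokens - n_keep - n_block_size) / n_block_size;  // 3

        int n_kept = n_keep + n_prompt_tokens - n_keep - erased_blocks * n_block_size; // 109
        printf("prompt truncated from %d to %d tokens\n", n_prompt_tokens, n_kept);
        // the 64 predicted tokens then rely on context shift once the 128 cells fill up
    }
    return 0;
}
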