diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 6962ece58..0dd2fc8b2 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1102,12 +1102,13 @@ struct server_context {
         }
 
         // if context shift is disabled, we stop when it reaches the context limit
-        if (slot.n_decoded >= slot.n_ctx) {
+        if (slot.n_past >= slot.n_ctx) {
             slot.truncated      = true;
             slot.stopped_limit  = true;
             slot.has_next_token = false;
 
-            SLT_DBG(slot, "stopped due to running out of context capacity, n_decoded = %d, n_ctx = %d\n", slot.n_decoded, slot.n_ctx);
+            SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
+                    slot.n_past, slot.n_prompt_tokens, slot.n_decoded, slot.n_ctx);
         }
 
         if (llama_token_is_eog(model, result.tok)) {
@@ -1797,7 +1798,7 @@ struct server_context {
         // apply context-shift if needed
         // TODO: simplify and improve
         for (server_slot & slot : slots) {
-            if (slot.is_processing() && slot.n_past >= slot.n_ctx - 1) {
+            if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) {
                 if (!params.ctx_shift) {
                     // this check is redundant (for good)
                     // we should never get here, because generation should already stopped in process_token()
@@ -1960,6 +1961,8 @@ struct server_context {
                         } else {
                             if (!params.ctx_shift) {
                                 // if context shift is disabled, we make sure prompt size is smaller than KV size
+                                // TODO: there should be a separate parameter that controls prompt truncation
+                                // context shift should be applied only during the generation phase
                                 if (slot.n_prompt_tokens >= slot.n_ctx) {
                                     slot.release();
                                     send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
diff --git a/examples/server/tests/features/ctx_shift.feature b/examples/server/tests/features/ctx_shift.feature
index ba3afcf06..ae6c6b01b 100644
--- a/examples/server/tests/features/ctx_shift.feature
+++ b/examples/server/tests/features/ctx_shift.feature
@@ -13,6 +13,10 @@ Feature: llama.cpp server
     And   32 as batch size
     And   2 slots
 
+  # the prompt is 301 tokens
+  # the slot context is 256/2 = 128 tokens
+  # the prompt is truncated to keep the last 109 tokens
+  # 64 tokens are generated thanks to shifting the context when it gets full
   Scenario: Inference with context shift
     And   64 server max tokens to predict
     Then  the server is starting
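
The arithmetic behind the new feature-file comments can be checked in isolation. The sketch below is not server.cpp code: it assumes the default of keeping no prefix tokens (n_keep = 0) and a simple "drop whole half-context blocks from the front of the prompt" truncation rule; the variable names (n_block_size, erased_blocks, n_kept) are illustrative only.

#include <cstdio>

// Standalone check of the numbers quoted in ctx_shift.feature:
// a 301-token prompt and a 256-token KV cache split across 2 slots.
// The truncation rule (erase whole half-context blocks from the front
// of the prompt) is an assumption for illustration, not a verbatim
// copy of the server's implementation.
int main() {
    const int n_prompt_tokens = 301;       // prompt length from the test comment
    const int n_ctx           = 256 / 2;   // per-slot context: 128 tokens
    const int n_keep          = 0;         // assumed default: keep no prefix tokens

    const int n_left        = n_ctx - n_keep;   // 128 tokens available after the kept prefix
    const int n_block_size  = n_left / 2;       // 64-token blocks
    const int erased_blocks = (n_prompt_tokens - n_keep - n_block_size) / n_block_size; // 3

    // tokens that survive truncation: kept prefix + everything after the erased blocks
    const int n_kept = n_keep + (n_prompt_tokens - n_keep - erased_blocks * n_block_size);

    std::printf("kept %d of %d prompt tokens\n", n_kept, n_prompt_tokens); // kept 109 of 301
    return 0;
}

With 109 prompt tokens in a 128-token slot, generating the scenario's 64 tokens still overruns the slot context, which is where the slot.n_past + 1 >= slot.n_ctx check above triggers the context shift during generation.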