server : fix context limit check to use slot.n_past
ggml-ci
parent 8a1f4393ee
commit b75afe34c2
2 changed files with 10 additions and 3 deletions
@@ -1102,12 +1102,13 @@ struct server_context {
         }
 
         // if context shift is disabled, we stop when it reaches the context limit
-        if (slot.n_decoded >= slot.n_ctx) {
+        if (slot.n_past >= slot.n_ctx) {
             slot.truncated      = true;
             slot.stopped_limit  = true;
             slot.has_next_token = false;
 
-            SLT_DBG(slot, "stopped due to running out of context capacity, n_decoded = %d, n_ctx = %d\n", slot.n_decoded, slot.n_ctx);
+            SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
+                    slot.n_past, slot.n_prompt_tokens, slot.n_decoded, slot.n_ctx);
         }
 
         if (llama_token_is_eog(model, result.tok)) {
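Worth spelling out why the check moves from slot.n_decoded to slot.n_past: n_decoded counts only the tokens generated in this request, while n_past counts everything currently held in the slot's KV cache, prompt included, so a long prompt can exhaust the context well before n_decoded reaches n_ctx. The following standalone sketch illustrates the difference; toy_slot and its fields are illustrative stand-ins, not the server's types.

// Standalone sketch, not the server code: toy_slot and its fields are illustrative.
#include <cstdio>

struct toy_slot {
    int n_ctx;           // per-slot context size
    int n_prompt_tokens; // tokens consumed by the prompt
    int n_decoded;       // tokens generated so far in this request
    int n_past() const { return n_prompt_tokens + n_decoded; } // tokens held in the KV cache
};

int main() {
    toy_slot slot = { /*n_ctx =*/ 128, /*n_prompt_tokens =*/ 100, /*n_decoded =*/ 0 };

    for (int i = 0; i < 64; ++i) {
        if (slot.n_past() >= slot.n_ctx) { // the fixed check: the prompt counts toward the limit
            std::printf("stop after %d generated tokens: n_past = %d, n_ctx = %d\n",
                        slot.n_decoded, slot.n_past(), slot.n_ctx);
            return 0;
        }
        // a check on n_decoded alone would happily emit all 64 tokens here
        slot.n_decoded++;
    }
    return 0;
}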
@@ -1797,7 +1798,7 @@ struct server_context {
         // apply context-shift if needed
         // TODO: simplify and improve
         for (server_slot & slot : slots) {
-            if (slot.is_processing() && slot.n_past >= slot.n_ctx - 1) {
+            if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) {
                 if (!params.ctx_shift) {
                     // this check is redundant (for good)
                     // we should never get here, because generation should have already stopped in process_token()
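The rewritten condition in the context-shift loop is arithmetically the same as the old one: for the integer values involved, n_past >= n_ctx - 1 and n_past + 1 >= n_ctx select exactly the same states, and the new form simply reads as "the next token would not fit". A throwaway check, independent of the server code:

#include <cassert>

int main() {
    // exhaustive check over a small range; both forms agree everywhere
    for (int n_ctx = 1; n_ctx <= 512; ++n_ctx) {
        for (int n_past = 0; n_past <= 2 * n_ctx; ++n_past) {
            assert((n_past >= n_ctx - 1) == (n_past + 1 >= n_ctx));
        }
    }
    return 0;
}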
@@ -1960,6 +1961,8 @@ struct server_context {
                 } else {
                     if (!params.ctx_shift) {
                         // if context shift is disabled, we make sure prompt size is smaller than KV size
+                        // TODO: there should be a separate parameter that controls prompt truncation
+                        // context shift should be applied only during the generation phase
                         if (slot.n_prompt_tokens >= slot.n_ctx) {
                             slot.release();
                             send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
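For context on the prompt-size check above: with context shift disabled, a prompt that does not fit in the slot's context is rejected outright instead of being truncated. A minimal sketch of that decision, assuming the per-slot context is the total KV cache size divided by the number of slots (which matches the 256/2 = 128 arithmetic in the test below); the constants and names are illustrative, not taken from the server:

#include <cstdio>

int main() {
    const int  n_ctx_total     = 256;                   // total KV cache size (assumed)
    const int  n_slots         = 2;
    const int  n_ctx_slot      = n_ctx_total / n_slots; // 128 tokens available per slot
    const int  n_prompt_tokens = 301;
    const bool ctx_shift       = false;

    if (!ctx_shift && n_prompt_tokens >= n_ctx_slot) {
        std::printf("reject: prompt of %d tokens exceeds the slot context of %d tokens\n",
                    n_prompt_tokens, n_ctx_slot);
    } else {
        std::printf("accept: prompt fits, or context shift will handle overflow\n");
    }
    return 0;
}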
@@ -13,6 +13,10 @@ Feature: llama.cpp server
     And   32 as batch size
     And   2 slots
 
+  # the prompt is 301 tokens
+  # the slot context is 256/2 = 128 tokens
+  # the prompt is truncated to keep the last 109 tokens
+  # 64 tokens are generated thanks to shifting the context when it gets full
   Scenario: Inference with context shift
     And   64 server max tokens to predict
     Then  the server is starting
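The numbers in the new test comments fit together as follows. With 2 slots sharing a 256-token KV cache, each slot gets 128 tokens of context, so a 301-token prompt must be truncated and the remaining generation relies on context shifting. The sketch below reproduces the 109-token figure using the block-wise truncation the server applies as far as I understand it (keep n_keep tokens at the front, then drop whole half-context blocks from what follows); treat the formula as an assumption rather than the authoritative implementation:

#include <cstdio>

int main() {
    const int n_ctx_slot      = 256 / 2; // 128 tokens per slot (2 slots)
    const int n_prompt_tokens = 301;
    const int n_keep          = 0;       // assumed default

    const int n_left       = n_ctx_slot - n_keep;                                      // 128
    const int n_block_size = n_left / 2;                                               // 64
    const int n_erased     = (n_prompt_tokens - n_keep - n_block_size) / n_block_size; // 3 blocks dropped

    const int n_truncated  = n_prompt_tokens - n_erased * n_block_size;                // 301 - 192 = 109

    std::printf("prompt truncated from %d to %d tokens; the 64 predicted tokens then\n"
                "require shifting the context once it fills up\n",
                n_prompt_tokens, n_truncated);
    return 0;
}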