server : add --no-context-shift option (#9607)

* server : add --no-context-shift option * small fix * Update examples/server/tests/features/embeddings.feature Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * tests : minor fix * revert usage of GGML_ASSERT * update server documentation --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-09-23 22:23:54 +02:00 · 2024-09-23 22:23:54 +02:00 · 0b3bf966f4
commit 0b3bf966f4
parent f0c7b5edf8
6 changed files with 139 additions and 22 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -1180,6 +1180,15 @@ struct server_context {
            SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
        }

+        // if context shift is disabled, we stop when it reaches the context limit
+        if (slot.n_decoded >= slot.n_ctx) {
+            slot.truncated      = true;
+            slot.stopped_limit  = true;
+            slot.has_next_token = false;
+
+            SLT_DBG(slot, "stopped due to running out of context capacity, n_decoded = %d, n_ctx = %d\n", slot.n_decoded, slot.n_ctx);
+        }
+
        if (llama_token_is_eog(model, result.tok)) {
            slot.stopped_eos    = true;
            slot.has_next_token = false;
@ -1480,7 +1489,7 @@ struct server_context {
            if (result.error) {
                error_handler(result.data);
                cancel_tasks(id_tasks);
-                break;
+                return;
            }

            size_t idx = result.data["index"];
@ -1827,6 +1836,14 @@ struct server_context {
        for (server_slot & slot : slots) {
            if (slot.ga_n == 1) {
                if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
+                    if (!params.ctx_shift) {
+                        // this check is redundant (for good)
+                        // we should never get here, because generation should already stopped in process_token()
+                        slot.release();
+                        send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER);
+                        continue;
+                    }
+
                    // Shift context
                    const int n_keep    = slot.params.n_keep + add_bos_token;
                    const int n_left    = (int) system_tokens.size() + slot.n_past - n_keep;
@ -1961,6 +1978,14 @@ struct server_context {
                                continue;
                            }
                        } else {
+                            if (!params.ctx_shift) {
+                                // if context shift is disabled, we make sure prompt size is smaller than KV size
+                                if ((int) system_tokens.size() + slot.n_prompt_tokens >= slot.n_ctx) {
+                                    slot.release();
+                                    send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
+                                    continue;
+                                }
+                            }
                            if (slot.params.n_keep < 0) {
                                slot.params.n_keep = slot.n_prompt_tokens;
                            }