From 3c8b10560ae45aa74fce01af9f7997bc39419f91 Mon Sep 17 00:00:00 2001
From: VJHack
Date: Fri, 22 Nov 2024 22:45:41 -0600
Subject: [PATCH] handle generation until context is filled

---
 examples/server/server.cpp                    |  2 +-
 .../server/tests/features/n_predict.feature   | 32 +++++++++++++++++++
 examples/server/tests/features/steps/steps.py |  6 ++--
 3 files changed, 37 insertions(+), 3 deletions(-)
 create mode 100644 examples/server/tests/features/n_predict.feature

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index e428954fa..8b3582048 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -244,7 +244,7 @@ struct server_slot {
         if (params.n_predict != -1) {
             n_remaining = params.n_predict - n_decoded;
         } else if (global_params.n_predict == -2) {
-            n_remaining = n_ctx - n_past;
+            n_remaining = n_ctx - n_past - 1;
         } else if (global_params.n_predict != -1) {
             n_remaining = global_params.n_predict - n_decoded;
         }
diff --git a/examples/server/tests/features/n_predict.feature b/examples/server/tests/features/n_predict.feature
new file mode 100644
index 000000000..1f41dbe3c
--- /dev/null
+++ b/examples/server/tests/features/n_predict.feature
@@ -0,0 +1,32 @@
+@llama.cpp
+@n_predict
+Feature: llama.cpp server
+
+  Background: Server startup
+    Given a server listening on localhost:8080
+    And a model file test-model.gguf
+    And a model alias tinyllama-2
+    And 42 as server seed
+    And 64 KV cache size
+
+  Scenario: Generate N tokens
+    And 12 max tokens to predict
+    Then the server is starting
+    Then the server is healthy
+    Given a prompt:
+    """
+    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+    """
+    And a completion request with no api error
+    Then 12 tokens are predicted
+
+  Scenario: Generate tokens until context is full
+    And -2 server max tokens to predict
+    Then the server is starting
+    Then the server is healthy
+    Given a prompt:
+    """
+    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+    """
+    And a completion request with no api error
+    Then 11 tokens are predicted
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 540a2ecd5..fe229116f 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -154,8 +154,10 @@ def step_n_slots(context, n_slots: int):
 
 @step('{n_predict:d} server max tokens to predict')
 def step_server_n_predict(context, n_predict: int):
-    context.n_server_predict = n_predict if n_predict > 0 else None
-
+    if n_predict > 0 or n_predict in (-1, -2):
+        context.n_server_predict = n_predict
+    else:
+        context.n_server_predict = None
 
 @step('{slot_save_path} as slot save path')
 def step_slot_save_path(context, slot_save_path: str):
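
Note for reviewers: a minimal standalone sketch of the budget logic this patch
touches. This is not llama.cpp code; the struct, the free function, and the
driver below are illustrative stand-ins for server_slot's budget check, and
the token counts in the comments are assumptions (the real values depend on
how the test prompt tokenizes).

// sketch.cpp -- illustrative only; names mirror the patch, not the real API
#include <cstdio>

struct gen_params { int n_predict; };

// n_predict semantics in the server: -1 = unlimited,
// -2 (server-level) = generate until the context is filled.
static int remaining_budget(const gen_params & params,
                            const gen_params & global_params,
                            int n_ctx, int n_past, int n_decoded) {
    int n_remaining = -1; // -1 = no budget set
    if (params.n_predict != -1) {
        n_remaining = params.n_predict - n_decoded;
    } else if (global_params.n_predict == -2) {
        // The patched line: reserve one cell so the budget reaches 0 one
        // token before n_past hits n_ctx, rather than only after the KV
        // cache is completely full.
        n_remaining = n_ctx - n_past - 1;
    } else if (global_params.n_predict != -1) {
        n_remaining = global_params.n_predict - n_decoded;
    }
    return n_remaining;
}

int main() {
    gen_params per_request{-1}; // request does not set n_predict
    gen_params server{-2};      // server started with n_predict == -2
    // Example values only. With n_ctx = 64 and n_past = 60:
    //   old formula: 64 - 60     = 4 tokens of budget
    //   new formula: 64 - 60 - 1 = 3 tokens of budget
    printf("budget = %d\n", remaining_budget(per_request, server, 64, 60, 0));
    return 0;
}

That reserved cell appears to be why the second scenario in
n_predict.feature expects one fewer predicted token (11) than the fixed
12-token budget of the first scenario, given the same prompt and a
64-token KV cache.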