handle generation until context is filled
parent 63978cb6dc
commit 3c8b10560a
3 changed files with 37 additions and 3 deletions
@@ -244,7 +244,7 @@ struct server_slot {
         if (params.n_predict != -1) {
             n_remaining = params.n_predict - n_decoded;
         } else if (global_params.n_predict == -2) {
-            n_remaining = n_ctx - n_past;
+            n_remaining = n_ctx - n_past - 1;
         } else if (global_params.n_predict != -1) {
             n_remaining = global_params.n_predict - n_decoded;
         }
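For context, here is a small standalone model of the branch above (a Python sketch, not the server's actual code; the function name and the sample numbers are assumptions): with the per-request n_predict left at -1 and the server-wide setting at -2, the remaining budget now stops one position short of the context size.

# Illustrative model of the n_remaining branch above; not llama.cpp code.
# Convention from the hunk: -1 means "no limit", and a server-wide -2 means
# "keep generating until the context is filled".
def remaining_budget(req_n_predict: int, server_n_predict: int,
                     n_ctx: int, n_past: int, n_decoded: int) -> int:
    n_remaining = -1  # unlimited unless a branch below sets a budget
    if req_n_predict != -1:
        n_remaining = req_n_predict - n_decoded
    elif server_n_predict == -2:
        # This commit adds the "- 1": stop one token short of a full context.
        n_remaining = n_ctx - n_past - 1
    elif server_n_predict != -1:
        n_remaining = server_n_predict - n_decoded
    return n_remaining

# Example with assumed numbers: a 64-token KV cache with 53 positions already
# used leaves a budget of 10 (it would have been 11 without the "- 1").
print(remaining_budget(-1, -2, n_ctx=64, n_past=53, n_decoded=0))  # -> 10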
examples/server/tests/features/n_predict.feature (new file, 32 additions)
@@ -0,0 +1,32 @@
+@llama.cpp
+@n_predict
+Feature: llama.cpp server
+
+  Background: Server startup
+    Given a server listening on localhost:8080
+    And a model file test-model.gguf
+    And a model alias tinyllama-2
+    And 42 as server seed
+    And 64 KV cache size
+
+  Scenario: Generate N tokens
+    And 12 max tokens to predict
+    Then the server is starting
+    Then the server is healthy
+    Given a prompt:
+    """
+    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+    """
+    And a completion request with no api error
+    Then 12 tokens are predicted
+
+  Scenario: Generate tokens until context is full
+    And -2 server max tokens to predict
+    Then the server is starting
+    Then the server is healthy
+    Given a prompt:
+    """
+    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+    """
+    And a completion request with no api error
+    Then 11 tokens are predicted
@@ -154,8 +154,10 @@ def step_n_slots(context, n_slots: int):


 @step('{n_predict:d} server max tokens to predict')
 def step_server_n_predict(context, n_predict: int):
-    context.n_server_predict = n_predict if n_predict > 0 else None
+    if n_predict > 0 or n_predict in (-1, -2):
+        context.n_server_predict = n_predict
+    else:
+        context.n_server_predict = None


 @step('{slot_save_path} as slot save path')
 def step_slot_save_path(context, slot_save_path: str):
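As a quick sanity check of that mapping (again a sketch, not part of the test suite; the helper name is made up): positive limits and the -1/-2 sentinels are now forwarded to the server, while any other value still means the flag is not set.

# Standalone sketch of the value mapping done by the updated step above.
def map_server_n_predict(n_predict: int):
    if n_predict > 0 or n_predict in (-1, -2):
        return n_predict  # forwarded to the server; -1 and -2 are newly preserved
    return None           # previously everything <= 0 ended up here

assert map_server_n_predict(12) == 12
assert map_server_n_predict(-2) == -2  # needed by the "context is full" scenario
assert map_server_n_predict(0) is None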