From 3c8b10560ae45aa74fce01af9f7997bc39419f91 Mon Sep 17 00:00:00 2001
From: VJHack
Date: Fri, 22 Nov 2024 22:45:41 -0600
Subject: [PATCH] handle generation until context is filled

---
 examples/server/server.cpp                    |  2 +-
 .../server/tests/features/n_predict.feature   | 32 +++++++++++++++++++
 examples/server/tests/features/steps/steps.py |  6 ++--
 3 files changed, 37 insertions(+), 3 deletions(-)
 create mode 100644 examples/server/tests/features/n_predict.feature

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index e428954fa..8b3582048 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -244,7 +244,7 @@ struct server_slot {
         if (params.n_predict != -1) {
             n_remaining = params.n_predict - n_decoded;
         } else if (global_params.n_predict == -2) {
-            n_remaining = n_ctx - n_past;
+            n_remaining = n_ctx - n_past - 1;
         } else if (global_params.n_predict != -1) {
             n_remaining = global_params.n_predict - n_decoded;
         }
diff --git a/examples/server/tests/features/n_predict.feature b/examples/server/tests/features/n_predict.feature
new file mode 100644
index 000000000..1f41dbe3c
--- /dev/null
+++ b/examples/server/tests/features/n_predict.feature
@@ -0,0 +1,32 @@
+@llama.cpp
+@n_predict
+Feature: llama.cpp server
+
+  Background: Server startup
+    Given a server listening on localhost:8080
+    And a model file test-model.gguf
+    And a model alias tinyllama-2
+    And 42 as server seed
+    And 64 KV cache size
+
+  Scenario: Generate N tokens
+    And 12 max tokens to predict
+    Then the server is starting
+    Then the server is healthy
+    Given a prompt:
+    """
+    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+    """
+    And a completion request with no api error
+    Then 12 tokens are predicted
+
+  Scenario: Generate tokens until context is full
+    And -2 server max tokens to predict
+    Then the server is starting
+    Then the server is healthy
+    Given a prompt:
+    """
+    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+    """
+    And a completion request with no api error
+    Then 11 tokens are predicted
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 540a2ecd5..fe229116f 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -154,8 +154,10 @@ def step_n_slots(context, n_slots: int):
 
 @step('{n_predict:d} server max tokens to predict')
 def step_server_n_predict(context, n_predict: int):
-    context.n_server_predict = n_predict if n_predict > 0 else None
-
+    if n_predict > 0 or n_predict in (-1, -2):
+        context.n_server_predict = n_predict
+    else:
+        context.n_server_predict = None
 
 @step('{slot_save_path} as slot save path')
 def step_slot_save_path(context, slot_save_path: str):
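
Note for reviewers: a minimal standalone sketch of the budget logic this patch
touches. This is not llama.cpp code; the struct, the free function, and the
driver below are illustrative stand-ins for server_slot's budget check, and
the token counts in the comments are assumptions (the real values depend on
how the test prompt tokenizes).

// sketch.cpp -- illustrative only; names mirror the patch, not the real API
#include <cstdio>

struct gen_params { int n_predict; };

// n_predict semantics in the server: -1 = unlimited,
// -2 (server-level) = generate until the context is filled.
static int remaining_budget(const gen_params & params,
                            const gen_params & global_params,
                            int n_ctx, int n_past, int n_decoded) {
    int n_remaining = -1; // -1 = no budget set
    if (params.n_predict != -1) {
        n_remaining = params.n_predict - n_decoded;
    } else if (global_params.n_predict == -2) {
        // The patched line: reserve one cell so the budget reaches 0 one
        // token before n_past hits n_ctx, rather than only after the KV
        // cache is completely full.
        n_remaining = n_ctx - n_past - 1;
    } else if (global_params.n_predict != -1) {
        n_remaining = global_params.n_predict - n_decoded;
    }
    return n_remaining;
}

int main() {
    gen_params per_request{-1}; // request does not set n_predict
    gen_params server{-2};      // server started with n_predict == -2
    // Example values only. With n_ctx = 64 and n_past = 60:
    //   old formula: 64 - 60     = 4 tokens of budget
    //   new formula: 64 - 60 - 1 = 3 tokens of budget
    printf("budget = %d\n", remaining_budget(per_request, server, 64, 60, 0));
    return 0;
}

That reserved cell appears to be why the second scenario in
n_predict.feature expects one fewer predicted token (11) than the fixed
12-token budget of the first scenario, given the same prompt and a
64-token KV cache.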