From 4dcb3ea9439a36dcdf71db295d0c8b4fcbffc678 Mon Sep 17 00:00:00 2001
From: ochafik <ochafik@google.com>
Date: Sun, 29 Sep 2024 01:09:41 +0100
Subject: [PATCH] `tests`: allow artificial slowdown of sampling for tests

---
 common/arg.cpp                                | 7 +++++++
 common/common.h                               | 2 ++
 examples/server/server.cpp                    | 3 +++
 examples/server/tests/features/steps/steps.py | 6 ++++++
 4 files changed, 18 insertions(+)

diff --git a/common/arg.cpp b/common/arg.cpp
index 8266a16c2..1ae55b22c 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1879,6 +1879,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.slot_prompt_similarity = std::stof(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
+        {"--testing-sampler-delay-millis"}, "N",
+        format("for tests: delay in milliseconds to add to each sampling (default: %d)", params.testing_sampler_delay_millis),
+        [](gpt_params & params, int value) {
+            params.testing_sampler_delay_millis = value; // stored verbatim; no range check is done here
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER})); // test-only option, tagged with the server example
     add_opt(llama_arg(
         {"--lora-init-without-apply"},
         format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
diff --git a/common/common.h b/common/common.h
index 8b84cf9ad..154d59846 100644
--- a/common/common.h
+++ b/common/common.h
@@ -299,6 +299,8 @@ struct gpt_params {
 
     float slot_prompt_similarity = 0.5f;
 
+    int testing_sampler_delay_millis = 0; // for tests: delay in ms the server sleeps before each sampling; <= 0 means no delay
+
     // batched-bench params
     bool is_pp_shared = false;
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 1ce4d7e26..c308e23ca 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2348,6 +2348,9 @@ struct server_context {
                 }
 
                 completion_token_output result;
+                if (params.testing_sampler_delay_millis > 0) { // for tests: artificially slow down sampling
+                    std::this_thread::sleep_for(std::chrono::milliseconds(params.testing_sampler_delay_millis)); // blocks the calling thread right before this token is sampled
+                }
                 const llama_token id = gpt_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
 
                 gpt_sampler_accept(slot.smpl, id, true);
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 2611614ba..31bfb0b2b 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -78,6 +78,7 @@ def step_server_config(context, server_fqdn: str, server_port: str):
     context.response_format = None
     context.temperature = None
     context.lora_file = None
+    context.testing_sampler_delay_millis = None  # None => no sampler-delay flag is passed when the server is started
     context.disable_ctx_shift = False
 
     context.tasks_result = []
@@ -455,6 +456,9 @@ def step_impl(context, n_ga):
 def step_impl(context, n_ga_w):
     context.n_ga_w = n_ga_w
 
+@step('{testing_sampler_delay_millis:d} milliseconds delay in sampler for testing')
+def step_testing_sampler_delay_millis(context, testing_sampler_delay_millis):
+    context.testing_sampler_delay_millis = testing_sampler_delay_millis  # int (parsed via `:d`); read later by start_server_background to build the server args
 
 @step('a passkey prompt template')
 def step_prompt_passkey(context):
@@ -1436,6 +1440,8 @@ def start_server_background(context):
         server_args.append('--verbose')
     if context.lora_file:
         server_args.extend(['--lora', context.lora_file])
+    if context.testing_sampler_delay_millis:  # None and 0 are both falsy, so the flag is omitted for them
+        server_args.extend(['--testing-sampler-delay-millis', context.testing_sampler_delay_millis])  # NOTE(review): the value is an int when set via the step — confirm server_args is stringified before the process is spawned
     if context.disable_ctx_shift:
         server_args.extend(['--no-context-shift'])