From 076346db8a85ceabe485a852d577d05bdfb2f308 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 28 Dec 2024 16:16:57 +0100 Subject: [PATCH] fix condition --- examples/server/server.cpp | 23 ++++++++++++++++------- examples/server/tests/README.md | 6 ++++++ examples/server/tests/unit/test_lora.py | 5 ++--- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d6e32084a..a5caf6ac9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2558,12 +2558,22 @@ struct server_context { // start populating the batch for this iteration common_batch_clear(batch); + // track if given slot can be batched with slots already in the batch + server_slot * slot_batched = nullptr; + // first, add sampled tokens from any ongoing sequences for (auto & slot : slots) { if (slot.state != SLOT_STATE_GENERATING) { continue; } + // check if we can batch this slot with the previous one + if (!slot_batched) { + slot_batched = &slot; + } else if (slot_batched && !slot_batched->can_batch_with(slot)) { + continue; + } + slot.i_batch = batch.n_tokens; common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true); @@ -2582,17 +2592,16 @@ struct server_context { int32_t n_batch = llama_n_batch(ctx); int32_t n_ubatch = llama_n_ubatch(ctx); - // track if given slot can be batched with slots already in the batch - server_slot * slot_batched = nullptr; - // next, batch any pending prompts without exceeding n_batch if (params_base.cont_batching || batch.n_tokens == 0) { for (auto & slot : slots) { // check if we can batch this slot with the previous one - if (!slot_batched) { - slot_batched = &slot; - } else if (slot_batched && !slot_batched->can_batch_with(slot)) { - continue; + if (slot.is_processing()) { + if (!slot_batched) { + slot_batched = &slot; + } else if (slot_batched && !slot_batched->can_batch_with(slot)) { + continue; + } } // this slot still has a prompt to be processed diff --git 
a/examples/server/tests/README.md b/examples/server/tests/README.md index fa3d0a2f5..5787276ab 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -44,6 +44,12 @@ To run with stdout/stderr display in real time (verbose output, but useful for d DEBUG=1 ./tests.sh -s -v -x ``` +To run a single test unit: + +```shell +./tests.sh unit/test_{name of test case here}.py -v -x +``` + Hint: You can compile and run test in single command, useful for local developement: ```shell diff --git a/examples/server/tests/unit/test_lora.py b/examples/server/tests/unit/test_lora.py index 0751f156b..68a7be17e 100644 --- a/examples/server/tests/unit/test_lora.py +++ b/examples/server/tests/unit/test_lora.py @@ -52,12 +52,11 @@ def test_lora_per_request(): lora_config = [ ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ), ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ), - ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ), - ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ), + ( [{"id": 0, "scale": 0.3}], "(special|thing|gifted)+" ), + ( [{"id": 0, "scale": 0.7}], "(far|from|home|away)+" ), ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ), ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ), ] - # FIXME: tesing with scale between 0.0 and 1.0 (i.e. 0.2, 0.5, 0.7) produces unreliable results tasks = [( server.make_request,