From 09b77b4c9edd1daddb0926b9c2df3aa00b6a79c3 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
Date: Sat, 24 Feb 2024 13:01:48 +0100
Subject: [PATCH] server: #5655 - continue to update other slots on concurrent
 embedding requests.

server: tests: move the multi-user embeddings scenario from issues.feature to parallel.feature now that it is fixed
---
 examples/server/server.cpp                    |  2 +-
 examples/server/tests/features/issues.feature | 34 +------------------
 .../server/tests/features/parallel.feature    | 23 +++++++++++++
 3 files changed, 25 insertions(+), 34 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 9fb436c2a..19a8c1067 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1836,7 +1836,7 @@ struct llama_server_context
                     send_embedding(slot);
                     slot.release();
                     slot.i_batch = -1;
-                    return true;
+                    continue;
                 }
 
                 completion_token_output result;
diff --git a/examples/server/tests/features/issues.feature b/examples/server/tests/features/issues.feature
index 542006d9a..bf5a175a3 100644
--- a/examples/server/tests/features/issues.feature
+++ b/examples/server/tests/features/issues.feature
@@ -1,36 +1,4 @@
 # List of ongoing issues
 @bug
 Feature: Issues
-    # Issue #5655
-  Scenario: Multi users embeddings
-    Given a server listening on localhost:8080
-    And   a model file stories260K.gguf
-    And   a model alias tinyllama-2
-    And   42 as server seed
-    And   64 KV cache size
-    And   2 slots
-    And   continuous batching
-    And   embeddings extraction
-    Then  the server is starting
-    Then  the server is healthy
-
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And a prompt:
-      """
-      Write a very long poem.
-      """
-    And a prompt:
-      """
-      Write a very long joke.
-      """
-    Given concurrent embedding requests
-    Then the server is busy
-    Then the server is idle
-    Then all embeddings are generated
+  # No confirmed issue at the moment
diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature
index 802d624ff..ff1c13571 100644
--- a/examples/server/tests/features/parallel.feature
+++ b/examples/server/tests/features/parallel.feature
@@ -8,6 +8,7 @@ Feature: Parallel
     And   42 as server seed
     And   64 KV cache size
     And   2 slots
+    And   embeddings extraction
     And   continuous batching
     Then  the server is starting
     Then  the server is healthy
@@ -75,3 +76,25 @@ Feature: Parallel
     Then the server is busy
     Then the server is idle
     Then all prompts are predicted
+
+  Scenario: Multi users embeddings
+    Given a prompt:
+      """
+      Write a very long story about AI.
+      """
+    And a prompt:
+      """
+      Write another very long music lyrics.
+      """
+    And a prompt:
+      """
+      Write a very long poem.
+      """
+    And a prompt:
+      """
+      Write a very long joke.
+      """
+    Given concurrent embedding requests
+    Then the server is busy
+    Then the server is idle
+    Then all embeddings are generated