server: #5655 - keep updating the other slots after a concurrent embedding request completes.

server: tests: move the multi-user embeddings scenario from issues to parallel now that it is fixed
2024-02-24 13:01:48 +01:00 · 2024-02-24 13:01:48 +01:00 · 09b77b4c9e
commit 09b77b4c9e
parent 525213d2f5
3 changed files with 25 additions and 34 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1836,7 +1836,7 @@ struct llama_server_context
                    send_embedding(slot);
                    slot.release();
                    slot.i_batch = -1;
-                    return true;
+                    continue;
                }

                completion_token_output result;
--- a/examples/server/tests/features/issues.feature
+++ b/examples/server/tests/features/issues.feature
@@ -1,36 +1,4 @@
 # List of ongoing issues
 @bug
 Feature: Issues
-    # Issue #5655
-  Scenario: Multi users embeddings
-    Given a server listening on localhost:8080
-    And   a model file stories260K.gguf
-    And   a model alias tinyllama-2
-    And   42 as server seed
-    And   64 KV cache size
-    And   2 slots
-    And   continuous batching
-    And   embeddings extraction
-    Then  the server is starting
-    Then  the server is healthy
-
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And a prompt:
-      """
-      Write a very long poem.
-      """
-    And a prompt:
-      """
-      Write a very long joke.
-      """
-    Given concurrent embedding requests
-    Then the server is busy
-    Then the server is idle
-    Then all embeddings are generated
+  # No confirmed issue at the moment
--- a/examples/server/tests/features/parallel.feature
+++ b/examples/server/tests/features/parallel.feature
@@ -8,6 +8,7 @@ Feature: Parallel
    And   42 as server seed
    And   64 KV cache size
    And   2 slots
+    And   embeddings extraction
    And   continuous batching
    Then  the server is starting
    Then  the server is healthy
@@ -75,3 +76,25 @@ Feature: Parallel
    Then the server is busy
    Then the server is idle
    Then all prompts are predicted
+
+  Scenario: Multi users embeddings
+    Given a prompt:
+      """
+      Write a very long story about AI.
+      """
+    And a prompt:
+      """
+      Write another very long music lyrics.
+      """
+    And a prompt:
+      """
+      Write a very long poem.
+      """
+    And a prompt:
+      """
+      Write a very long joke.
+      """
+    Given concurrent embedding requests
+    Then the server is busy
+    Then the server is idle
+    Then all embeddings are generated