From 09b77b4c9edd1daddb0926b9c2df3aa00b6a79c3 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 24 Feb 2024 13:01:48 +0100 Subject: [PATCH] server: #5655 - continue to update other slots on embedding concurrent request. server: tests: add multi users embeddings as fixed --- examples/server/server.cpp | 2 +- examples/server/tests/features/issues.feature | 34 +------------------ .../server/tests/features/parallel.feature | 23 +++++++++++++ 3 files changed, 25 insertions(+), 34 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9fb436c2a..19a8c1067 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1836,7 +1836,7 @@ struct llama_server_context send_embedding(slot); slot.release(); slot.i_batch = -1; - return true; + continue; } completion_token_output result; diff --git a/examples/server/tests/features/issues.feature b/examples/server/tests/features/issues.feature index 542006d9a..bf5a175a3 100644 --- a/examples/server/tests/features/issues.feature +++ b/examples/server/tests/features/issues.feature @@ -1,36 +1,4 @@ # List of ongoing issues @bug Feature: Issues - # Issue #5655 - Scenario: Multi users embeddings - Given a server listening on localhost:8080 - And a model file stories260K.gguf - And a model alias tinyllama-2 - And 42 as server seed - And 64 KV cache size - And 2 slots - And continuous batching - And embeddings extraction - Then the server is starting - Then the server is healthy - - Given a prompt: - """ - Write a very long story about AI. - """ - And a prompt: - """ - Write another very long music lyrics. - """ - And a prompt: - """ - Write a very long poem. - """ - And a prompt: - """ - Write a very long joke. - """ - Given concurrent embedding requests - Then the server is busy - Then the server is idle - Then all embeddings are generated + # No confirmed issue at the moment diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature index 802d624ff..ff1c13571 100644 --- a/examples/server/tests/features/parallel.feature +++ b/examples/server/tests/features/parallel.feature @@ -8,6 +8,7 @@ Feature: Parallel And 42 as server seed And 64 KV cache size And 2 slots + And embeddings extraction And continuous batching Then the server is starting Then the server is healthy @@ -75,3 +76,25 @@ Feature: Parallel Then the server is busy Then the server is idle Then all prompts are predicted + + Scenario: Multi users embeddings + Given a prompt: + """ + Write a very long story about AI. + """ + And a prompt: + """ + Write another very long music lyrics. + """ + And a prompt: + """ + Write a very long poem. + """ + And a prompt: + """ + Write a very long joke. + """ + Given concurrent embedding requests + Then the server is busy + Then the server is idle + Then all embeddings are generated