diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index d3e6c5e83..745df6b5c 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2184,6 +2184,7 @@ struct server_context {
 
                     auto res = std::make_unique<server_task_result_metrics>();
                     res->id                  = task.id;
+                    res->slots_data          = slots_data;
                     res->n_idle_slots        = n_idle_slots;
                     res->n_processing_slots  = n_processing_slots;
                     res->n_tasks_deferred    = queue_tasks.queue_tasks_deferred.size();
diff --git a/examples/server/tests/unit/test_basic.py b/examples/server/tests/unit/test_basic.py
index d82d54a5a..b8d68989b 100644
--- a/examples/server/tests/unit/test_basic.py
+++ b/examples/server/tests/unit/test_basic.py
@@ -33,6 +33,17 @@ def test_server_models():
     assert len(res.body["data"]) == 1
     assert res.body["data"][0]["id"] == server.model_alias
 
+
+def test_server_slots():
+    global server
+    server.server_slots = True
+    server.start()
+    res = server.make_request("GET", "/slots")
+    assert res.status_code == 200
+    assert len(res.body) == server.n_slots
+    assert res.body[0]["n_ctx"] > 0
+
+
 def test_load_split_model():
     global server
     server.model_hf_repo = "ggml-org/models"
diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py
index d4ee360f8..6573cc17f 100644
--- a/examples/server/tests/unit/test_chat_completion.py
+++ b/examples/server/tests/unit/test_chat_completion.py
@@ -30,7 +30,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
         ],
     })
     assert res.status_code == 200
-    assert "cmpl" in res.body["id"]
+    assert "cmpl" in res.body["id"] # make sure the completion id has the expected format
     assert res.body["model"] == model if model is not None else server.model_alias
     assert res.body["usage"]["prompt_tokens"] == n_prompt
     assert res.body["usage"]["completion_tokens"] == n_predicted
@@ -66,7 +66,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
         assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
         if last_cmpl_id is None:
             last_cmpl_id = data["id"]
-        assert last_cmpl_id == data["id"]
+        assert last_cmpl_id == data["id"] # make sure the completion id is the same for all events in the stream
         if choice["finish_reason"] in ["stop", "length"]:
             assert data["usage"]["prompt_tokens"] == n_prompt
             assert data["usage"]["completion_tokens"] == n_predicted
diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py
index e17a05ff6..c352943f2 100644
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@@ -64,6 +64,7 @@ class ServerProcess:
     server_embeddings: bool | None = False
     server_reranking: bool | None = False
     server_metrics: bool | None = False
+    server_slots: bool | None = False
     draft: int | None = None
     api_key: str | None = None
     response_format: str | None = None
@@ -129,6 +130,8 @@ class ServerProcess:
             server_args.append("--reranking")
         if self.server_metrics:
             server_args.append("--metrics")
+        if self.server_slots:
+            server_args.append("--slots")
         if self.model_alias:
             server_args.extend(["--alias", self.model_alias])
         if self.n_ctx:
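
For context, a minimal standalone sketch of what the new `test_server_slots` test exercises: with the server started with `--slots`, the `/slots` endpoint returns a JSON array with one entry per slot, each carrying an `n_ctx` field. The host and port below are placeholders, not part of the patch.

```python
import json
import urllib.request

# Hypothetical standalone check mirroring test_server_slots above.
# Assumes a llama-server instance is already running with --slots enabled;
# the host/port are placeholders.
with urllib.request.urlopen("http://127.0.0.1:8080/slots") as resp:
    slots = json.load(resp)

# One object per slot; each reports its context size, as the test asserts.
for i, slot in enumerate(slots):
    print(f"slot {i}: n_ctx={slot['n_ctx']}")
```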