server: Add "tokens per second" information in the backend (#10548)

* add cmake rvv support
* add timings
* remove space
* update readme
* fix
* fix code
* remove empty line
* add test

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2024-12-02 21:45:54 +08:00 · 2024-12-02 21:45:54 +08:00 · 64ed2091b2
commit 64ed2091b2
parent 991f8aabee
5 changed files with 44 additions and 1 deletion
--- a/examples/server/tests/unit/test_chat_completion.py
+++ b/examples/server/tests/unit/test_chat_completion.py
@@ -146,3 +146,20 @@ def test_invalid_chat_completion_req(messages):
    })
    assert res.status_code == 400 or res.status_code == 500
    assert "error" in res.body
+
+
+def test_chat_completion_with_timings_per_token():  # streamed chat completion: every chunk must carry a "timings" object
+    global server
+    server.start()
+    res = server.make_stream_request("POST", "/chat/completions", data={
+        "max_tokens": 10,  # same limit is used as the upper bound in the last assertion
+        "messages": [{"role": "user", "content": "test"}],
+        "stream": True,
+        "timings_per_token": True,  # the request option this test exercises
+    })
+    for data in res:  # NOTE(review): if `res` yields nothing, no assertion runs and the test still passes
+        assert "timings" in data
+        assert "prompt_per_second" in data["timings"]
+        assert "predicted_per_second" in data["timings"]
+        assert "predicted_n" in data["timings"]
+        assert data["timings"]["predicted_n"] <= 10  # 10 mirrors "max_tokens" in the request above