server: concurrency fix + monitoring - add /metrics prometheus compatible endpoint (#5708)

* server: monitoring - add /metrics prometheus compatible endpoint

* server: fix concurrency issue where, with two tasks waiting for results, only one calling thread is notified (see the sketch below)

* server: metrics - move to a dedicated struct
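
For context, the concurrency bullet describes a lost-wakeup bug: when two tasks block on the same condition variable and a result arrives, waking a single thread can leave the other waiter blocked forever. The sketch below illustrates the pattern in Python; it is illustrative only (the actual fix is in the C++ server code, and the names here are made up):

import threading

results = {}
cond = threading.Condition()

def wait_for_result(task_id):
    # Each waiter blocks until its own result is published.
    with cond:
        cond.wait_for(lambda: task_id in results)
        return results.pop(task_id)

def publish_result(task_id, value):
    with cond:
        results[task_id] = value
        # notify() wakes only one waiter; if it is the wrong one, it goes
        # back to waiting and the right one is never woken. notify_all()
        # wakes every waiter and each re-checks its own predicate.
        cond.notify_all()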
Pierrick Hymbert 2024-02-25 13:49:43 +01:00 committed by GitHub
parent 1289408817
commit d52d7819b8
7 changed files with 191 additions and 8 deletions

examples/server/tests/features/environment.py

@@ -16,6 +16,8 @@ def before_scenario(context, scenario):
def after_scenario(context, scenario):
    if context.server_process is None:
        return
    if scenario.status == "failed":
        if 'GITHUB_ACTIONS' in os.environ:
            print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")

examples/server/tests/features/server.feature

@@ -13,6 +13,7 @@ Feature: llama.cpp server
    And 1 slots
    And embeddings extraction
    And 32 server max tokens to predict
    And prometheus compatible metrics exposed
    Then the server is starting
    Then the server is healthy
@@ -25,6 +26,7 @@ Feature: llama.cpp server
    And <n_predict> max tokens to predict
    And a completion request with no api error
    Then <n_predicted> tokens are predicted matching <re_content>
    And prometheus metrics are exposed

    Examples: Prompts
      | prompt | n_predict | re_content | n_predicted |

examples/server/tests/features/steps/steps.py

@@ -13,6 +13,7 @@ import aiohttp
import openai
from behave import step
from behave.api.async_step import async_run_until_complete
from prometheus_client import parser


@step(u"a server listening on {server_fqdn}:{server_port}")
@@ -34,6 +35,8 @@ def step_server_config(context, server_fqdn, server_port):
    context.server_api_key = None
    context.server_continuous_batching = False
    context.server_embeddings = False
    context.server_metrics = False
    context.server_process = None
    context.server_seed = None
    context.user_api_key = None
@@ -82,6 +85,11 @@ def step_server_embeddings(context):
    context.server_embeddings = True


@step(u'prometheus compatible metrics exposed')
def step_server_metrics(context):
    context.server_metrics = True


@step(u"the server is starting")
def step_start_server(context):
    start_server_background(context)
@@ -424,6 +432,23 @@ def step_check_options_header_value(context, cors_header, cors_header_value):
    assert context.options_response.headers[cors_header] == cors_header_value


@step(u'prometheus metrics are exposed')
@async_run_until_complete
async def step_prometheus_metrics_exported(context):
    async with aiohttp.ClientSession() as session:
        async with await session.get(f'{context.base_url}/metrics') as metrics_response:
            assert metrics_response.status == 200
            assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
            metrics_raw = await metrics_response.text()
            metric_exported = False
            for metric in parser.text_string_to_metric_families(metrics_raw):
                match metric.name:
                    case "llamacpp:kv_cache_usage_ratio":
                        assert len(metric.samples) > 0
                        metric_exported = True
            assert metric_exported, "No metrics exported"


async def concurrent_requests(context, f_completion, *args, **kwargs):
    n_prompts = len(context.prompts)
    if context.debug:
@@ -753,6 +778,8 @@ def start_server_background(context):
        server_args.append('--cont-batching')
    if context.server_embeddings:
        server_args.append('--embedding')
    if context.server_metrics:
        server_args.append('--metrics')
    if context.model_alias is not None:
        server_args.extend(['--alias', context.model_alias])
    if context.n_ctx is not None:
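
As a usage note, the following standalone sketch (not part of the commit) queries the new endpoint the same way the behave step above does; the base URL and port are assumptions:

import asyncio

import aiohttp
from prometheus_client import parser

async def dump_metrics(base_url='http://localhost:8080'):
    # Requires a server started with --metrics.
    async with aiohttp.ClientSession() as session:
        async with session.get(f'{base_url}/metrics') as response:
            assert response.status == 200
            # Prometheus text exposition format, as asserted in the test above.
            assert response.headers['Content-Type'] == "text/plain; version=0.0.4"
            raw = await response.text()
    for family in parser.text_string_to_metric_families(raw):
        # e.g. llamacpp:kv_cache_usage_ratio, per the scenario assertion.
        print(family.name, [sample.value for sample in family.samples])

if __name__ == '__main__':
    asyncio.run(dump_metrics())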

examples/server/tests/requirements.txt

@@ -1,3 +1,4 @@
aiohttp~=3.9.3
behave~=1.2.6
openai~=0.25.0
prometheus-client~=0.20.0
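
For reference, the text exposition format (version 0.0.4) that the test parses looks roughly like the excerpt below. Only the metric name llamacpp:kv_cache_usage_ratio is confirmed by the test; the HELP text and value are hypothetical:

# HELP llamacpp:kv_cache_usage_ratio KV-cache usage ratio
# TYPE llamacpp:kv_cache_usage_ratio gauge
llamacpp:kv_cache_usage_ratio 0.42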