Add "/chat/completions" as alias for "/v1/chat/completions"

jorgealias 2024-02-25 22:58:47 -07:00
parent 4c4cb30736
commit 6f318cf76c
3 changed files with 115 additions and 68 deletions


@@ -2927,9 +2927,7 @@ int main(int argc, char **argv)
             res.set_content(models.dump(), "application/json; charset=utf-8");
         });
 
-    // TODO: add mount point without "/v1" prefix -- how?
-    svr.Post("/v1/chat/completions", [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res)
+    const auto chat_completions = [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res)
             {
                 res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
                 if (!validate_api_key(req, res)) {
@@ -3007,7 +3005,10 @@ int main(int argc, char **argv)
                     res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
                 }
-            });
+            };
+
+    svr.Post("/chat/completions", chat_completions);
+    svr.Post("/v1/chat/completions", chat_completions);
 
     svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
             {
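
The server-side change is the standard cpp-httplib aliasing pattern: the handler is hoisted into a named lambda and registered at both mount points, so both URLs hit identical code. A quick client-side check of the alias could look like the sketch below, a minimal illustration in Python assuming aiohttp and a llama.cpp server already listening at localhost:8080 (address and payload are placeholders, not part of this commit):

    # Post the same OAI-style payload to both mount points; both should be
    # accepted once the alias is registered. Host/port are assumptions.
    import asyncio

    import aiohttp

    BASE_URL = 'http://localhost:8080'  # assumed server address
    PAYLOAD = {
        'messages': [{'role': 'user', 'content': 'Say hello.'}],
        'max_tokens': 8,
    }

    async def post_chat(path):
        async with aiohttp.ClientSession() as session:
            async with session.post(f'{BASE_URL}{path}', json=PAYLOAD) as resp:
                return resp.status

    async def main():
        for path in ('/v1/chat/completions', '/chat/completions'):
            print(path, '->', await post_chat(path))

    asyncio.run(main())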


@@ -53,6 +53,28 @@ Feature: Parallel
       | disabled  | 128       |
       | enabled   | 64        |
 
+  Scenario Outline: Multi users OAI completions compatibility no v1
+    Given a system prompt You are a writer.
+    And a model tinyllama-2
+    Given a prompt:
+      """
+      Write a very long book.
+      """
+    And a prompt:
+      """
+      Write another a poem.
+      """
+    And <n_predict> max tokens to predict
+    And streaming is <streaming>
+    Given concurrent OAI completions requests no v1
+    Then the server is busy
+    Then the server is idle
+    Then all prompts are predicted with <n_predict> tokens
+    Examples:
+      | streaming | n_predict |
+      | disabled  | 128       |
+      | enabled   | 64        |
+
   Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
     Given a prompt:
       """


@@ -222,6 +222,7 @@ async def step_oai_chat_completions(context, api_error):
     completion = await oai_chat_completions(context.prompts.pop(),
                                             context.system_prompt,
                                             context.base_url,
+                                            '/v1/chat',
                                             False,
                                             model=context.model if hasattr(context, 'model') else None,
@@ -279,6 +280,28 @@ async def step_oai_chat_completions(context):
                                          # user_prompt is inserted automatically
                                          context.system_prompt,
                                          context.base_url,
+                                         '/v1/chat/completions',
+                                         True,  # async_client
+                                         model=context.model
+                                         if hasattr(context, 'model') else None,
+                                         n_predict=context.n_predict
+                                         if hasattr(context, 'n_predict') else None,
+                                         enable_streaming=context.enable_streaming
+                                         if hasattr(context, 'enable_streaming') else None,
+                                         server_seed=context.server_seed
+                                         if hasattr(context, 'server_seed') else None,
+                                         user_api_key=context.user_api_key
+                                         if hasattr(context, 'user_api_key') else None)
+
+
+@step(u'concurrent OAI completions requests no v1')
+@async_run_until_complete
+async def step_oai_chat_completions(context):
+    await concurrent_completion_requests(context, oai_chat_completions,
+                                         # user_prompt is inserted automatically
+                                         context.system_prompt,
+                                         context.base_url,
+                                         '/chat/completions',
                                          True,  # async_client
                                          model=context.model
                                          if hasattr(context, 'model') else None,
@@ -449,6 +472,7 @@ async def request_completion(prompt,
 async def oai_chat_completions(user_prompt,
                                system_prompt,
                                base_url,
+                               base_path,
                                async_client,
                                debug=False,
                                model=None,
@@ -489,7 +513,7 @@ async def oai_chat_completions(user_prompt,
     origin = 'llama.cpp'
     headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
     async with aiohttp.ClientSession() as session:
-        async with session.post(f'{base_url}/v1/chat/completions',
+        async with session.post(f'{base_url}{base_path}',
                                 json=payload,
                                 headers=headers) as response:
             if enable_streaming:
@@ -531,7 +555,7 @@ async def oai_chat_completions(user_prompt,
     else:
         try:
             openai.api_key = user_api_key
-            openai.api_base = f'{base_url}/v1/chat'
+            openai.api_base = f'{base_url}{base_path}'
             chat_completion = openai.Completion.create(
                 messages=payload['messages'],
                 model=model,
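
Note why the call sites pass different base_path values: the aiohttp branch posts to base_url + base_path verbatim, so it receives the full '/v1/chat/completions' (or '/chat/completions'), while the non-async branch goes through the legacy openai 0.x client, whose Completion.create appends '/completions' to api_base; hence its caller passes only '/v1/chat'. A small sketch of that composition (helper names are illustrative, not from the patch):

    # How base_path composes with base_url in the two branches of
    # oai_chat_completions. Helper names are hypothetical.
    def compose_async_url(base_url, base_path):
        # aiohttp branch: the request hits base_url + base_path verbatim.
        return f'{base_url}{base_path}'

    def compose_sync_url(base_url, base_path):
        # openai 0.x Completion.create posts to api_base + '/completions'.
        return f'{base_url}{base_path}/completions'

    assert compose_async_url('http://localhost:8080', '/chat/completions') \
        == 'http://localhost:8080/chat/completions'
    assert compose_sync_url('http://localhost:8080', '/v1/chat') \
        == 'http://localhost:8080/v1/chat/completions'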