Add "/chat/completions" as alias for "/v1/chat/completions"

jorgealias 2024-02-25 22:58:47 -07:00
parent 4c4cb30736
commit 6f318cf76c
3 changed files with 115 additions and 68 deletions


@@ -2927,9 +2927,7 @@ int main(int argc, char **argv)
             res.set_content(models.dump(), "application/json; charset=utf-8");
         });
 
-    // TODO: add mount point without "/v1" prefix -- how?
-    svr.Post("/v1/chat/completions", [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res)
+    const auto chat_completions = [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res)
             {
                 res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
                 if (!validate_api_key(req, res)) {
@@ -3007,7 +3005,10 @@ int main(int argc, char **argv)
                     res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
                 }
-            });
+            };
+
+    svr.Post("/chat/completions", chat_completions);
+    svr.Post("/v1/chat/completions", chat_completions);
 
     svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
             {
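
The server-side change is the standard cpp-httplib aliasing pattern: the handler is hoisted into a named lambda and registered at both mount points, so both URLs hit identical code. A quick client-side check of the alias could look like the sketch below, a minimal illustration in Python assuming aiohttp and a llama.cpp server already listening at localhost:8080 (address and payload are placeholders, not part of this commit):

    # Post the same OAI-style payload to both mount points; both should be
    # accepted once the alias is registered. Host/port are assumptions.
    import asyncio

    import aiohttp

    BASE_URL = 'http://localhost:8080'  # assumed server address
    PAYLOAD = {
        'messages': [{'role': 'user', 'content': 'Say hello.'}],
        'max_tokens': 8,
    }

    async def post_chat(path):
        async with aiohttp.ClientSession() as session:
            async with session.post(f'{BASE_URL}{path}', json=PAYLOAD) as resp:
                return resp.status

    async def main():
        for path in ('/v1/chat/completions', '/chat/completions'):
            print(path, '->', await post_chat(path))

    asyncio.run(main())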


@@ -53,6 +53,28 @@ Feature: Parallel
       | disabled  | 128       |
       | enabled   | 64        |
 
+  Scenario Outline: Multi users OAI completions compatibility no v1
+    Given a system prompt You are a writer.
+    And a model tinyllama-2
+    Given a prompt:
+      """
+      Write a very long book.
+      """
+    And a prompt:
+      """
+      Write another a poem.
+      """
+    And <n_predict> max tokens to predict
+    And streaming is <streaming>
+    Given concurrent OAI completions requests no v1
+    Then the server is busy
+    Then the server is idle
+    Then all prompts are predicted with <n_predict> tokens
+    Examples:
+      | streaming | n_predict |
+      | disabled  | 128       |
+      | enabled   | 64        |
+
   Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
     Given a prompt:
       """


@@ -222,6 +222,7 @@ async def step_oai_chat_completions(context, api_error):
     completion = await oai_chat_completions(context.prompts.pop(),
                                             context.system_prompt,
                                             context.base_url,
+                                            '/v1/chat',
                                             False,
                                             model=context.model if hasattr(context, 'model') else None,
@@ -279,6 +280,28 @@ async def step_oai_chat_completions(context):
                                          # user_prompt is inserted automatically
                                          context.system_prompt,
                                          context.base_url,
+                                         '/v1/chat/completions',
+                                         True,  # async_client
+                                         model=context.model
+                                         if hasattr(context, 'model') else None,
+                                         n_predict=context.n_predict
+                                         if hasattr(context, 'n_predict') else None,
+                                         enable_streaming=context.enable_streaming
+                                         if hasattr(context, 'enable_streaming') else None,
+                                         server_seed=context.server_seed
+                                         if hasattr(context, 'server_seed') else None,
+                                         user_api_key=context.user_api_key
+                                         if hasattr(context, 'user_api_key') else None)
+
+
+@step(u'concurrent OAI completions requests no v1')
+@async_run_until_complete
+async def step_oai_chat_completions(context):
+    await concurrent_completion_requests(context, oai_chat_completions,
+                                         # user_prompt is inserted automatically
+                                         context.system_prompt,
+                                         context.base_url,
+                                         '/chat/completions',
                                          True,  # async_client
                                          model=context.model
                                          if hasattr(context, 'model') else None,
@@ -449,6 +472,7 @@ async def request_completion(prompt,
 async def oai_chat_completions(user_prompt,
                                system_prompt,
                                base_url,
+                               base_path,
                                async_client,
                                debug=False,
                                model=None,
@@ -489,7 +513,7 @@ async def oai_chat_completions(user_prompt,
     origin = 'llama.cpp'
     headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
     async with aiohttp.ClientSession() as session:
-        async with session.post(f'{base_url}/v1/chat/completions',
+        async with session.post(f'{base_url}{base_path}',
                                 json=payload,
                                 headers=headers) as response:
             if enable_streaming:
@@ -531,7 +555,7 @@ async def oai_chat_completions(user_prompt,
     else:
         try:
             openai.api_key = user_api_key
-            openai.api_base = f'{base_url}/v1/chat'
+            openai.api_base = f'{base_url}{base_path}'
             chat_completion = openai.Completion.create(
                 messages=payload['messages'],
                 model=model,
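
Note why the call sites pass different base_path values: the aiohttp branch posts to base_url + base_path verbatim, so it receives the full '/v1/chat/completions' (or '/chat/completions'), while the non-async branch goes through the legacy openai 0.x client, whose Completion.create appends '/completions' to api_base; hence its caller passes only '/v1/chat'. A small sketch of that composition (helper names are illustrative, not from the patch):

    # How base_path composes with base_url in the two branches of
    # oai_chat_completions. Helper names are hypothetical.
    def compose_async_url(base_url, base_path):
        # aiohttp branch: the request hits base_url + base_path verbatim.
        return f'{base_url}{base_path}'

    def compose_sync_url(base_url, base_path):
        # openai 0.x Completion.create posts to api_base + '/completions'.
        return f'{base_url}{base_path}/completions'

    assert compose_async_url('http://localhost:8080', '/chat/completions') \
        == 'http://localhost:8080/chat/completions'
    assert compose_sync_url('http://localhost:8080', '/v1/chat') \
        == 'http://localhost:8080/v1/chat/completions'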