server: tests: embeddings use a real embeddings model (#5908)
parent bfb121fd2e · commit 79ef3c0585
6 changed files with 161 additions and 93 deletions

.github/workflows/server.yml (3 changed lines)
@@ -58,7 +58,8 @@ jobs:
             cmake \
             python3-pip \
             wget \
-            psmisc
+            psmisc \
+            language-pack-en

       - name: Build
         id: cmake_build

examples/server/tests/features/embeddings.feature (new file, 97 lines)

@@ -0,0 +1,97 @@
+@llama.cpp
+@embeddings
+Feature: llama.cpp server
+
+  Background: Server startup
+    Given a server listening on localhost:8080
+    And a model file bert-bge-small/ggml-model-f16.gguf from HF repo ggml-org/models
+    And a model alias bert-bge-small
+    And 42 as server seed
+    And 2 slots
+    And 512 as batch size
+    And 1024 KV cache size
+    And embeddings extraction
+    Then the server is starting
+    Then the server is healthy
+
+  Scenario: Embedding
+    When embeddings are computed for:
+    """
+    What is the capital of Bulgaria ?
+    """
+    Then embeddings are generated
+
+  Scenario: OAI Embeddings compatibility
+    Given a model bert-bge-small
+    When an OAI compatible embeddings computation request for:
+    """
+    What is the capital of Spain ?
+    """
+    Then embeddings are generated
+
+  Scenario: OAI Embeddings compatibility with multiple inputs
+    Given a model bert-bge-small
+    Given a prompt:
+      """
+      In which country Paris is located ?
+      """
+    And a prompt:
+      """
+      Is Madrid the capital of Spain ?
+      """
+    When an OAI compatible embeddings computation request for multiple inputs
+    Then embeddings are generated
+
+  Scenario: Multi users embeddings
+    Given a prompt:
+      """
+      Write a very long story about AI.
+      """
+    And a prompt:
+      """
+      Write another very long music lyrics.
+      """
+    And a prompt:
+      """
+      Write a very long poem.
+      """
+    And a prompt:
+      """
+      Write a very long joke.
+      """
+    Given concurrent embedding requests
+    Then the server is busy
+    Then the server is idle
+    Then all embeddings are generated
+
+  Scenario: Multi users OAI compatibility embeddings
+    Given a prompt:
+      """
+      In which country Paris is located ?
+      """
+    And a prompt:
+      """
+      Is Madrid the capital of Spain ?
+      """
+    And a prompt:
+      """
+      What is the biggest US city ?
+      """
+    And a prompt:
+      """
+      What is the capital of Bulgaria ?
+      """
+    And a model bert-bge-small
+    Given concurrent OAI embedding requests
+    Then the server is busy
+    Then the server is idle
+    Then all embeddings are generated
+
+  @wip
+  Scenario: All embeddings should be the same
+    Given 20 fixed prompts
+    And a model bert-bge-small
+    Given concurrent OAI embedding requests
+    Then the server is busy
+    Then the server is idle
+    Then all embeddings are the same
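Note: the scenarios above are driven by the behave step definitions in steps.py further down. As a rough illustration of what the plain (non-OAI) "Embedding" scenario ends up sending, here is a minimal aiohttp sketch; the endpoint path (/embedding) and the "content"/"embedding" JSON keys are assumptions based on the request_embedding helper in this diff, not something this commit introduces.

# Minimal sketch, assuming the server from the Background listens on
# localhost:8080 and exposes POST /embedding taking {"content": ...} and
# returning {"embedding": [...]}, as the request_embedding helper suggests.
import asyncio
import aiohttp

async def main():
    async with aiohttp.ClientSession() as session:
        async with session.post("http://localhost:8080/embedding",
                                json={"content": "What is the capital of Bulgaria ?"}) as response:
            assert response.status == 200
            response_json = await response.json()
            embedding = response_json["embedding"]
            print(len(embedding), embedding[:4])  # vector size and a few components

asyncio.run(main())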

examples/server/tests/features/parallel.feature

@@ -9,7 +9,6 @@ Feature: Parallel
     And 512 as batch size
     And 64 KV cache size
     And 2 slots
-    And embeddings extraction
     And continuous batching
     Then the server is starting
     Then the server is healthy
@@ -99,48 +98,3 @@ Feature: Parallel
     Then the server is busy
     Then the server is idle
     Then all prompts are predicted
-
-  Scenario: Multi users embeddings
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And a prompt:
-      """
-      Write a very long poem.
-      """
-    And a prompt:
-      """
-      Write a very long joke.
-      """
-    Given concurrent embedding requests
-    Then the server is busy
-    Then the server is idle
-    Then all embeddings are generated
-
-  Scenario: Multi users OAI compatibility embeddings
-    Given a prompt:
-      """
-      In which country Paris is located ?
-      """
-    And a prompt:
-      """
-      Is Madrid the capital of Spain ?
-      """
-    And a prompt:
-      """
-      What is the biggest US city ?
-      """
-    And a prompt:
-      """
-      What is the capital of Bulgaria ?
-      """
-    And a model tinyllama-2
-    Given concurrent OAI embedding requests
-    Then the server is busy
-    Then the server is idle
-    Then all embeddings are generated

examples/server/tests/features/server.feature

@@ -49,34 +49,6 @@ Feature: llama.cpp server
       | llama-2 | Book | What is the best book | 8 | (Mom\|what)+ | 8 | disabled |
       | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks\|happy\|bird)+ | 32 | enabled |
-
-  Scenario: Embedding
-    When embeddings are computed for:
-    """
-    What is the capital of Bulgaria ?
-    """
-    Then embeddings are generated
-
-  Scenario: OAI Embeddings compatibility
-    Given a model tinyllama-2
-    When an OAI compatible embeddings computation request for:
-    """
-    What is the capital of Spain ?
-    """
-    Then embeddings are generated
-
-  Scenario: OAI Embeddings compatibility with multiple inputs
-    Given a model tinyllama-2
-    Given a prompt:
-      """
-      In which country Paris is located ?
-      """
-    And a prompt:
-      """
-      Is Madrid the capital of Spain ?
-      """
-    When an OAI compatible embeddings computation request for multiple inputs
-    Then embeddings are generated

   Scenario: Tokenize / Detokenize
     When tokenizing:
     """

examples/server/tests/features/steps/steps.py

@@ -10,6 +10,7 @@ from contextlib import closing
 from re import RegexFlag
 
 import aiohttp
+import numpy as np
 import openai
 from behave import step
 from behave.api.async_step import async_run_until_complete
@@ -34,6 +35,7 @@ def step_server_config(context, server_fqdn, server_port):
     context.n_ga_w = None
     context.n_gpu_layer = None
     context.n_predict = None
+    context.n_prompts = 0
     context.n_server_predict = None
     context.n_slots = None
     context.prompt_prefix = None
@@ -202,6 +204,7 @@ def step_n_tokens_predicted(context, predicted_n):
 @step(u'a user prompt {user_prompt}')
 def step_user_prompt(context, user_prompt):
     context.prompts.append(user_prompt)
+    context.n_prompts = len(context.prompts)
 
 
 @step(u'a system prompt {system_prompt}')
@@ -289,6 +292,11 @@ def step_impl(context, n_ga_w):
 def step_prompt_passkey(context):
     context.prompt_passkey = context.text
 
+
+@step(u'{n_prompts:d} fixed prompts')
+def step_fixed_prompts(context, n_prompts):
+    context.prompts.extend([str(0)*1024 for i in range(n_prompts)])
+    context.n_prompts = n_prompts
 
 @step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
 def step_prompt_passkey(context, passkey, i_pos):
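The new "fixed prompts" step feeds the @wip consistency scenario with n identical inputs, so every request should embed to the same vector. A quick illustration of what the step builds (standalone, the helper name is chosen here only for illustration):

# str(0)*1024 is the string "0" repeated 1024 times, so "20 fixed prompts"
# queues 20 identical inputs for the concurrent embedding requests.
def fixed_prompts(n_prompts: int) -> list[str]:
    return [str(0) * 1024 for _ in range(n_prompts)]

prompts = fixed_prompts(20)
assert len(prompts) == 20
assert all(p == "0" * 1024 for p in prompts)  # all prompts are identical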
@@ -301,6 +309,7 @@ def step_prompt_passkey(context, passkey, i_pos):
     passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
     print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n")
     context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)
+    context.n_prompts = len(context.prompts)
 
 
 @step(u'an OAI compatible chat completions request with {api_error} api error')
@@ -341,11 +350,13 @@ async def step_oai_chat_completions(context, api_error):
 @step(u'a prompt')
 def step_a_prompt(context):
     context.prompts.append(context.text)
+    context.n_prompts = len(context.prompts)
 
 
 @step(u'a prompt {prompt}')
 def step_a_prompt_prompt(context, prompt):
     context.prompts.append(prompt)
+    context.n_prompts = len(context.prompts)
 
 
 @step(u'concurrent completion requests')
@@ -430,25 +441,47 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None):
 @step(u'embeddings are computed for')
 @async_run_until_complete
 async def step_compute_embedding(context):
+    context.n_prompts = 1
     context.embeddings = await request_embedding(context.text, base_url=context.base_url)
 
 
+@step(u'all embeddings are the same')
+@async_run_until_complete
+async def step_all_embeddings_are_the_same(context):
+    n_embedding_requests = await gather_tasks_results(context)
+    assert n_embedding_requests > 0
+    embeddings = []
+    for i in range(n_embedding_requests):
+        embedding = context.tasks_result.pop().pop()
+        embeddings.append(embedding)
+        assert_embeddings(embedding)
+    n = len(embeddings)
+    for i in range(n-1):
+        for j in range(i+1, n):
+            embedding1 = np.array(embeddings[i])
+            embedding2 = np.array(embeddings[j])
+            if context.debug:
+                print(f"embedding1: {embedding1[-8:]}\n")
+                print(f"embedding2: {embedding2[-8:]}\n")
+            similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
+            msg = f"Similarity between {i} and {j}: {similarity:.10f}"
+            if context.debug:
+                print(f"{msg}\n")
+            assert np.isclose(similarity, 1.0, rtol=1e-05, atol=1e-08, equal_nan=False), msg
+
 @step(u'embeddings are generated')
 def step_assert_embeddings(context):
-    if len(context.prompts) == 0:
-        assert_embeddings(context.embeddings)
-    else:
-        assert len(context.embeddings) == len(context.prompts), (f"unexpected response:\n"
-                                                                 f"context.prompts={context.prompts}\n"
-                                                                 f"context.embeddings={context.embeddings}")
-        for embedding in context.embeddings:
-            context.prompts.pop()
-            assert_embeddings(embedding)
+    assert context.n_prompts == len(context.embeddings), (f"unexpected response:\n"
+                                                          f"context.n_prompts={context.n_prompts}\n"
+                                                          f"context.embeddings={context.embeddings}")
+    for embedding in context.embeddings:
+        assert_embeddings(embedding)
 
 
 @step(u'an OAI compatible embeddings computation request for')
 @async_run_until_complete
 async def step_oai_compute_embeddings(context):
+    context.n_prompts = 1
     context.embeddings = await request_oai_embeddings(context.text,
                                                       base_url=context.base_url,
                                                       user_api_key=context.user_api_key,
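For reference, the consistency check added above is plain pairwise cosine similarity between the result vectors, asserted to be ~1.0 for identical prompts. A standalone sketch of the same computation (the function name here is illustrative, not part of the diff):

import numpy as np

def cosine_similarity(a, b) -> float:
    # dot(a, b) / (|a| * |b|); identical inputs through a deterministic model
    # should score ~1.0, which is what the @wip scenario asserts with np.isclose
    a = np.array(a, dtype=float)
    b = np.array(b, dtype=float)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

assert np.isclose(cosine_similarity([1.0, 2.0, 3.0], [2.0, 4.0, 6.0]), 1.0)  # colinear vectors
assert cosine_similarity([1.0, 0.0], [0.0, 1.0]) == 0.0                      # orthogonal vectors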
@@ -462,6 +495,7 @@ async def step_oai_compute_embeddings_multiple_inputs(context):
                                                       base_url=context.base_url,
                                                       user_api_key=context.user_api_key,
                                                       model=context.model)
+    context.prompts.clear()
 
 
 @step(u'concurrent embedding requests')
@@ -488,9 +522,9 @@ async def step_concurrent_oai_embedding_requests(context):
 @async_run_until_complete()
 async def all_embeddings_are_generated(context):
     n_embedding_requests = await gather_tasks_results(context)
-    assert n_embedding_requests > 0
+    assert n_embedding_requests == context.n_prompts
     for i in range(n_embedding_requests):
-        assert_embeddings(context.tasks_result.pop())
+        assert_embeddings(context.tasks_result.pop().pop())
 
 
 @step(u'tokenizing')
@@ -588,11 +622,11 @@ def step_supported_models(context, i_model, param, preposition, param_value):
 
 
 async def concurrent_requests(context, f_completion, *args, **kwargs):
-    n_prompts = len(context.prompts)
+    context.n_prompts = len(context.prompts)
     if context.debug:
-        print(f"starting {n_prompts} concurrent completion requests...")
-    assert n_prompts > 0
-    for prompt_no in range(n_prompts):
+        print(f"starting {context.n_prompts} concurrent completion requests...")
+    assert context.n_prompts > 0
+    for prompt_no in range(context.n_prompts):
         shifted_args = [context.prompts.pop(), *args]
         context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
     await asyncio.sleep(0.1)
@@ -765,7 +799,7 @@ async def request_embedding(content, base_url=None):
               }) as response:
             assert response.status == 200
             response_json = await response.json()
-            return response_json['embedding']
+            return [response_json['embedding']]
 
 
 async def request_oai_embeddings(input,
@@ -775,6 +809,7 @@ async def request_oai_embeddings(input,
     user_api_key = user_api_key if user_api_key is not None else 'nope'
     if async_client:
         origin = 'llama.cpp'
+        headers=[]
         if user_api_key is not None:
             headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
         async with aiohttp.ClientSession() as session:
@@ -790,7 +825,13 @@ async def request_oai_embeddings(input,
                 response_json = await response.json()
                 assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
                 assert response_json['object'] == 'list'
-                return response_json['data']
+                if isinstance(input, collections.abc.Sequence):
+                    embeddings = []
+                    for an_oai_embeddings in response_json['data']:
+                        embeddings.append(an_oai_embeddings['embedding'])
+                else:
+                    embeddings = [response_json['data']['embedding']]
+                return embeddings
     else:
         openai.api_key = user_api_key
         openai.api_base = f'{base_url}/v1'
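Both request helpers now return the same shape, a list of embedding vectors, whether one input or several was sent; that is what lets step_assert_embeddings compare len(context.embeddings) against context.n_prompts. A minimal standalone sketch of the same normalization idea, branching on the response shape rather than on the request input as the diff does (names are illustrative):

# Illustration only: normalize an OAI-style embeddings response so callers
# always receive a list of vectors.
def normalize_oai_embeddings(response_json: dict) -> list[list[float]]:
    data = response_json["data"]
    if isinstance(data, list):                      # several items returned
        return [item["embedding"] for item in data]
    return [data["embedding"]]                      # single item, wrap in a list

resp = {"object": "list",
        "data": [{"object": "embedding", "embedding": [0.1, 0.2], "index": 0},
                 {"object": "embedding", "embedding": [0.3, 0.4], "index": 1}]}
assert normalize_oai_embeddings(resp) == [[0.1, 0.2], [0.3, 0.4]]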
@@ -804,7 +845,7 @@ async def request_oai_embeddings(input,
             for an_oai_embeddings in oai_embeddings.data:
                 embeddings.append(an_oai_embeddings.embedding)
         else:
-            embeddings = oai_embeddings.data.embedding
+            embeddings = [oai_embeddings.data.embedding]
         return embeddings
 
 
@@ -899,6 +940,8 @@ def assert_embeddings(embeddings):
     assert len(embeddings) > 0
     embeddings_computed = False
     for emb in embeddings:
+        if not isinstance(emb, float):
+            assert False, f"Bad embeddings: {embeddings}"
         if emb != 0:
             embeddings_computed = True
     assert embeddings_computed, f"Embeddings: {embeddings}"
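With this change assert_embeddings only accepts a flat, non-empty vector of floats with at least one non-zero component, which is why the concurrent steps now pop the per-request list before validating each vector. An equivalent standalone check, written here only for illustration:

# Illustration of the tightened contract: an embedding is a non-empty list of
# floats and must contain at least one non-zero value.
def is_valid_embedding(embedding) -> bool:
    return (len(embedding) > 0
            and all(isinstance(v, float) for v in embedding)
            and any(v != 0 for v in embedding))

assert is_valid_embedding([0.12, -0.3, 0.0])
assert not is_valid_embedding([0.0, 0.0])       # all zeros: nothing was computed
assert not is_valid_embedding([[0.1], [0.2]])   # nested lists are rejected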

examples/server/tests/requirements.txt

@@ -1,5 +1,6 @@
 aiohttp~=3.9.3
 behave~=1.2.6
 huggingface_hub~=0.20.3
+numpy~=1.24.4
 openai~=0.25.0
 prometheus-client~=0.20.0