diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md
index 3cdcc5ca3..569e675b7 100644
--- a/examples/server/tests/README.md
+++ b/examples/server/tests/README.md
@@ -1,6 +1,6 @@
 # Server Integration Test
 
-Functional server tests suite.
+Server test scenarios using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development) with [behave](https://behave.readthedocs.io/en/latest/).
 
 ### Install dependencies
 `pip install -r requirements.txt`
@@ -9,3 +9,8 @@ Functional server tests suite.
 1. Build the server
 2. download a GGUF model: `./scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf`
 3. Start the test: `./tests.sh stories260K.gguf -ngl 23`
+
+### Skipped scenarios
+
+Scenarios must be annotated with `@llama.cpp` to be included in the test scope.
+The `@bug` annotation links a scenario to a GitHub issue.
\ No newline at end of file
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index df376b0f2..5f6b161c8 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -98,3 +98,21 @@ Feature: llama.cpp server
     And all slots are idle
     Then all prompts are predicted
+
+  @llama.cpp
+  Scenario: Embedding
+    When embeddings are computed for:
+      """
+      What is the capital of France ?
+      """
+    Then embeddings are generated
+
+
+  @llama.cpp
+  Scenario: OAI Embeddings compatibility
+    Given a model tinyllama-2
+    When an OAI compatible embeddings computation request for:
+      """
+      What is the capital of Spain ?
+      """
+    Then embeddings are generated
\ No newline at end of file
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 75e893afa..140e02626 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -20,7 +20,6 @@ def step_server_config(context, server_fqdn, server_port, n_slots, seed):
     context.prompts = []
 
     openai.api_key = 'llama.cpp'
-    openai.api_base = f'{context.base_url}/v1/chat'
 
 
 @step(u"the server is {expecting_status}")
@@ -141,6 +140,30 @@ def step_all_prompts_are_predicted(context):
         assert_n_tokens_predicted(completion)
 
 
+@step(u'embeddings are computed for')
+def step_compute_embedding(context):
+    response = requests.post(f'{context.base_url}/embedding', json={
+        "content": context.text,
+    })
+    assert response.status_code == 200
+    context.embeddings = response.json()['embedding']
+
+
+@step(u'embeddings are generated')
+def step_compute_embeddings(context):
+    assert len(context.embeddings) > 0
+
+
+@step(u'an OAI compatible embeddings computation request for')
+def step_oai_compute_embedding(context):
+    openai.api_base = f'{context.base_url}/v1'
+    embeddings = openai.Embedding.create(
+        model=context.model,
+        input=context.text,
+    )
+    context.embeddings = embeddings
+
+
 def concurrent_requests(context, f_completion):
     context.completions.clear()
     context.completion_threads.clear()
@@ -162,6 +185,7 @@ def request_completion(context, prompt, n_predict=None):
 
 
 def oai_chat_completions(context, user_prompt):
+    openai.api_base = f'{context.base_url}/v1/chat'
     chat_completion = openai.Completion.create(
         messages=[
             {
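
For reference, a minimal sketch of the two embedding calls the new steps make, run outside of behave. The server address `http://localhost:8080` is an assumption (not part of the patch), and the OAI call assumes the legacy `openai` Python client (< 1.0) that the test suite imports; the `/embedding` payload and response key mirror `step_compute_embedding`, and the OAI-compatible call mirrors `step_oai_compute_embedding`:

```python
import requests
import openai

BASE_URL = "http://localhost:8080"  # assumption: a llama.cpp server started locally, e.g. via tests.sh

# Native endpoint, as used by step_compute_embedding:
# POST /embedding with {"content": ...} returns {"embedding": [...]}
resp = requests.post(f"{BASE_URL}/embedding",
                     json={"content": "What is the capital of France ?"})
resp.raise_for_status()
print(len(resp.json()["embedding"]))

# OAI-compatible route, as used by step_oai_compute_embedding
# (legacy openai<1.0 client; api_base points at the server's /v1 prefix).
openai.api_key = "llama.cpp"   # placeholder key, same value the test suite sets
openai.api_base = f"{BASE_URL}/v1"
result = openai.Embedding.create(model="tinyllama-2",
                                 input="What is the capital of Spain ?")
# Standard OAI embeddings response shape; the test itself only checks non-emptiness.
print(len(result["data"][0]["embedding"]))
```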