server: init functional test
parent 4ed8e4fbef
commit 157bcf2286
7 changed files with 173 additions and 0 deletions
70  .github/workflows/server-test.yml  vendored  Normal file

@@ -0,0 +1,70 @@
# Server test scenario
name: Server Integration Tests

# FIXME put only necessary triggers
on:
  push:
    branches:
      - master
      - test/server-add-ci-test # FIXME remove
    paths: ['.github/workflows/server-test.yml', '**/CMakeLists.txt', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', 'examples/server/**.*']

jobs:
  ubuntu-latest-cmake:
    runs-on: ubuntu-latest

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential

      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
          cmake .. -DCMAKE_BUILD_TYPE=Release
          cmake --build . --config Release -j $(nproc)

      - name: Tests dependencies
        id: test_dependencies
        run: |
          pip install -r examples/server/tests/requirements.txt

      - name: Download test model
        id: download_model
        run: |
          ./scripts/hf.sh --repo TheBloke/Tinyllama-2-1b-miniguanaco-GGUF --file tinyllama-2-1b-miniguanaco.Q2_K.gguf

      - name: Server Integration Tests
        id: server_integration_test
        run: |
          ./build/bin/server \
            -m tinyllama-2-1b-miniguanaco.Q2_K.gguf \
            --ctx-size 512 \
            --parallel 4 \
            --n-predict 512 \
            --batch-size 128 \
            --threads 4 \
            --threads-batch 128 \
            --alias phi-2 \
            --embedding \
            --cont-batching &
          sh -c '\
            max_attempts=30; \
            attempts=${max_attempts}; \
            echo "waiting for server to be ready..."; \
            until curl --silent --show-error --fail "http://localhost:8080/health" | jq -r '.status' | grep ok; do \
              attempts=$(( attempts - 1)); \
              [ "${attempts}" -eq 0 ] && { echo "Server did not startup" >&2; exit 1; }; \
              sleep $(( (max_attempts - attempts) * 2 )); \
            done;'
          cd examples/server/tests
          behave
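The last step depends on the server's /health endpoint: the shell loop polls it with curl and jq until the reported status is ok, waiting a little longer after each failed attempt. Below is a minimal Python sketch of that same readiness check; the function name and base URL variable are illustrative and not part of this commit.

```python
import time
import requests

def wait_for_server_ready(base_url="http://localhost:8080", max_attempts=30):
    """Poll /health until the server reports {"status": "ok"}."""
    for attempt in range(1, max_attempts + 1):
        try:
            if requests.get(f"{base_url}/health").json().get("status") == "ok":
                return
        except requests.exceptions.ConnectionError:
            pass  # server socket not open yet
        time.sleep(attempt * 2)  # growing delay, like the shell loop above
    raise RuntimeError("Server did not startup")
```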
9  examples/server/tests/README.md  Normal file

@@ -0,0 +1,9 @@
# Server Integration Test

Functional test suite for the server example.

### Install dependencies
`pip install -r requirements.txt`

### Run tests
`python -m behave`
11  examples/server/tests/features/completions.feature  Normal file

@@ -0,0 +1,11 @@
Feature: Completion request

  Scenario Outline: run a completion request
    Given a prompt <prompt>
    When we request a completion
    Then tokens are predicted

    Examples: Prompts
      | prompt                                                          |
      | I believe the meaning of life is                                |
      | Write a detailed analogy between mathematics and a lighthouse.  |
13  examples/server/tests/features/oai.feature  Normal file

@@ -0,0 +1,13 @@
Feature: OpenAI compatible completions request

  Scenario Outline: run a completion on the OAI endpoint
    Given a system prompt <system_prompt>
    And a user prompt <user_prompt>
    And a model <model>
    When we request the oai completions endpoint
    Then the oai response contains completion tokens

    Examples: Prompts
      | model       | system_prompt               | user_prompt                          |
      | tinyllama-2 | You are ChatGPT.            | I believe the meaning of life is     |
      | tinyllama-2 | You are a coding assistant. | Write the fibonacci function in c++  |
24  examples/server/tests/features/steps/completion.py  Normal file

@@ -0,0 +1,24 @@
from behave import *
import requests


@given(u'a prompt {prompt}')
def step_prompt(context, prompt):
    context.prompt = prompt


@when(u'we request a completion')
def step_request_completion(context):
    response = requests.post('http://localhost:8080/completion', json={
        "prompt": context.prompt
    })
    status_code = response.status_code
    assert status_code == 200
    context.response_data = response.json()


@then(u'tokens are predicted')
def step_tokens_predicted(context):
    assert len(context.response_data['content']) > 0
    assert context.response_data['timings']['predicted_n'] > 0
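Outside of behave, the same /completion call the steps above make can be issued directly. A minimal sketch, assuming a server already running on localhost:8080 as in the workflow; the prompt is one of the examples from completions.feature:

```python
import requests

# POST a prompt to the running server and inspect the fields the steps assert on.
response = requests.post('http://localhost:8080/completion',
                         json={"prompt": "I believe the meaning of life is"})
assert response.status_code == 200
data = response.json()
print(data['content'])                 # generated text
print(data['timings']['predicted_n'])  # number of tokens actually predicted
```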
44  examples/server/tests/features/steps/oai.py  Normal file

@@ -0,0 +1,44 @@
from behave import *
import openai

openai.api_key = 'llama.cpp'
openai.api_base = "http://localhost:8080/v1/chat"


@given(u'a user prompt {user_prompt}')
def step_user_prompt(context, user_prompt):
    context.user_prompt = user_prompt


@given(u'a system prompt {system_prompt}')
def step_system_prompt(context, system_prompt):
    context.system_prompt = system_prompt


@given(u'a model {model}')
def step_model(context, model):
    context.model = model


@when(u'we request the oai completions endpoint')
def step_oai_completions(context):
    context.chat_completion = openai.Completion.create(
        messages=[
            {
                "role": "system",
                "content": context.system_prompt,
            },
            {
                "role": "user",
                "content": context.user_prompt,
            }
        ],
        model=context.model,
    )


@then(u'the oai response contains completion tokens')
def step_oai_response_has_completion_tokens(context):
    assert len(context.chat_completion.choices) == 1
    assert len(context.chat_completion.choices[0].message) > 0
    assert context.chat_completion.usage.completion_tokens > 0
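Since api_base ends with /v1/chat, the legacy openai client's Completion.create call above ends up posting to the server's OpenAI-compatible /v1/chat/completions route. A minimal sketch of the equivalent raw request with requests; the payload fields and assertions mirror the feature examples and step definitions above:

```python
import requests

payload = {
    "model": "tinyllama-2",
    "messages": [
        {"role": "system", "content": "You are ChatGPT."},
        {"role": "user", "content": "I believe the meaning of life is"},
    ],
}
r = requests.post("http://localhost:8080/v1/chat/completions", json=payload)
r.raise_for_status()
completion = r.json()
assert len(completion["choices"]) == 1
assert completion["usage"]["completion_tokens"] > 0
print(completion["choices"][0]["message"]["content"])
```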
2  examples/server/tests/requirements.txt  Normal file

@@ -0,0 +1,2 @@
behave~=1.2.6
openai~=0.25.0