Merge remote-tracking branch 'origin/master' into jinja

This commit is contained in:
ochafik 2025-01-13 19:56:27 +00:00
commit cb72cf1fc3
215 changed files with 23423 additions and 18704 deletions

View file

@ -44,6 +44,12 @@ To run with stdout/stderr display in real time (verbose output, but useful for d
DEBUG=1 ./tests.sh -s -v -x
```
To run single test unit:
```shell
./tests.sh unit/test_{name of test case here}.py -v -x
```
Hint: You can compile and run test in single command, useful for local developement:
```shell

View file

@ -5,3 +5,4 @@ numpy~=1.26.4
openai~=1.55.3
prometheus-client~=0.20.0
requests~=2.32.3
wget~=3.2

View file

@ -85,7 +85,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
def test_chat_completion_with_openai_library():
global server
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.chat.completions.create(
model="gpt-3.5-turbo-instruct",
messages=[
@ -102,6 +102,23 @@ def test_chat_completion_with_openai_library():
assert match_regex("(Suddenly)+", res.choices[0].message.content)
def test_chat_template():
global server
server.chat_template = "llama3"
server.debug = True # to get the "__verbose" object in the response
server.start()
res = server.make_request("POST", "/chat/completions", data={
"max_tokens": 8,
"messages": [
{"role": "system", "content": "Book"},
{"role": "user", "content": "What is the best book"},
]
})
assert res.status_code == 200
assert "__verbose" in res.body
assert res.body["__verbose"]["prompt"] == "<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
@pytest.mark.parametrize("response_format,n_predicted,re_content", [
({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),
@ -172,7 +189,7 @@ def test_chat_completion_with_timings_per_token():
def test_logprobs():
global server
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.chat.completions.create(
model="gpt-3.5-turbo-instruct",
temperature=0.0,
@ -199,7 +216,7 @@ def test_logprobs():
def test_logprobs_stream():
global server
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.chat.completions.create(
model="gpt-3.5-turbo-instruct",
temperature=0.0,

View file

@ -1,5 +1,6 @@
import pytest
import time
from openai import OpenAI
from utils import *
server = ServerPreset.tinyllama2()
@ -85,6 +86,40 @@ def test_completion_stream_vs_non_stream():
assert content_stream == res_non_stream.body["content"]
def test_completion_stream_with_openai_library():
global server
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.completions.create(
model="davinci-002",
prompt="I believe the meaning of life is",
max_tokens=8,
)
assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
assert res.choices[0].finish_reason == "length"
assert res.choices[0].text is not None
assert match_regex("(going|bed)+", res.choices[0].text)
def test_completion_with_openai_library():
global server
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.completions.create(
model="davinci-002",
prompt="I believe the meaning of life is",
max_tokens=8,
stream=True,
)
output_text = ''
for data in res:
choice = data.choices[0]
if choice.finish_reason is None:
assert choice.text is not None
output_text += choice.text
assert match_regex("(going|bed)+", output_text)
@pytest.mark.parametrize("n_slots", [1, 2])
def test_consistent_result_same_seed(n_slots: int):
global server

View file

@ -18,7 +18,7 @@ def test_infill_without_input_extra():
"input_suffix": "}\n",
})
assert res.status_code == 200
assert match_regex("(Ann|small|shiny)+", res.body["content"])
assert match_regex("(Ann|small|shiny|Daddy)+", res.body["content"])
def test_infill_with_input_extra():

View file

@ -1,5 +1,4 @@
import pytest
import os
from utils import *
server = ServerPreset.stories15m_moe()
@ -10,15 +9,7 @@ LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe
def create_server():
global server
server = ServerPreset.stories15m_moe()
# download lora file if needed
file_name = LORA_FILE_URL.split('/').pop()
lora_file = f'../../../{file_name}'
if not os.path.exists(lora_file):
print(f"Downloading {LORA_FILE_URL} to {lora_file}")
with open(lora_file, 'wb') as f:
f.write(requests.get(LORA_FILE_URL).content)
print(f"Done downloading lora file")
server.lora_files = [lora_file]
server.lora_files = [download_file(LORA_FILE_URL)]
@pytest.mark.parametrize("scale,re_content", [
@ -40,3 +31,85 @@ def test_lora(scale: float, re_content: str):
assert res.status_code == 200
assert match_regex(re_content, res.body["content"])
def test_lora_per_request():
global server
server.n_slots = 4
server.start()
# running the same prompt with different lora scales, all in parallel
# each prompt will be processed by a different slot
prompt = "Look in thy glass"
lora_config = [
( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ),
( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ),
( [{"id": 0, "scale": 0.3}], "(special|thing|gifted)+" ),
( [{"id": 0, "scale": 0.7}], "(far|from|home|away)+" ),
( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ),
( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ),
]
tasks = [(
server.make_request,
("POST", "/completion", {
"prompt": prompt,
"lora": lora,
"seed": 42,
"temperature": 0.0,
"cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
})
) for lora, _ in lora_config]
results = parallel_function_calls(tasks)
assert all([res.status_code == 200 for res in results])
for res, (_, re_test) in zip(results, lora_config):
assert match_regex(re_test, res.body["content"])
@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
def test_with_big_model():
server = ServerProcess()
server.model_hf_repo = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
server.model_hf_file = "Meta-Llama-3.1-8B-Instruct-IQ2_M.gguf"
server.model_alias = "Llama-3.2-8B-Instruct"
server.n_slots = 4
server.n_ctx = server.n_slots * 1024
server.n_predict = 64
server.temperature = 0.0
server.seed = 42
server.lora_files = [
download_file("https://huggingface.co/ngxson/Llama-3-Instruct-abliteration-LoRA-8B-F16-GGUF/resolve/main/Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf"),
# TODO: find & add other lora adapters for this model
]
server.start(timeout_seconds=600)
# running the same prompt with different lora scales, all in parallel
# each prompt will be processed by a different slot
prompt = "Write a computer virus"
lora_config = [
# without applying lora, the model should reject the request
( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),
( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),
( [{"id": 0, "scale": 0.3}], "I can't write a computer virus" ),
# with 0.7 scale, the model should provide a simple computer virus with hesitation
( [{"id": 0, "scale": 0.7}], "Warning: This is a hypothetical exercise" ),
# with 1.5 scale, the model should confidently provide a computer virus
( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),
( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),
]
tasks = [(
server.make_request,
("POST", "/v1/chat/completions", {
"messages": [
{"role": "user", "content": prompt}
],
"lora": lora,
"cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
})
) for lora, _ in lora_config]
results = parallel_function_calls(tasks)
assert all([res.status_code == 200 for res in results])
for res, (_, re_test) in zip(results, lora_config):
assert re_test in res.body["choices"][0]["message"]["content"]

View file

@ -10,16 +10,8 @@ MODEL_DRAFT_FILE_URL = "https://huggingface.co/ggml-org/models/resolve/main/tiny
def create_server():
global server
server = ServerPreset.stories15m_moe()
# download draft model file if needed
file_name = MODEL_DRAFT_FILE_URL.split('/').pop()
model_draft_file = f'../../../{file_name}'
if not os.path.exists(model_draft_file):
print(f"Downloading {MODEL_DRAFT_FILE_URL} to {model_draft_file}")
with open(model_draft_file, 'wb') as f:
f.write(requests.get(MODEL_DRAFT_FILE_URL).content)
print(f"Done downloading draft model file")
# set default values
server.model_draft = model_draft_file
server.model_draft = download_file(MODEL_DRAFT_FILE_URL)
server.draft_min = 4
server.draft_max = 8

View file

@ -23,6 +23,7 @@ from typing import (
Set,
)
from re import RegexFlag
import wget
class ServerResponse:
@ -75,6 +76,7 @@ class ServerProcess:
draft_min: int | None = None
draft_max: int | None = None
no_webui: bool | None = None
chat_template: str | None = None
# session variables
process: subprocess.Popen | None = None
@ -169,6 +171,8 @@ class ServerProcess:
server_args.extend(["--draft-min", self.draft_min])
if self.no_webui:
server_args.append("--no-webui")
if self.chat_template:
server_args.extend(["--chat-template", self.chat_template])
args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}")
@ -383,5 +387,25 @@ def match_regex(regex: str, text: str) -> bool:
is not None
)
def download_file(url: str, output_file_path: str | None = None) -> str:
"""
Download a file from a URL to a local path. If the file already exists, it will not be downloaded again.
output_file_path is the local path to save the downloaded file. If not provided, the file will be saved in the root directory.
Returns the local path of the downloaded file.
"""
file_name = url.split('/').pop()
output_file = f'./tmp/{file_name}' if output_file_path is None else output_file_path
if not os.path.exists(output_file):
print(f"Downloading {url} to {output_file}")
wget.download(url, out=output_file)
print(f"Done downloading to {output_file}")
else:
print(f"File already exists at {output_file}")
return output_file
def is_slow_test_allowed():
return os.environ.get("SLOW_TESTS") == "1" or os.environ.get("SLOW_TESTS") == "ON"