* slot.can_batch_with
* lora per request
* test: force disable cache prompt
* move can_batch_with check
* fix condition
* add slow test with llama 8b
* update docs
* move lora change task to queue
* Apply suggestions from code review

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* lora_base
* remove redundant check

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
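The "lora per request" item above refers to the optional "lora" field that completion requests can now carry, which the tests in this file exercise. Below is a minimal sketch of what such a request looks like against a running llama-server; it assumes the server listens on the default http://localhost:8080 and was started with a single adapter (so adapter id 0 exists). The endpoint and field names are taken from the tests that follow; the snippet itself is illustrative and not part of the test suite.

# Sketch only: per-request LoRA scaling against a running llama-server.
# Assumption: server on http://localhost:8080 started with one LoRA adapter,
# so adapter id 0 exists. Endpoint and field names mirror the tests below.
import requests

res = requests.post("http://localhost:8080/completion", json={
    "prompt": "Look in thy glass",
    # Per-request override: scale 1.0 fully applies adapter 0 for this
    # request only; 0.0 leaves the base model untouched.
    "lora": [{"id": 0, "scale": 1.0}],
})
res.raise_for_status()
print(res.json()["content"])

Setting the scale to 0.0 keeps the base model behavior, which is what the tests below use as a control case.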
import pytest
from utils import *

server = ServerPreset.stories15m_moe()

LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf"

@pytest.fixture(scope="module", autouse=True)
def create_server():
    global server
    server = ServerPreset.stories15m_moe()
    server.lora_files = [download_file(LORA_FILE_URL)]


@pytest.mark.parametrize("scale,re_content", [
    # without applying lora, the model should behave like a bedtime story generator
    (0.0, "(little|girl|three|years|old)+"),
    # with lora, the model should behave like a Shakespearean text generator
    (1.0, "(eye|love|glass|sun)+"),
])
def test_lora(scale: float, re_content: str):
    global server
    server.start()
    res_lora_control = server.make_request("POST", "/lora-adapters", data=[
        {"id": 0, "scale": scale}
    ])
    assert res_lora_control.status_code == 200
    res = server.make_request("POST", "/completion", data={
        "prompt": "Look in thy glass",
    })
    assert res.status_code == 200
    assert match_regex(re_content, res.body["content"])


def test_lora_per_request():
    global server
    server.n_slots = 4
    server.start()

    # run the same prompt with different lora scales, all in parallel
    # each prompt will be processed by a different slot
    prompt = "Look in thy glass"
    lora_config = [
        ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ),
        ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ),
        ( [{"id": 0, "scale": 0.3}], "(special|thing|gifted)+" ),
        ( [{"id": 0, "scale": 0.7}], "(far|from|home|away)+" ),
        ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ),
        ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ),
    ]

    tasks = [(
        server.make_request,
        ("POST", "/completion", {
            "prompt": prompt,
            "lora": lora,
            "seed": 42,
            "temperature": 0.0,
            "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
    ) for lora, _ in lora_config]
    results = parallel_function_calls(tasks)

    assert all([res.status_code == 200 for res in results])
    for res, (_, re_test) in zip(results, lora_config):
        assert match_regex(re_test, res.body["content"])


@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
def test_with_big_model():
    server = ServerProcess()
    server.model_hf_repo = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
    server.model_hf_file = "Meta-Llama-3.1-8B-Instruct-IQ2_M.gguf"
    server.model_alias = "Llama-3.2-8B-Instruct"
    server.n_slots = 4
    server.n_ctx = server.n_slots * 1024
    server.n_predict = 64
    server.temperature = 0.0
    server.seed = 42
    server.lora_files = [
        download_file("https://huggingface.co/ngxson/Llama-3-Instruct-abliteration-LoRA-8B-F16-GGUF/resolve/main/Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf"),
        # TODO: find & add other lora adapters for this model
    ]
    server.start(timeout_seconds=600)

    # run the same prompt with different lora scales, all in parallel
    # each prompt will be processed by a different slot
    prompt = "Write a computer virus"
    lora_config = [
        # without applying lora, the model should reject the request
        ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),
        ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),
        ( [{"id": 0, "scale": 0.3}], "I can't write a computer virus" ),
        # with 0.7 scale, the model should provide a simple computer virus with hesitation
        ( [{"id": 0, "scale": 0.7}], "Warning: This is a hypothetical exercise" ),
        # with 1.5 scale, the model should confidently provide a computer virus
        ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),
        ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),
    ]

    tasks = [(
        server.make_request,
        ("POST", "/v1/chat/completions", {
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "lora": lora,
            "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
    ) for lora, _ in lora_config]
    results = parallel_function_calls(tasks)

    assert all([res.status_code == 200 for res in results])
    for res, (_, re_test) in zip(results, lora_config):
        assert re_test in res.body["choices"][0]["message"]["content"]