server : allow using LoRA adapters per-request (#10994)
* slot.can_batch_with * lora per request * test: force disable cache prompt * move can_batch_with check * fix condition * add slow test with llama 8b * update docs * move lora change task to queue * Apply suggestions from code review Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * lora_base * remove redundant check --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
a45433ba20
commit
0da5d86026
8 changed files with 235 additions and 59 deletions
|
@ -10,16 +10,8 @@ MODEL_DRAFT_FILE_URL = "https://huggingface.co/ggml-org/models/resolve/main/tiny
|
|||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.stories15m_moe()
|
||||
# download draft model file if needed
|
||||
file_name = MODEL_DRAFT_FILE_URL.split('/').pop()
|
||||
model_draft_file = f'../../../{file_name}'
|
||||
if not os.path.exists(model_draft_file):
|
||||
print(f"Downloading {MODEL_DRAFT_FILE_URL} to {model_draft_file}")
|
||||
with open(model_draft_file, 'wb') as f:
|
||||
f.write(requests.get(MODEL_DRAFT_FILE_URL).content)
|
||||
print(f"Done downloading draft model file")
|
||||
# set default values
|
||||
server.model_draft = model_draft_file
|
||||
server.model_draft = download_file(MODEL_DRAFT_FILE_URL)
|
||||
server.draft_min = 4
|
||||
server.draft_max = 8
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue