common: llama_load_model_from_url using --model-url (#6098)
* common: llama_load_model_from_url with libcurl dependency Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
cd776c37c9
commit
d01b3c4c32
16 changed files with 397 additions and 55 deletions
|
@ -4,7 +4,8 @@ Feature: llama.cpp server
|
|||
|
||||
Background: Server startup
|
||||
Given a server listening on localhost:8080
|
||||
And a model file bert-bge-small/ggml-model-f16.gguf from HF repo ggml-org/models
|
||||
And a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
|
||||
And a model file ggml-model-f16.gguf
|
||||
And a model alias bert-bge-small
|
||||
And 42 as server seed
|
||||
And 2 slots
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
import errno
|
||||
import os
|
||||
import socket
|
||||
import subprocess
|
||||
import time
|
||||
from contextlib import closing
|
||||
import signal
|
||||
import socket
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from contextlib import closing
|
||||
|
||||
import psutil
|
||||
|
||||
|
||||
def before_scenario(context, scenario):
|
||||
|
@ -20,33 +22,40 @@ def before_scenario(context, scenario):
|
|||
|
||||
|
||||
def after_scenario(context, scenario):
|
||||
if context.server_process is None:
|
||||
return
|
||||
if scenario.status == "failed":
|
||||
if 'GITHUB_ACTIONS' in os.environ:
|
||||
print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
|
||||
if os.path.isfile('llama.log'):
|
||||
with closing(open('llama.log', 'r')) as f:
|
||||
for line in f:
|
||||
print(line)
|
||||
if not is_server_listening(context.server_fqdn, context.server_port):
|
||||
print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n")
|
||||
try:
|
||||
if 'server_process' not in context or context.server_process is None:
|
||||
return
|
||||
if scenario.status == "failed":
|
||||
if 'GITHUB_ACTIONS' in os.environ:
|
||||
print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
|
||||
if os.path.isfile('llama.log'):
|
||||
with closing(open('llama.log', 'r')) as f:
|
||||
for line in f:
|
||||
print(line)
|
||||
if not is_server_listening(context.server_fqdn, context.server_port):
|
||||
print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n")
|
||||
|
||||
if not pid_exists(context.server_process.pid):
|
||||
assert False, f"Server not running pid={context.server_process.pid} ..."
|
||||
if not pid_exists(context.server_process.pid):
|
||||
assert False, f"Server not running pid={context.server_process.pid} ..."
|
||||
|
||||
server_graceful_shutdown(context)
|
||||
server_graceful_shutdown(context)
|
||||
|
||||
# Wait few for socket to free up
|
||||
time.sleep(0.05)
|
||||
# Wait few for socket to free up
|
||||
time.sleep(0.05)
|
||||
|
||||
attempts = 0
|
||||
while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port):
|
||||
server_kill(context)
|
||||
time.sleep(0.1)
|
||||
attempts += 1
|
||||
if attempts > 5:
|
||||
server_kill_hard(context)
|
||||
attempts = 0
|
||||
while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port):
|
||||
server_kill(context)
|
||||
time.sleep(0.1)
|
||||
attempts += 1
|
||||
if attempts > 5:
|
||||
server_kill_hard(context)
|
||||
except:
|
||||
exc = sys.exception()
|
||||
print("error in after scenario: \n")
|
||||
print(exc)
|
||||
print("*** print_tb: \n")
|
||||
traceback.print_tb(exc.__traceback__, file=sys.stdout)
|
||||
|
||||
|
||||
def server_graceful_shutdown(context):
|
||||
|
@ -67,11 +76,11 @@ def server_kill_hard(context):
|
|||
path = context.server_path
|
||||
|
||||
print(f"Server dangling exits, hard killing force {pid}={path}...\n")
|
||||
if os.name == 'nt':
|
||||
process = subprocess.check_output(['taskkill', '/F', '/pid', str(pid)]).decode()
|
||||
print(process)
|
||||
else:
|
||||
os.kill(-pid, signal.SIGKILL)
|
||||
try:
|
||||
psutil.Process(pid).kill()
|
||||
except psutil.NoSuchProcess:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def is_server_listening(server_fqdn, server_port):
|
||||
|
@ -84,17 +93,9 @@ def is_server_listening(server_fqdn, server_port):
|
|||
|
||||
|
||||
def pid_exists(pid):
|
||||
"""Check whether pid exists in the current process table."""
|
||||
if pid < 0:
|
||||
try:
|
||||
psutil.Process(pid)
|
||||
except psutil.NoSuchProcess:
|
||||
return False
|
||||
if os.name == 'nt':
|
||||
output = subprocess.check_output(['TASKLIST', '/FI', f'pid eq {pid}']).decode()
|
||||
print(output)
|
||||
return "No tasks are running" not in output
|
||||
else:
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
except OSError as e:
|
||||
return e.errno == errno.EPERM
|
||||
else:
|
||||
return True
|
||||
return True
|
||||
|
||||
|
|
|
@ -4,7 +4,8 @@ Feature: llama.cpp server
|
|||
|
||||
Background: Server startup
|
||||
Given a server listening on localhost:8080
|
||||
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
|
||||
And a model url https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf
|
||||
And a model file stories260K.gguf
|
||||
And a model alias tinyllama-2
|
||||
And 42 as server seed
|
||||
# KV Cache corresponds to the total amount of tokens
|
||||
|
|
|
@ -5,6 +5,8 @@ import os
|
|||
import re
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from contextlib import closing
|
||||
from re import RegexFlag
|
||||
|
@ -32,6 +34,8 @@ def step_server_config(context, server_fqdn, server_port):
|
|||
context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
|
||||
|
||||
context.model_alias = None
|
||||
context.model_file = None
|
||||
context.model_url = None
|
||||
context.n_batch = None
|
||||
context.n_ubatch = None
|
||||
context.n_ctx = None
|
||||
|
@ -65,6 +69,16 @@ def step_download_hf_model(context, hf_file, hf_repo):
|
|||
print(f"model file: {context.model_file}\n")
|
||||
|
||||
|
||||
@step('a model file {model_file}')
|
||||
def step_model_file(context, model_file):
|
||||
context.model_file = model_file
|
||||
|
||||
|
||||
@step('a model url {model_url}')
|
||||
def step_model_url(context, model_url):
|
||||
context.model_url = model_url
|
||||
|
||||
|
||||
@step('a model alias {model_alias}')
|
||||
def step_model_alias(context, model_alias):
|
||||
context.model_alias = model_alias
|
||||
|
@ -141,7 +155,8 @@ def step_start_server(context):
|
|||
async def step_wait_for_the_server_to_be_started(context, expecting_status):
|
||||
match expecting_status:
|
||||
case 'healthy':
|
||||
await wait_for_health_status(context, context.base_url, 200, 'ok')
|
||||
await wait_for_health_status(context, context.base_url, 200, 'ok',
|
||||
timeout=30)
|
||||
|
||||
case 'ready' | 'idle':
|
||||
await wait_for_health_status(context, context.base_url, 200, 'ok',
|
||||
|
@ -1038,8 +1053,11 @@ def start_server_background(context):
|
|||
server_args = [
|
||||
'--host', server_listen_addr,
|
||||
'--port', context.server_port,
|
||||
'--model', context.model_file
|
||||
]
|
||||
if context.model_file:
|
||||
server_args.extend(['--model', context.model_file])
|
||||
if context.model_url:
|
||||
server_args.extend(['--model-url', context.model_url])
|
||||
if context.n_batch:
|
||||
server_args.extend(['--batch-size', context.n_batch])
|
||||
if context.n_ubatch:
|
||||
|
@ -1079,8 +1097,23 @@ def start_server_background(context):
|
|||
|
||||
pkwargs = {
|
||||
'creationflags': flags,
|
||||
'stdout': subprocess.PIPE,
|
||||
'stderr': subprocess.PIPE
|
||||
}
|
||||
context.server_process = subprocess.Popen(
|
||||
[str(arg) for arg in [context.server_path, *server_args]],
|
||||
**pkwargs)
|
||||
|
||||
def log_stdout(process):
|
||||
for line in iter(process.stdout.readline, b''):
|
||||
print(line.decode('utf-8'), end='')
|
||||
thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,))
|
||||
thread_stdout.start()
|
||||
|
||||
def log_stderr(process):
|
||||
for line in iter(process.stderr.readline, b''):
|
||||
print(line.decode('utf-8'), end='', file=sys.stderr)
|
||||
thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,))
|
||||
thread_stderr.start()
|
||||
|
||||
print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue