common: llama_load_model_from_url using --model-url (#6098)

* common: llama_load_model_from_url with libcurl dependency

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-17 19:12:37 +01:00 · 2024-03-17 19:12:37 +01:00 · d01b3c4c32
commit d01b3c4c32
parent cd776c37c9
16 changed files with 397 additions and 55 deletions
--- a/examples/server/tests/features/embeddings.feature
+++ b/examples/server/tests/features/embeddings.feature
@@ -4,7 +4,8 @@ Feature: llama.cpp server

  Background: Server startup
    Given a server listening on localhost:8080
-    And   a model file bert-bge-small/ggml-model-f16.gguf from HF repo ggml-org/models
+    And   a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
+    And   a model file ggml-model-f16.gguf
    And   a model alias bert-bge-small
    And   42 as server seed
    And   2 slots
--- a/examples/server/tests/features/environment.py
+++ b/examples/server/tests/features/environment.py
@@ -1,10 +1,12 @@
-import errno
 import os
-import socket
-import subprocess
-import time
-from contextlib import closing
 import signal
+import socket
+import sys
+import time
+import traceback
+from contextlib import closing
+
+import psutil


 def before_scenario(context, scenario):
@@ -20,33 +22,40 @@ def before_scenario(context, scenario):


 def after_scenario(context, scenario):
-    if context.server_process is None:
-        return
-    if scenario.status == "failed":
-        if 'GITHUB_ACTIONS' in os.environ:
-            print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
-            if os.path.isfile('llama.log'):
-                with closing(open('llama.log', 'r')) as f:
-                    for line in f:
-                        print(line)
-        if not is_server_listening(context.server_fqdn, context.server_port):
-            print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n")
+    try:
+        if 'server_process' not in context or context.server_process is None:
+            return
+        if scenario.status == "failed":
+            if 'GITHUB_ACTIONS' in os.environ:
+                print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
+                if os.path.isfile('llama.log'):
+                    with closing(open('llama.log', 'r')) as f:
+                        for line in f:
+                            print(line)
+            if not is_server_listening(context.server_fqdn, context.server_port):
+                print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n")

-    if not pid_exists(context.server_process.pid):
-        assert False, f"Server not running pid={context.server_process.pid} ..."
+        if not pid_exists(context.server_process.pid):
+            assert False, f"Server not running pid={context.server_process.pid} ..."

-    server_graceful_shutdown(context)
+        server_graceful_shutdown(context)

-    # Wait few for socket to free up
-    time.sleep(0.05)
+        # Wait few for socket to free up
+        time.sleep(0.05)

-    attempts = 0
-    while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port):
-        server_kill(context)
-        time.sleep(0.1)
-        attempts += 1
-        if attempts > 5:
-            server_kill_hard(context)
+        attempts = 0
+        while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port):
+            server_kill(context)
+            time.sleep(0.1)
+            attempts += 1
+            if attempts > 5:
+                server_kill_hard(context)
+    except:
+        exc = sys.exception()
+        print("error in after scenario: \n")
+        print(exc)
+        print("*** print_tb: \n")
+        traceback.print_tb(exc.__traceback__, file=sys.stdout)


 def server_graceful_shutdown(context):
@@ -67,11 +76,11 @@ def server_kill_hard(context):
    path = context.server_path

    print(f"Server dangling exits, hard killing force {pid}={path}...\n")
-    if os.name == 'nt':
-        process = subprocess.check_output(['taskkill', '/F', '/pid', str(pid)]).decode()
-        print(process)
-    else:
-        os.kill(-pid, signal.SIGKILL)
+    try:
+        psutil.Process(pid).kill()
+    except psutil.NoSuchProcess:
+        return False
+    return True


 def is_server_listening(server_fqdn, server_port):
@@ -84,17 +93,9 @@ def is_server_listening(server_fqdn, server_port):


 def pid_exists(pid):
-    """Check whether pid exists in the current process table."""
-    if pid < 0:
+    try:
+        psutil.Process(pid)
+    except psutil.NoSuchProcess:
        return False
-    if os.name == 'nt':
-        output = subprocess.check_output(['TASKLIST', '/FI', f'pid eq {pid}']).decode()
-        print(output)
-        return "No tasks are running" not in output
-    else:
-        try:
-            os.kill(pid, 0)
-        except OSError as e:
-            return e.errno == errno.EPERM
-        else:
-            return True
+    return True
+
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -4,7 +4,8 @@ Feature: llama.cpp server

  Background: Server startup
    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And   a model url https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf
+    And   a model file stories260K.gguf
    And   a model alias tinyllama-2
    And   42 as server seed
      # KV Cache corresponds to the total amount of tokens
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -5,6 +5,8 @@ import os
 import re
 import socket
 import subprocess
+import sys
+import threading
 import time
 from contextlib import closing
 from re import RegexFlag
@@ -32,6 +34,8 @@ def step_server_config(context, server_fqdn, server_port):
    context.base_url = f'http://{context.server_fqdn}:{context.server_port}'

    context.model_alias = None
+    context.model_file = None
+    context.model_url = None
    context.n_batch = None
    context.n_ubatch = None
    context.n_ctx = None
@@ -65,6 +69,16 @@ def step_download_hf_model(context, hf_file, hf_repo):
        print(f"model file: {context.model_file}\n")


+@step('a model file {model_file}')
+def step_model_file(context, model_file):
+    context.model_file = model_file
+
+
+@step('a model url {model_url}')
+def step_model_url(context, model_url):
+    context.model_url = model_url
+
+
 @step('a model alias {model_alias}')
 def step_model_alias(context, model_alias):
    context.model_alias = model_alias
@@ -141,7 +155,8 @@ def step_start_server(context):
 async def step_wait_for_the_server_to_be_started(context, expecting_status):
    match expecting_status:
        case 'healthy':
-            await wait_for_health_status(context, context.base_url, 200, 'ok')
+            await wait_for_health_status(context, context.base_url, 200, 'ok',
+                                         timeout=30)

        case 'ready' | 'idle':
            await wait_for_health_status(context, context.base_url, 200, 'ok',
@@ -1038,8 +1053,11 @@ def start_server_background(context):
    server_args = [
        '--host', server_listen_addr,
        '--port', context.server_port,
-        '--model', context.model_file
    ]
+    if context.model_file:
+        server_args.extend(['--model', context.model_file])
+    if context.model_url:
+        server_args.extend(['--model-url', context.model_url])
    if context.n_batch:
        server_args.extend(['--batch-size', context.n_batch])
    if context.n_ubatch:
@@ -1079,8 +1097,23 @@ def start_server_background(context):

    pkwargs = {
        'creationflags': flags,
+        'stdout': subprocess.PIPE,
+        'stderr': subprocess.PIPE
    }
    context.server_process = subprocess.Popen(
        [str(arg) for arg in [context.server_path, *server_args]],
        **pkwargs)
+
+    def log_stdout(process):
+        for line in iter(process.stdout.readline, b''):
+            print(line.decode('utf-8'), end='')
+    thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,))
+    thread_stdout.start()
+
+    def log_stderr(process):
+        for line in iter(process.stderr.readline, b''):
+            print(line.decode('utf-8'), end='', file=sys.stderr)
+    thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,))
+    thread_stderr.start()
+
    print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")