From 4468d96aecd79ffbc1a8f95bfebfe02c6d449762 Mon Sep 17 00:00:00 2001
From: Robert Washbourne
Date: Fri, 24 Nov 2023 00:41:33 -0500
Subject: [PATCH] add handler

---
 .devops/handler.py           | 49 +++++++++++++++++++++++++-----------
 .devops/main-cuda.Dockerfile |  2 +-
 2 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/.devops/handler.py b/.devops/handler.py
index 14249b3f7..f2b9de1ed 100644
--- a/.devops/handler.py
+++ b/.devops/handler.py
@@ -2,24 +2,45 @@ import subprocess
 import runpod
 import os
 import time
+import aiohttp
+import json
+
+headers = {
+    'Accept': 'text/event-stream',
+    'Connection': 'keep-alive',
+    'Content-Type': 'application/json',
+    'Origin': 'http://127.0.0.1:8080',
+    'Referer': 'http://127.0.0.1:8080/',
+    'Sec-Fetch-Dest': 'empty',
+    'Sec-Fetch-Mode': 'cors',
+    'Sec-Fetch-Site': 'same-origin',
+}
 
 llama_cmd = os.environ.get('LLAMA_CMD', "/server --host 0.0.0.0 --threads 8 -ngl 999 -np 8 -cb -m model.gguf -c 16384")
 
-subprocess.Popen(llama_cmd.split(' '))
+sub = subprocess.Popen(llama_cmd.split(' '))
 
 ## load your model(s) into vram here
-def handler(event):
-    print(event)
-    time_slept = 0
-    while time_slept < sleep_time:
-        print("working, I promise")
-        time_slept += 1
-        time.sleep(1)
-    # do the things
-
-    return "Hello World"
-
-
+url = "http://0.0.0.0:8080/completion"
+async def handler(event):
+    print(event)
+    prompt = event["input"]["prompt"]
+    async with aiohttp.ClientSession() as session:
+        async with session.post(url, data=json.dumps({
+            'stream': True,
+            'n_predict': 2048,
+            'temperature': 0.2,
+            'stop': [
+                '',
+                'Llama:',
+                'User:',
+            ],
+            'prompt': prompt,
+        }), headers=headers) as response:
+            async for line in response.content:
+                yield line
+
 runpod.serverless.start({
-    "handler": handler
+    "handler": handler,
+    "return_aggregate_stream": True # Optional, results available via /run
 })
diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile
index b2650b9c5..df2ebf593 100644
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@@ -38,7 +38,7 @@ RUN wget $MODEL_URL -O /model.gguf
 WORKDIR /install
 
 RUN apt-get install -y python3 python3-pip
-RUN pip install --prefix /install runpod
+RUN pip install --prefix /install runpod aiohttp
 
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
 