add handler

This commit is contained in:
Robert Washbourne 2023-11-24 00:41:33 -05:00
parent 8ddd5cb916
commit 4468d96aec
2 changed files with 36 additions and 15 deletions

View file

@@ -2,24 +2,45 @@ import subprocess
import runpod import runpod
import os import os
import time import time
import aiohttp
import json
# HTTP headers sent with every completion request to the local llama.cpp
# server. 'Accept: text/event-stream' asks the server for a streaming
# (SSE-style) response; the Origin/Referer/Sec-Fetch-* entries mimic the
# browser request the server's built-in web UI would make.
headers = {
    'Accept': 'text/event-stream',
    'Connection': 'keep-alive',
    'Content-Type': 'application/json',
    'Origin': 'http://127.0.0.1:8080',
    'Referer': 'http://127.0.0.1:8080/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
}
# Command line for the llama.cpp server binary; overridable via LLAMA_CMD.
# Defaults: listen on all interfaces, 8 CPU threads, all layers on GPU
# (-ngl 999), 8 parallel slots with continuous batching, 16384-token context.
llama_cmd = os.environ.get('LLAMA_CMD', "/server --host 0.0.0.0 --threads 8 -ngl 999 -np 8 -cb -m model.gguf -c 16384")
# Launch the server as a background child process; it keeps running while
# the RunPod handler below forwards requests to it.
sub = subprocess.Popen(llama_cmd.split(' '))
## load your model(s) into vram here
# Endpoint of the llama.cpp completion API started above (default port 8080).
url = "http://0.0.0.0:8080/completion"
async def handler(event):
    """RunPod serverless handler: stream a llama.cpp completion.

    Reads the prompt from ``event["input"]["prompt"]``, POSTs it to the
    local llama.cpp ``/completion`` endpoint with streaming enabled, and
    yields each raw response line as it arrives (async generator, so
    RunPod streams results to the client).
    """
    print(event)
    prompt = event["input"]["prompt"]
    # BUG FIX: the original passed ``json.dumps(json_data={...})`` — an
    # invalid keyword argument to json.dumps that raises TypeError. Build
    # the payload dict first and serialize it as the positional argument.
    payload = {
        'stream': True,          # ask llama.cpp to stream tokens
        'n_predict': 2048,       # max tokens to generate
        'temperature': 0.2,
        'stop': [                # stop sequences for chat-style prompts
            '</s>',
            'Llama:',
            'User:',
        ],
        'prompt': prompt,
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(url, data=json.dumps(payload), headers=headers) as response:
            # Forward the server's streamed lines verbatim to the caller.
            async for line in response.content:
                yield line
# Register the async-generator handler with the RunPod serverless runtime.
runpod.serverless.start({
    "handler": handler,
    "return_aggregate_stream": True  # Optional, results available via /run
})

View file

@@ -38,7 +38,7 @@ RUN wget $MODEL_URL -O /model.gguf
# Build stage: install Python and the handler's dependencies into /install
# so they can be copied into the slimmer runtime image below.
WORKDIR /install
RUN apt-get install -y python3 python3-pip
# aiohttp added for the async streaming handler.
RUN pip install --prefix /install runpod aiohttp

FROM ${BASE_CUDA_RUN_CONTAINER} as runtime