add handler

This commit is contained in:
Robert Washbourne 2023-11-24 00:41:33 -05:00
parent 8ddd5cb916
commit 4468d96aec
2 changed files with 36 additions and 15 deletions

View file

@ -2,24 +2,45 @@ import subprocess
import runpod import runpod
import os import os
import time import time
import aiohttp
import json
# Request headers for the local llama.cpp HTTP server; the server streams
# completions back as server-sent events, hence the event-stream Accept.
headers = {
    'Accept': 'text/event-stream',
    'Connection': 'keep-alive',
    'Content-Type': 'application/json',
    'Origin': 'http://127.0.0.1:8080',
    'Referer': 'http://127.0.0.1:8080/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
}

# Command used to launch the llama.cpp server; overridable via LLAMA_CMD.
default_cmd = "/server --host 0.0.0.0 --threads 8 -ngl 999 -np 8 -cb -m model.gguf -c 16384"
llama_cmd = os.environ.get('LLAMA_CMD', default_cmd)
sub = subprocess.Popen(llama_cmd.split(' '))
## load your model(s) into vram here
url = "http://0.0.0.0:8080/completion"

async def handler(event):
    """RunPod serverless handler: stream a completion from the local
    llama.cpp server.

    Reads the prompt from ``event["input"]["prompt"]``, POSTs a streaming
    completion request to the server launched above, and yields each raw
    SSE line of the response as it arrives.

    Args:
        event: RunPod job payload; must contain ``input.prompt``.

    Yields:
        bytes: raw lines from the server's event-stream response.
    """
    print(event)
    prompt = event["input"]["prompt"]
    # BUG FIX: the original called json.dumps(json_data={...}), which raises
    # TypeError (json.dumps has no `json_data` keyword). Build the payload
    # dict and pass it positionally instead.
    payload = {
        'stream': True,
        'n_predict': 2048,
        'temperature': 0.2,
        'stop': [
            '</s>',
            'Llama:',
            'User:',
        ],
        'prompt': prompt,
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(url, data=json.dumps(payload), headers=headers) as response:
            async for line in response.content:
                yield line
# Register the async-generator handler with the RunPod serverless runtime.
runpod.serverless.start(
    {
        "handler": handler,
        "return_aggregate_stream": True,  # Optional, results available via /run
    }
)

View file

@ -38,7 +38,7 @@ RUN wget $MODEL_URL -O /model.gguf
WORKDIR /install WORKDIR /install
RUN apt-get install -y python3 python3-pip RUN apt-get install -y python3 python3-pip
RUN pip install --prefix /install runpod RUN pip install --prefix /install runpod aiohttp
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime FROM ${BASE_CUDA_RUN_CONTAINER} as runtime