add handler

2023-11-24 00:41:33 -05:00 · 2023-11-24 00:41:33 -05:00 · 4468d96aec
commit 4468d96aec
parent 8ddd5cb916
2 changed files with 36 additions and 15 deletions
--- a/.devops/handler.py
+++ b/.devops/handler.py
@ -2,24 +2,45 @@ import subprocess
 import runpod
 import os
 import time
+import aiohttp
+import json
+
+# Request headers sent with every POST that handler() makes to `url`.
+# 'Accept: text/event-stream' pairs with the 'stream': True request body.
+# NOTE(review): the Origin/Referer/Sec-Fetch-* entries look like they were
+# copied from a browser request and may not be needed by the server - confirm.
+headers = {
+    'Accept': 'text/event-stream',
+    'Connection': 'keep-alive',
+    'Content-Type': 'application/json',
+    'Origin': 'http://127.0.0.1:8080',
+    'Referer': 'http://127.0.0.1:8080/',
+    'Sec-Fetch-Dest': 'empty',
+    'Sec-Fetch-Mode': 'cors',
+    'Sec-Fetch-Site': 'same-origin',
+}

 llama_cmd = os.environ.get('LLAMA_CMD', "/server --host 0.0.0.0 --threads 8 -ngl 999 -np 8 -cb -m model.gguf -c 16384")
-subprocess.Popen(llama_cmd.split(' '))
+sub = subprocess.Popen(llama_cmd.split(' '))

 ## load your model(s) into vram here

-def handler(event):
-    print(event)
-    time_slept = 0
-    while time_slept < sleep_time:
-        print("working, I promise")
-        time_slept += 1
-        time.sleep(1)
-    # do the things
-
-    return "Hello World"
-
-
+url = "http://0.0.0.0:8080/completion"
+async def handler(event):
+    """Stream a completion from the local server for one job's prompt.
+
+    Posts the prompt to the module-level `url` with the module-level
+    `headers` and relays the streamed response line by line.
+
+    Args:
+        event: Job payload; the prompt is read from
+            ``event["input"]["prompt"]``.
+
+    Yields:
+        Each line of the streamed response body, decoded to ``str``.
+
+    Raises:
+        KeyError: If ``event`` lacks the ``input`` or ``prompt`` entry.
+    """
+    print(event)
+    prompt = event["input"]["prompt"]
+    payload = {
+        'stream': True,
+        'n_predict': 2048,
+        'temperature': 0.2,
+        'stop': [
+            '</s>',
+            'Llama:',
+            'User:',
+        ],
+        'prompt': prompt,
+    }
+    async with aiohttp.ClientSession() as session:
+        # json.dumps takes the object positionally; passing it as a
+        # `json_data=` keyword raised TypeError before any request was sent.
+        body = json.dumps(payload)
+        async with session.post(url, data=body, headers=headers) as response:
+            async for line in response.content:
+                # The response stream yields raw bytes, which cannot be
+                # JSON-encoded; decode so each chunk is a plain string.
+                yield line.decode('utf-8')
+    
 runpod.serverless.start({
-    "handler": handler
+    "handler": handler,
+    "return_aggregate_stream": True # Optional, results available via /run
 })
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@ -38,7 +38,7 @@ RUN wget $MODEL_URL -O /model.gguf

 WORKDIR /install
 RUN apt-get install -y python3 python3-pip
-RUN pip install --prefix /install runpod
+RUN pip install --prefix /install runpod aiohttp

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime