From 4468d96aecd79ffbc1a8f95bfebfe02c6d449762 Mon Sep 17 00:00:00 2001
From: Robert Washbourne
Date: Fri, 24 Nov 2023 00:41:33 -0500
Subject: [PATCH] add handler

---
 .devops/handler.py           | 49 +++++++++++++++++++++++++-----------
 .devops/main-cuda.Dockerfile |  2 +-
 2 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/.devops/handler.py b/.devops/handler.py
index 14249b3f7..f2b9de1ed 100644
--- a/.devops/handler.py
+++ b/.devops/handler.py
@@ -2,24 +2,45 @@ import subprocess
 import runpod
 import os
 import time
+import aiohttp
+import json
+
+headers = {
+    'Accept': 'text/event-stream',
+    'Connection': 'keep-alive',
+    'Content-Type': 'application/json',
+    'Origin': 'http://127.0.0.1:8080',
+    'Referer': 'http://127.0.0.1:8080/',
+    'Sec-Fetch-Dest': 'empty',
+    'Sec-Fetch-Mode': 'cors',
+    'Sec-Fetch-Site': 'same-origin',
+}
 
 llama_cmd = os.environ.get('LLAMA_CMD', "/server --host 0.0.0.0 --threads 8 -ngl 999 -np 8 -cb -m model.gguf -c 16384")
 
-subprocess.Popen(llama_cmd.split(' '))
+sub = subprocess.Popen(llama_cmd.split(' '))
 
 ## load your model(s) into vram here
-def handler(event):
-    print(event)
-    time_slept = 0
-    while time_slept < sleep_time:
-        print("working, I promise")
-        time_slept += 1
-        time.sleep(1)
-    # do the things
-
-    return "Hello World"
-
-
+url = "http://0.0.0.0:8080/completion"
+async def handler(event):
+    print(event)
+    prompt = event["input"]["prompt"]
+    async with aiohttp.ClientSession() as session:
+        async with session.post(url, data=json.dumps({
+            'stream': True,
+            'n_predict': 2048,
+            'temperature': 0.2,
+            'stop': [
+                '',
+                'Llama:',
+                'User:',
+            ],
+            'prompt': prompt,
+        }), headers=headers) as response:
+            async for line in response.content:
+                yield line
+
 runpod.serverless.start({
-    "handler": handler
+    "handler": handler,
+    "return_aggregate_stream": True # Optional, results available via /run
 })
diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile
index b2650b9c5..df2ebf593 100644
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@@ -38,7 +38,7 @@ RUN wget $MODEL_URL -O /model.gguf
 WORKDIR /install
 
 RUN apt-get install -y python3 python3-pip
-RUN pip install --prefix /install runpod
+RUN pip install --prefix /install runpod aiohttp
 
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
 