From 1b703db0e117ca955ae992eb03623b9e10f138f9 Mon Sep 17 00:00:00 2001
From: Robert Washbourne
Date: Thu, 23 Nov 2023 13:31:30 -0500
Subject: [PATCH] change entrypoint llama.cpp server dockerfile

---
 .devops/handler.py           | 23 +++++++++++++++++++++++
 .devops/main-cuda.Dockerfile | 28 +++++++++++++++++++++++-----
 .github/workflows/docker.yml | 19 ++++++++++++-------
 3 files changed, 58 insertions(+), 12 deletions(-)
 create mode 100644 .devops/handler.py

diff --git a/.devops/handler.py b/.devops/handler.py
new file mode 100644
index 000000000..a1fa0cbde
--- /dev/null
+++ b/.devops/handler.py
@@ -0,0 +1,23 @@
+import runpod
+import os
+import time
+
+sleep_time = int(os.environ.get('SLEEP_TIME', 1))
+
+## load your model(s) into vram here
+
+def handler(event):
+    print(event)
+    time_slept = 0
+    while time_slept < sleep_time:
+        print("working, I promise")
+        time_slept += 1
+        time.sleep(1)
+    # do the things
+
+    return "Hello World"
+
+
+runpod.serverless.start({
+    "handler": handler
+})
diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile
index 2b7faf7c1..a35488cea 100644
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@@ -10,9 +10,10 @@
 FROM ${BASE_CUDA_DEV_CONTAINER} as build
 
 # Unless otherwise specified, we make a fat build.
 ARG CUDA_DOCKER_ARCH=all
+# ARG CUDA_DOCKER_ARCH=sm_86
 
 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential git wget python3 python3-pip
 
 WORKDIR /app
@@ -22,11 +23,28 @@ COPY . .
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable cuBLAS
 ENV LLAMA_CUBLAS=1
+ENV LLAMA_CUDA_MMV_Y=2
+ENV LLAMA_CUDA_DMMV_X=64
+ENV LLAMA_CUDA_F16=true
 
-RUN make
+RUN make -j
+
+# Accept the build argument into an environment variable
+ARG MODEL_URL
+ENV MODEL_URL=${MODEL_URL}
+
+# Use the environment variable to download the model
+RUN wget $MODEL_URL -O /model.gguf
 
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+RUN apt-get update && \
+    apt-get install -y python3 python3-pip && \
+    pip install runpod
+COPY --from=build /app/server /server
+COPY --from=build /model.gguf model.gguf
+COPY --from=build /app/models models
+COPY --from=build /app/.devops/handler.py /handler.py
 
-COPY --from=build /app/main /main
-
-ENTRYPOINT [ "/main" ]
+# CMD ["/bin/sh", "-c", "/server --model model.gguf --threads $(nproc) -ngl 99 -np $(nproc) -cb"]
+# CMD ["/server --host 0.0.0.0 --threads 8 -ngl 999 -np 8 -cb -m model.gguf -c 16384"]
+CMD [ "python3", "-u", "/handler.py" ]
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 9c90c77ac..cd65b5556 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -23,18 +23,19 @@ jobs:
     runs-on: ubuntu-latest
     env:
       COMMIT_SHA: ${{ github.sha }}
+      MODEL_URL: "https://huggingface.co/TheBloke/neural-chat-7B-v3-1-GGUF/resolve/main/neural-chat-7b-v3-1.Q4_K_M.gguf"
     strategy:
       matrix:
         config:
-          - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
           #                     have disabled them for now until the reason why
           #                     is understood.
           - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          # - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
     steps:
       - name: Check out the repo
         uses: actions/checkout@v3
@@ -59,8 +60,10 @@ jobs:
           context: .
           push: true
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          tags: "ghcr.io/rawsh/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
           file: ${{ matrix.config.dockerfile }}
+          build-args: |
+            MODEL_URL=${{ env.MODEL_URL }}
 
       - name: Build and push Docker image (tagged)
         uses: docker/build-push-action@v4
@@ -68,5 +71,7 @@ jobs:
           context: .
           push: ${{ github.event_name == 'push' }}
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
+          tags: "ghcr.io/rawsh/llama.cpp:${{ matrix.config.tag }}"
           file: ${{ matrix.config.dockerfile }}
+          build-args: |
+            MODEL_URL=${{ env.MODEL_URL }}
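
Note on the placeholder handler: the `handler.py` added above is RunPod's stock template and only sleeps; it never touches `/model.gguf` or the `/server` binary that the runtime stage copies in. Below is a minimal sketch of what a real handler could look like, with the assumptions flagged: it presumes the `requests` package is available alongside `runpod`, that RunPod delivers job payloads under `event["input"]`, and that the llama.cpp `/server` binary exposes its `/completion` endpoint on localhost.

```python
# Sketch only, not part of the patch above. Assumes `requests` is
# installed alongside runpod and jobs arrive as {"input": {"prompt": ...}}.
import subprocess
import time

import requests
import runpod

# Start the llama.cpp server once at cold start so the model weights
# are already in VRAM when the first job arrives.
server = subprocess.Popen(
    ["/server", "--host", "127.0.0.1", "--port", "8080",
     "-m", "/model.gguf", "-ngl", "99"]
)

# Poll until the server's HTTP port accepts connections.
while True:
    try:
        requests.get("http://127.0.0.1:8080/", timeout=1)
        break
    except requests.ConnectionError:
        time.sleep(1)

def handler(event):
    # Forward the prompt to the server's /completion endpoint and
    # return the generated text.
    resp = requests.post(
        "http://127.0.0.1:8080/completion",
        json={"prompt": event["input"]["prompt"], "n_predict": 256},
        timeout=600,
    )
    return resp.json()["content"]

runpod.serverless.start({"handler": handler})
```

For a local smoke test before the workflow pushes to ghcr.io, the model URL is supplied the same way the workflow's `build-args` does it, e.g. `docker build -f .devops/main-cuda.Dockerfile --build-arg MODEL_URL=<gguf url> -t llama-runpod .`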