change entrypoint

llama.cpp server dockerfile
Robert Washbourne 2023-11-23 13:31:30 -05:00
parent 6b0a7420d0
commit 1b703db0e1
3 changed files with 58 additions and 12 deletions

.devops/handler.py (new file)

@@ -0,0 +1,23 @@
+import runpod
+import os
+import time
+
+sleep_time = int(os.environ.get('SLEEP_TIME', 1))
+
+## load your model(s) into vram here
+
+def handler(event):
+    print(event)
+
+    time_slept = 0
+    while time_slept < sleep_time:
+        print("working, I promise")
+        time_slept += 1
+        time.sleep(1)
+    # do the things
+
+    return "Hello World"
+
+runpod.serverless.start({
+    "handler": handler
+})
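This handler appears to be the runpod starter template: it only sleeps and echoes "Hello World", so model loading and inference are not wired up yet. For a quick local smoke test, the runpod SDK's serverless entrypoint accepts a --test_input argument that invokes the handler once with a supplied event; a minimal sketch, assuming the runpod package is installed on the host:

    pip install runpod
    python3 handler.py --test_input '{"input": {"prompt": "hello"}}'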

.devops/main-cuda.Dockerfile

@@ -10,9 +10,10 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build

 # Unless otherwise specified, we make a fat build.
 ARG CUDA_DOCKER_ARCH=all
 # ARG CUDA_DOCKER_ARCH=sm_86

 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential git wget python3 python3-pip

 WORKDIR /app
@@ -22,11 +23,28 @@ COPY . .
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable cuBLAS
 ENV LLAMA_CUBLAS=1
+ENV LLAMA_CUDA_MMV_Y=2
+ENV LLAMA_CUDA_DMMV_X=64
+ENV LLAMA_CUDA_F16=true

-RUN make
+RUN make -j
+
+# Accept the build argument into an environment variable
+ARG MODEL_URL
+ENV MODEL_URL=${MODEL_URL}
+
+# Use the environment variable to download the model
+RUN wget $MODEL_URL -O /model.gguf
+
+WORKDIR /install
+RUN pip install --install-option="--prefix=/install" runpod

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

+COPY --from=build /install /usr/local
+COPY --from=build /app/server /server
+COPY --from=build /model.gguf model.gguf
+COPY --from=build /app/models models
 COPY --from=build /app/main /main

-ENTRYPOINT [ "/main" ]
+# CMD ["/bin/sh", "-c", "/server --model model.gguf --threads $(nproc) -ngl 99 -np $(nproc) -cb"]
+# CMD ["/server --host 0.0.0.0 --threads 8 -ngl 999 -np 8 -cb -m model.gguf -c 16384"]
+CMD [ "python", "-u", "/handler.py" ]

.github/workflows/docker.yml

@@ -23,18 +23,19 @@ jobs:
     runs-on: ubuntu-latest
     env:
       COMMIT_SHA: ${{ github.sha }}
+      MODEL_URL: "https://huggingface.co/TheBloke/neural-chat-7B-v3-1-GGUF/resolve/main/neural-chat-7b-v3-1.Q4_K_M.gguf"
     strategy:
       matrix:
         config:
-          - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
           #                     have disabled them for now until the reason why
           #                     is understood.
           - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          # - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
     steps:
       - name: Check out the repo
         uses: actions/checkout@v3
@@ -59,8 +60,10 @@ jobs:
           context: .
           push: true
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          tags: "ghcr.io/rawsh/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
           file: ${{ matrix.config.dockerfile }}
+          build-args: |
+            MODEL_URL=${{ env.MODEL_URL }}

       - name: Build and push Docker image (tagged)
         uses: docker/build-push-action@v4
@@ -68,5 +71,7 @@ jobs:
           context: .
           push: ${{ github.event_name == 'push' }}
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
+          tags: "ghcr.io/rawsh/llama.cpp:${{ matrix.config.tag }}"
           file: ${{ matrix.config.dockerfile }}
+          build-args: |
+            MODEL_URL=${{ env.MODEL_URL }}
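Once this workflow runs on a push, the serverless image should be pullable from the fork's container registry; a sketch, assuming the light-cuda build succeeded and the host has GPU-enabled Docker:

    docker pull ghcr.io/rawsh/llama.cpp:light-cuda
    docker run --gpus all ghcr.io/rawsh/llama.cpp:light-cuda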