From 1b703db0e117ca955ae992eb03623b9e10f138f9 Mon Sep 17 00:00:00 2001
From: Robert Washbourne
Date: Thu, 23 Nov 2023 13:31:30 -0500
Subject: [PATCH] change entrypoint llama.cpp server dockerfile

---
 .devops/handler.py           | 23 +++++++++++++++++++++++
 .devops/main-cuda.Dockerfile | 28 +++++++++++++++++++++++-----
 .github/workflows/docker.yml | 19 ++++++++++++-------
 3 files changed, 58 insertions(+), 12 deletions(-)
 create mode 100644 .devops/handler.py

diff --git a/.devops/handler.py b/.devops/handler.py
new file mode 100644
index 000000000..a1fa0cbde
--- /dev/null
+++ b/.devops/handler.py
@@ -0,0 +1,23 @@
+import runpod
+import os
+import time
+
+sleep_time = int(os.environ.get('SLEEP_TIME', 1))
+
+## load your model(s) into vram here
+
+def handler(event):
+    print(event)
+    time_slept = 0
+    while time_slept < sleep_time:
+        print("working, I promise")
+        time_slept += 1
+        time.sleep(1)
+    # do the things
+
+    return "Hello World"
+
+
+runpod.serverless.start({
+    "handler": handler
+})
diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile
index 2b7faf7c1..a35488cea 100644
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@@ -10,9 +10,10 @@
 FROM ${BASE_CUDA_DEV_CONTAINER} as build
 
 # Unless otherwise specified, we make a fat build.
 ARG CUDA_DOCKER_ARCH=all
+# ARG CUDA_DOCKER_ARCH=sm_86
 
 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential git wget python3 python3-pip
 
 WORKDIR /app
@@ -22,11 +23,28 @@ COPY . .
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable cuBLAS
 ENV LLAMA_CUBLAS=1
+ENV LLAMA_CUDA_MMV_Y=2
+ENV LLAMA_CUDA_DMMV_X=64
+ENV LLAMA_CUDA_F16=true
 
-RUN make
+RUN make -j
+
+# Accept the build argument into an environment variable
+ARG MODEL_URL
+ENV MODEL_URL=${MODEL_URL}
+
+# Use the environment variable to download the model
+RUN wget $MODEL_URL -O /model.gguf
 
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+RUN apt-get update && \
+    apt-get install -y python3 python3-pip && \
+    pip install runpod
+COPY --from=build /app/server /server
+COPY --from=build /model.gguf model.gguf
+COPY --from=build /app/models models
+COPY --from=build /app/.devops/handler.py /handler.py
 
-COPY --from=build /app/main /main
-
-ENTRYPOINT [ "/main" ]
+# CMD ["/bin/sh", "-c", "/server --model model.gguf --threads $(nproc) -ngl 99 -np $(nproc) -cb"]
+# CMD ["/server --host 0.0.0.0 --threads 8 -ngl 999 -np 8 -cb -m model.gguf -c 16384"]
+CMD [ "python3", "-u", "/handler.py" ]
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 9c90c77ac..cd65b5556 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -23,18 +23,19 @@ jobs:
     runs-on: ubuntu-latest
     env:
       COMMIT_SHA: ${{ github.sha }}
+      MODEL_URL: "https://huggingface.co/TheBloke/neural-chat-7B-v3-1-GGUF/resolve/main/neural-chat-7b-v3-1.Q4_K_M.gguf"
     strategy:
       matrix:
         config:
-          - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
           #                     have disabled them for now until the reason why
           #                     is understood.
           - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          # - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
     steps:
       - name: Check out the repo
         uses: actions/checkout@v3
@@ -59,8 +60,10 @@ jobs:
           context: .
           push: true
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          tags: "ghcr.io/rawsh/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
           file: ${{ matrix.config.dockerfile }}
+          build-args: |
+            MODEL_URL=${{ env.MODEL_URL }}
 
       - name: Build and push Docker image (tagged)
         uses: docker/build-push-action@v4
@@ -68,5 +71,7 @@ jobs:
           context: .
           push: ${{ github.event_name == 'push' }}
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
+          tags: "ghcr.io/rawsh/llama.cpp:${{ matrix.config.tag }}"
           file: ${{ matrix.config.dockerfile }}
+          build-args: |
+            MODEL_URL=${{ env.MODEL_URL }}
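
Note on the placeholder handler: the `handler.py` added above is RunPod's stock template and only sleeps; it never touches `/model.gguf` or the `/server` binary that the runtime stage copies in. Below is a minimal sketch of what a real handler could look like, with the assumptions flagged: it presumes the `requests` package is available alongside `runpod`, that RunPod delivers job payloads under `event["input"]`, and that the llama.cpp `/server` binary exposes its `/completion` endpoint on localhost.

```python
# Sketch only, not part of the patch above. Assumes `requests` is
# installed alongside runpod and jobs arrive as {"input": {"prompt": ...}}.
import subprocess
import time

import requests
import runpod

# Start the llama.cpp server once at cold start so the model weights
# are already in VRAM when the first job arrives.
server = subprocess.Popen(
    ["/server", "--host", "127.0.0.1", "--port", "8080",
     "-m", "/model.gguf", "-ngl", "99"]
)

# Poll until the server's HTTP port accepts connections.
while True:
    try:
        requests.get("http://127.0.0.1:8080/", timeout=1)
        break
    except requests.ConnectionError:
        time.sleep(1)

def handler(event):
    # Forward the prompt to the server's /completion endpoint and
    # return the generated text.
    resp = requests.post(
        "http://127.0.0.1:8080/completion",
        json={"prompt": event["input"]["prompt"], "n_predict": 256},
        timeout=600,
    )
    return resp.json()["content"]

runpod.serverless.start({"handler": handler})
```

For a local smoke test before the workflow pushes to ghcr.io, the model URL is supplied the same way the workflow's `build-args` does it, e.g. `docker build -f .devops/main-cuda.Dockerfile --build-arg MODEL_URL=<gguf url> -t llama-runpod .`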