change entrypoint

llama.cpp server dockerfile
Robert Washbourne 2023-11-23 13:31:30 -05:00
parent 6b0a7420d0
commit 1b703db0e1
3 changed files with 58 additions and 12 deletions

.devops/handler.py (new file)

@@ -0,0 +1,23 @@
+import runpod
+import os
+import time
+
+sleep_time = int(os.environ.get('SLEEP_TIME', 1))
+
+## load your model(s) into vram here
+
+def handler(event):
+    print(event)
+
+    time_slept = 0
+    while time_slept < sleep_time:
+        print("working, I promise")
+        time_slept += 1
+        time.sleep(1)
+    # do the things
+
+    return "Hello World"
+
+runpod.serverless.start({
+    "handler": handler
+})
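This handler appears to be the runpod starter template: it only sleeps and echoes "Hello World", so model loading and inference are not wired up yet. For a quick local smoke test, the runpod SDK's serverless entrypoint accepts a --test_input argument that invokes the handler once with a supplied event; a minimal sketch, assuming the runpod package is installed on the host:

    pip install runpod
    python3 handler.py --test_input '{"input": {"prompt": "hello"}}'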

.devops/main-cuda.Dockerfile

@@ -10,9 +10,10 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build

 # Unless otherwise specified, we make a fat build.
 ARG CUDA_DOCKER_ARCH=all
 # ARG CUDA_DOCKER_ARCH=sm_86

 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential git wget python3 python3-pip

 WORKDIR /app
@@ -22,11 +23,28 @@ COPY . .
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable cuBLAS
 ENV LLAMA_CUBLAS=1
+ENV LLAMA_CUDA_MMV_Y=2
+ENV LLAMA_CUDA_DMMV_X=64
+ENV LLAMA_CUDA_F16=true

-RUN make
+RUN make -j
+
+# Accept the build argument into an environment variable
+ARG MODEL_URL
+ENV MODEL_URL=${MODEL_URL}
+
+# Use the environment variable to download the model
+RUN wget $MODEL_URL -O /model.gguf
+
+WORKDIR /install
+RUN pip install --install-option="--prefix=/install" runpod

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

+COPY --from=build /install /usr/local
+COPY --from=build /app/server /server
+COPY --from=build /model.gguf model.gguf
+COPY --from=build /app/models models
 COPY --from=build /app/main /main

-ENTRYPOINT [ "/main" ]
+# CMD ["/bin/sh", "-c", "/server --model model.gguf --threads $(nproc) -ngl 99 -np $(nproc) -cb"]
+# CMD ["/server --host 0.0.0.0 --threads 8 -ngl 999 -np 8 -cb -m model.gguf -c 16384"]
+CMD [ "python", "-u", "/handler.py" ]

.github/workflows/docker.yml

@@ -23,18 +23,19 @@ jobs:
     runs-on: ubuntu-latest
     env:
       COMMIT_SHA: ${{ github.sha }}
+      MODEL_URL: "https://huggingface.co/TheBloke/neural-chat-7B-v3-1-GGUF/resolve/main/neural-chat-7b-v3-1.Q4_K_M.gguf"
     strategy:
       matrix:
         config:
-          - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
           #                     have disabled them for now until the reason why
           #                     is understood.
           - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          # - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
     steps:
       - name: Check out the repo
         uses: actions/checkout@v3
@@ -59,8 +60,10 @@ jobs:
           context: .
           push: true
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          tags: "ghcr.io/rawsh/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
           file: ${{ matrix.config.dockerfile }}
+          build-args: |
+            MODEL_URL=${{ env.MODEL_URL }}

       - name: Build and push Docker image (tagged)
         uses: docker/build-push-action@v4
@@ -68,5 +71,7 @@ jobs:
           context: .
           push: ${{ github.event_name == 'push' }}
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
+          tags: "ghcr.io/rawsh/llama.cpp:${{ matrix.config.tag }}"
           file: ${{ matrix.config.dockerfile }}
+          build-args: |
+            MODEL_URL=${{ env.MODEL_URL }}
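Once this workflow runs on a push, the serverless image should be pullable from the fork's container registry; a sketch, assuming the light-cuda build succeeded and the host has GPU-enabled Docker:

    docker pull ghcr.io/rawsh/llama.cpp:light-cuda
    docker run --gpus all ghcr.io/rawsh/llama.cpp:light-cuda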