diff --git a/.devops/handler.py b/.devops/handler.py
index a1fa0cbde..14249b3f7 100644
--- a/.devops/handler.py
+++ b/.devops/handler.py
@@ -1,8 +1,10 @@
+import subprocess
 import runpod
 import os
 import time
 
-sleep_time = int(os.environ.get('SLEEP_TIME', 1))
+llama_cmd = os.environ.get('LLAMA_CMD', "/server --host 0.0.0.0 --threads 8 -ngl 999 -np 8 -cb -m model.gguf -c 16384")
+subprocess.Popen(llama_cmd.split(' '))
 
 
 ## load your model(s) into vram here
diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile
index fa0934e62..44747f000 100644
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@@ -27,7 +27,7 @@ ENV LLAMA_CUDA_MMV_Y=2
 ENV LLAMA_CUDA_DMMV_X=64
 ENV LLAMA_CUDA_F16=true
 
-RUN make -j
+RUN make
 
 # Accept the build argument into an environment variable
 ARG MODEL_URL
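For context, a minimal sketch (not part of the diff) of what the new handler.py startup path amounts to: the `LLAMA_CMD` environment variable overrides the default llama.cpp server command, and the server is launched as a background subprocess before RunPod starts dispatching jobs. The readiness poll and the assumed default port 8080 are illustrative additions, not something this change introduces.

```python
# Sketch only: launch the llama.cpp server as the diff does, then optionally
# wait for its HTTP port before serving requests. Port 8080 and the polling
# helper are assumptions for illustration.
import os
import socket
import subprocess
import time

# Same default command as the diff: 8 threads, all layers offloaded (-ngl 999),
# 8 parallel slots with continuous batching, 16k context.
llama_cmd = os.environ.get(
    'LLAMA_CMD',
    "/server --host 0.0.0.0 --threads 8 -ngl 999 -np 8 -cb -m model.gguf -c 16384",
)
# Split on single spaces, as in the diff, and start the server in the background.
server = subprocess.Popen(llama_cmd.split(' '))

def wait_for_port(port: int = 8080, timeout: float = 300.0) -> None:
    """Block until the server accepts TCP connections or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if server.poll() is not None:
            raise RuntimeError("llama.cpp server exited before becoming ready")
        try:
            with socket.create_connection(("127.0.0.1", port), timeout=1):
                return
        except OSError:
            time.sleep(1)
    raise TimeoutError("llama.cpp server did not become ready in time")
```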