From b3eed8947803ccbd13b5bf22d062e73f0228f334 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 21 Aug 2024 11:55:35 +0200 Subject: [PATCH] add LLAMA_ARG_HOST to server dockerfile --- .devops/llama-server-cuda.Dockerfile | 2 ++ .devops/llama-server-intel.Dockerfile | 2 ++ .devops/llama-server-rocm.Dockerfile | 2 ++ .devops/llama-server-vulkan.Dockerfile | 2 ++ .devops/llama-server.Dockerfile | 2 ++ common/common.cpp | 2 ++ examples/server/README.md | 4 +++- 7 files changed, 15 insertions(+), 1 deletion(-) diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile index 67328cf1c..184248984 100644 --- a/.devops/llama-server-cuda.Dockerfile +++ b/.devops/llama-server-cuda.Dockerfile @@ -24,6 +24,8 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} ENV GGML_CUDA=1 # Enable cURL ENV LLAMA_CURL=1 +# Must be set to 0.0.0.0 so it can listen to requests from host machine +ENV LLAMA_ARG_HOST=0.0.0.0 RUN make -j$(nproc) llama-server diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile index f525658dd..9c355b664 100644 --- a/.devops/llama-server-intel.Dockerfile +++ b/.devops/llama-server-intel.Dockerfile @@ -26,6 +26,8 @@ RUN apt-get update && \ COPY --from=build /app/build/bin/llama-server /llama-server ENV LC_ALL=C.utf8 +# Must be set to 0.0.0.0 so it can listen to requests from host machine +ENV LLAMA_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile index 763b4cd3f..fd0e19ad6 100644 --- a/.devops/llama-server-rocm.Dockerfile +++ b/.devops/llama-server-rocm.Dockerfile @@ -39,6 +39,8 @@ ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ +# Must be set to 0.0.0.0 so it can listen to requests from host machine +ENV LLAMA_ARG_HOST=0.0.0.0 # Enable cURL ENV LLAMA_CURL=1 diff --git a/.devops/llama-server-vulkan.Dockerfile b/.devops/llama-server-vulkan.Dockerfile index 13a61ffd8..93c5e0c26 100644 --- a/.devops/llama-server-vulkan.Dockerfile +++ b/.devops/llama-server-vulkan.Dockerfile @@ -23,6 +23,8 @@ RUN cp /app/build/bin/llama-server /llama-server && \ rm -rf /app ENV LC_ALL=C.utf8 +# Must be set to 0.0.0.0 so it can listen to requests from host machine +ENV LLAMA_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile index ff558604e..02accc85e 100644 --- a/.devops/llama-server.Dockerfile +++ b/.devops/llama-server.Dockerfile @@ -21,6 +21,8 @@ RUN apt-get update && \ COPY --from=build /app/llama-server /llama-server ENV LC_ALL=C.utf8 +# Must be set to 0.0.0.0 so it can listen to requests from host machine +ENV LLAMA_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] diff --git a/common/common.cpp b/common/common.cpp index 918b4e1f8..960d4fac0 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -327,6 +327,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { void gpt_params_parse_from_env(gpt_params & params) { // we only care about server-related params for now get_env("LLAMA_ARG_MODEL", params.model); + get_env("LLAMA_ARG_MODEL_URL", params.model_url); + get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias); get_env("LLAMA_ARG_HF_REPO", params.hf_repo); get_env("LLAMA_ARG_HF_FILE", params.hf_file); get_env("LLAMA_ARG_THREADS", params.n_threads); diff --git a/examples/server/README.md b/examples/server/README.md index 839e9e5d6..26811721f 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -252,6 +252,8 @@ Available environment variables (if specified, these variables will override par - `LLAMA_CACHE`: cache directory, used by `--hf-repo` - `HF_TOKEN`: Hugging Face access token, used when accessing a gated model with `--hf-repo` - `LLAMA_ARG_MODEL`: equivalent to `-m` +- `LLAMA_ARG_MODEL_URL`: equivalent to `-mu` +- `LLAMA_ARG_MODEL_ALIAS`: equivalent to `-a` - `LLAMA_ARG_HF_REPO`: equivalent to `--hf-repo` - `LLAMA_ARG_HF_FILE`: equivalent to `--hf-file` - `LLAMA_ARG_THREADS`: equivalent to `-t` @@ -282,11 +284,11 @@ services: volumes: - ./models:/models environment: + # alternatively, you can use "LLAMA_ARG_MODEL_URL" to download the model LLAMA_ARG_MODEL: /models/my_model.gguf LLAMA_ARG_CTX_SIZE: 4096 LLAMA_ARG_N_PARALLEL: 2 LLAMA_ARG_ENDPOINT_METRICS: 1 # to disable, either remove or set to 0 - LLAMA_ARG_HOST: 0.0.0.0 LLAMA_ARG_PORT: 8080 ```