From a9dd5f3769460aa14646882197379dc5fc1c7c03 Mon Sep 17 00:00:00 2001
From: pudepiedj
Date: Fri, 23 Feb 2024 09:56:14 +0000
Subject: [PATCH] Revised server hostname

---
 Llamaserver.py             |  2 +-
 examples/server/README.md  | 10 +++++++++-
 examples/server/server.cpp |  2 +-
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/Llamaserver.py b/Llamaserver.py
index ea163a605..ce48aeae2 100644
--- a/Llamaserver.py
+++ b/Llamaserver.py
@@ -92,7 +92,7 @@ if __name__ == "__main__":
     global bar
     lockbar = threading.Lock()

-    url = "http://localhost:8080/completion"
+    url = "http://192.168.1.31:8080/completion"
     num_requests = 20

     q = Queue(maxsize = 64)
diff --git a/examples/server/README.md b/examples/server/README.md
index db4942439..be7532775 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -1,6 +1,10 @@
 # llama.cpp/example/server

 This example demonstrates a simple HTTP API server and a simple web front end to interact with llama.cpp.
+By default it runs only on the local machine at http://127.0.0.1:8080, but it can serve a local network, or a public network if the router allows port forwarding.
+To make the server accessible to other machines on the local or public network, change the server hostname to `0.0.0.0`.
+It will then listen on http://0.0.0.0:8080, and other machines must address it by the host's IP, e.g. http://<host-ip>:8080.
+For example, http://192.168.1.42:8080 then has the same effect that http://localhost:8080 has when the hostname is 127.0.0.1.

 Command line options:

@@ -37,6 +41,8 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
 - `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
+- `-skvg`, `--show-graphics`: display a dynamic graphic of KV cache occupancy per slot.
+- `-skvi`, `--show-interactive-graphics`: display a dynamic graphic of the KV cache that requires user intervention to move on after each request.
 - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
 - `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
@@ -60,7 +66,9 @@ server is build alongside everything else from the root of the project

 ## Quick Start

-To get started right away, run the following command, making sure to use the correct path for the model you have:
+To get started right away, run the following command, making sure to use the correct path for the model you have.
+This uses the default settings for most parameters, so it will only allow one simultaneous user.
+To allow N simultaneous users, add `-np N` after `-c 2048`.

 ### Unix-based systems (Linux, macOS, etc.)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index ada0cd182..9cb16dbb6 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -37,7 +37,7 @@ using json = nlohmann::json;

 struct server_params
 {
-    std::string hostname = "127.0.0.1";
+    std::string hostname = "0.0.0.0"; // 127.0.0.1 restricts to localhost only; use 0.0.0.0 to serve the local network.
     std::vector<std::string> api_keys;
     std::string public_path = "examples/server/public";
     std::string chat_template = "";
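
For reviewers who want to exercise the change, here is a minimal sketch of a client request against the rebound server, using only the Python standard library. The address 192.168.1.31 and the /completion endpoint are taken from the Llamaserver.py hunk above; the payload fields ("prompt", "n_predict") and the "content" response field are assumed to match the llama.cpp server API at the time of this patch.

    # Minimal sketch: POST a completion request to the server over the LAN.
    # Assumes the server binds 0.0.0.0 (the patched default) and that the
    # host's address is 192.168.1.31, as in the Llamaserver.py hunk above.
    import json
    import urllib.request

    url = "http://192.168.1.31:8080/completion"
    payload = {
        "prompt": "Building a website can be done in 10 simple steps:",
        "n_predict": 64,
    }

    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        body = json.loads(resp.read().decode("utf-8"))
        print(body["content"])  # generated text from the /completion response

Run from any machine on the same subnet; if it only works on the host itself, the server is still bound to 127.0.0.1.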