diff --git a/common/arg.cpp b/common/arg.cpp
index 27886b84e..3d44f9428 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1859,6 +1859,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_cache_reuse = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
+    add_opt(common_arg(
+        {"--standby-timeout"}, "N",
+        string_format("seconds that must pass since a request has been served before the server stops automatically (default: %d)", params.standby_timeout),
+        [](common_params & params, int value) {
+            params.standby_timeout = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STANDBY_TIMEOUT"));
     add_opt(common_arg(
         {"--metrics"},
         string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
diff --git a/common/common.h b/common/common.h
index 0d452cf0f..31116046f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -318,6 +318,7 @@ struct common_params {
     int32_t timeout_write   = timeout_read; // http write timeout in seconds
     int32_t n_threads_http  = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse   = 0;            // min chunk size to reuse from the cache via KV shifting
+    int32_t standby_timeout = 0;            // seconds that must pass since the last request before the server terminates to save resources (values <= 0 disable automatic termination)
 
     std::string hostname    = "127.0.0.1";
     std::string public_path = "";           // NOLINT
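The field's contract is worth pinning down, since the implementation (see the server.cpp hunk below) arms the standby timer only for positive values. A minimal standalone sketch of that contract follows; `standby_budget` is an invented helper name, not part of the patch:

```cpp
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <optional>

// Hypothetical helper (not in the patch): maps the new field to a wait
// budget for the task loop. Matching the server.cpp hunk below, only
// values > 0 arm the timer; 0 or negative keeps the old wait-forever behavior.
static std::optional<std::chrono::seconds> standby_budget(int32_t standby_timeout) {
    if (standby_timeout > 0) {
        return std::chrono::seconds(standby_timeout);
    }
    return std::nullopt; // standby disabled
}

int main() {
    std::printf("600 arms the timer: %d\n", standby_budget(600).has_value()); // 1
    std::printf("  0 arms the timer: %d\n", standby_budget(0).has_value());   // 0
}
```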
diff --git a/examples/server/README.md b/examples/server/README.md
index 1f0a27d96..31bda7f77 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -153,6 +153,7 @@ The project is under active development, and we are [looking for feedback and co
 | `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
+| `--standby-timeout N` | seconds that must pass since a request has been served before the server stops automatically (default: 0)<br/>(env: LLAMA_ARG_STANDBY_TIMEOUT) |
 | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
 | `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
 | `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
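The new row documents both the flag and the environment variable; an illustrative invocation would be `LLAMA_ARG_STANDBY_TIMEOUT=3600 llama-server -m model.gguf`. As a hedged sketch of how an env-var fallback like this generally resolves (this is not llama.cpp's actual parser; `resolve_standby_timeout` is hypothetical):

```cpp
#include <cstdio>
#include <cstdlib>

// Hypothetical resolution order: an explicit CLI value wins, the environment
// variable is the fallback, and the documented default (0) applies otherwise.
static int resolve_standby_timeout(int cli_value, bool cli_given) {
    if (cli_given) {
        return cli_value; // explicit --standby-timeout wins
    }
    if (const char * env = std::getenv("LLAMA_ARG_STANDBY_TIMEOUT")) {
        return std::atoi(env); // fall back to the environment
    }
    return 0; // default: standby disabled
}

int main() {
    std::printf("standby timeout: %d s\n", resolve_standby_timeout(0, false));
}
```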
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 127323e77..6eeeca9a3 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -29,6 +29,7 @@
 #include <thread>
 #include <unordered_map>
 #include <unordered_set>
+#include <chrono>
 
 using json = nlohmann::ordered_json;
 
@@ -1413,6 +1415,8 @@ struct server_queue {
     std::function<void(server_task)> callback_new_task;
     std::function<void(void)>        callback_update_slots;
 
+    int standby_timeout;
+
     // Add a new task to the end of the queue
     int post(server_task task, bool front = false) {
         std::unique_lock<std::mutex> lock(mutex_tasks);
@@ -1527,9 +1531,18 @@
                     QUE_DBG("%s", "terminate\n");
                     return;
                 }
-                condition_tasks.wait(lock, [&]{
-                    return (!queue_tasks.empty() || !running);
-                });
+                const auto pred = [&] {
+                    return (!queue_tasks.empty() || !running);
+                };
+                if (standby_timeout > 0) {
+                    if (!condition_tasks.wait_for(lock, std::chrono::seconds(standby_timeout), pred)) {
+                        QUE_INF("%s", "stand-by timeout reached\n");
+                        running = false;
+                        break;
+                    }
+                } else {
+                    condition_tasks.wait(lock, pred);
+                }
             }
         }
     }
@@ -1692,6 +1705,8 @@
 
         n_ctx = llama_n_ctx(ctx);
 
+        queue_tasks.standby_timeout = params.standby_timeout;
+
         add_bos_token = llama_add_bos_token(model);
         has_eos_token = llama_token_eos(model) != LLAMA_TOKEN_NULL;
 
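For readers skimming the patch, here is a self-contained sketch of the standby mechanism above: a consumer loop that drains tasks and shuts down once `wait_for` times out with no work pending. The names (`task_queue`, `post`, `start_loop`) loosely mirror the patch; the integer payload and logging are invented for the demo.

```cpp
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <deque>
#include <mutex>
#include <thread>

struct task_queue {
    std::deque<int>         tasks;
    std::mutex              mutex_tasks;
    std::condition_variable condition_tasks;
    bool                    running         = true;
    int                     standby_timeout = 2; // seconds; <= 0 disables standby

    void post(int t) {
        {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            tasks.push_back(t);
        }
        condition_tasks.notify_one();
    }

    void start_loop() {
        while (true) {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            const auto pred = [&] { return !tasks.empty() || !running; };
            if (standby_timeout > 0) {
                // wait_for returns false only if the timeout expired with pred still false
                if (!condition_tasks.wait_for(lock, std::chrono::seconds(standby_timeout), pred)) {
                    std::puts("stand-by timeout reached, stopping");
                    running = false;
                    break;
                }
            } else {
                condition_tasks.wait(lock, pred); // original behavior: wait forever
            }
            if (!running) {
                break;
            }
            std::printf("processing task %d\n", tasks.front());
            tasks.pop_front();
        }
    }
};

int main() {
    task_queue q;
    std::thread worker([&] { q.start_loop(); });
    q.post(1);
    q.post(2);
    worker.join(); // returns roughly standby_timeout seconds after the queue drains
}
```

The key property, matching the patch, is that `wait_for` returns `false` only when the full timeout elapses with the predicate still unsatisfied, so every task that arrives effectively restarts the standby countdown on the next loop iteration.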