Merge 7006dd784c into c05e8c9934

This commit is contained in: commit 71de9f4046

4 changed files with 27 additions and 3 deletions
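In short: this patch teaches the llama.cpp server to shut itself down after a configurable idle period. A new `--standby-timeout N` option (env: `LLAMA_ARG_STANDBY_TIMEOUT`) is parsed into `common_params`, documented in the server README, and consumed by `server_queue`, whose blocking wait on the task queue becomes a timed wait.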
common/arg.cpp
@@ -1859,6 +1859,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_cache_reuse = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
+    add_opt(common_arg(
+        {"--standby-timeout"}, "N",
+        string_format("seconds that must pass since a request has been served, before the server stops automatically (default: %d)", params.standby_timeout),
+        [](common_params & params, int value) {
+            params.standby_timeout = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STANDBY_TIMEOUT"));
     add_opt(common_arg(
         {"--metrics"},
         string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
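For readers unfamiliar with llama.cpp's argument registry, the sketch below mimics the pattern the hunk above relies on: each `common_arg` bundles flag names, a help string, and a handler lambda that writes into `common_params`, while `set_env` registers an environment-variable fallback. All names in the sketch (`params_t`, `opt_t`, `apply_env`) are simplified stand-ins, not the real implementation.

```cpp
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <string>

// Simplified stand-in for common_params: only the field this patch adds.
struct params_t {
    int standby_timeout = 0; // seconds; 0 leaves the standby shutdown disabled
};

// Simplified stand-in for common_arg: flag name, env fallback, handler.
struct opt_t {
    std::string flag; // e.g. "--standby-timeout"
    std::string env;  // e.g. "LLAMA_ARG_STANDBY_TIMEOUT"
    std::function<void(params_t &, int)> handler;
};

// Apply the env-var fallback, as set_env() arranges in the real parser.
static void apply_env(const opt_t & opt, params_t & params) {
    if (const char * v = std::getenv(opt.env.c_str())) {
        opt.handler(params, std::atoi(v));
    }
}

int main(int argc, char ** argv) {
    params_t params;

    const opt_t standby = {
        "--standby-timeout",
        "LLAMA_ARG_STANDBY_TIMEOUT",
        [](params_t & p, int value) { p.standby_timeout = value; },
    };

    apply_env(standby, params); // env first, so an explicit flag wins below
    for (int i = 1; i + 1 < argc; i++) {
        if (standby.flag == argv[i]) {
            standby.handler(params, std::atoi(argv[i + 1]));
        }
    }

    std::printf("standby_timeout = %d\n", params.standby_timeout);
    return 0;
}
```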
common/common.h
@@ -318,6 +318,7 @@ struct common_params {
     int32_t timeout_write   = timeout_read; // http write timeout in seconds
     int32_t n_threads_http  = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse   = 0;            // min chunk size to reuse from the cache via KV shifting
+    int32_t standby_timeout = 0;            // seconds that must pass since a request has been processed before server terminates in order to save resources. If -1, then never terminate automatically.

     std::string hostname    = "127.0.0.1";
     std::string public_path = ""; // NOLINT
examples/server/README.md
@@ -153,6 +153,7 @@ The project is under active development, and we are [looking for feedback and co
 | `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
+| `--standby-timeout N` | seconds that must pass since a request has been served, before the server stops automatically (default: 0)<br/>(env: LLAMA_ARG_STANDBY_TIMEOUT) |
 | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
 | `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
 | `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
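Going by the help text above, the new option can be supplied either on the command line (for example `llama-server -m model.gguf --standby-timeout 3600`) or through the environment as `LLAMA_ARG_STANDBY_TIMEOUT=3600`; with the default of 0 the server keeps running indefinitely, exactly as before the patch.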
examples/server/server.cpp
@@ -29,6 +29,8 @@
 #include <thread>
 #include <unordered_map>
 #include <unordered_set>
+#include <chrono>
+#include <variant>

 using json = nlohmann::ordered_json;
@@ -1413,6 +1415,8 @@ struct server_queue {
     std::function<void(server_task)> callback_new_task;
     std::function<void(void)>        callback_update_slots;

+    int standby_timeout;
+
     // Add a new task to the end of the queue
     int post(server_task task, bool front = false) {
         std::unique_lock<std::mutex> lock(mutex_tasks);
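Note that `standby_timeout` is declared without an in-class initializer; it is assigned from `params.standby_timeout` during `server_context` initialization (last hunk below), which in the server's startup order happens before the queue loop begins waiting.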
@@ -1527,9 +1531,18 @@ struct server_queue {
                 QUE_DBG("%s", "terminate\n");
                 return;
             }
-            condition_tasks.wait(lock, [&]{
-                return (!queue_tasks.empty() || !running);
-            });
+            const auto pred = [&] {
+                return (!queue_tasks.empty() || !running);
+            };
+            if (standby_timeout > 0) {
+                if (!condition_tasks.wait_for(lock, std::chrono::seconds(standby_timeout), pred)) {
+                    QUE_INF("%s", "stand-by timeout reached\n");
+                    running = false;
+                    break;
+                }
+            } else {
+                condition_tasks.wait(lock, pred);
+            }
         }
     }
 }
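The core of the change is the switch from an untimed `condition_variable::wait` to `wait_for`: with a predicate, `wait_for` returns false exactly when the timeout expires while the queue is still empty, which is the signal to shut the loop down. The following self-contained sketch reproduces that pattern outside the server (names like `task_queue`, `push`, and `loop` are illustrative, not from the patch):

```cpp
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <queue>
#include <thread>

struct task_queue {
    std::queue<int>         tasks;
    std::mutex              mutex;
    std::condition_variable cv;
    bool                    running         = true;
    int                     standby_timeout = 2; // seconds, like the new field

    void push(int t) {
        {
            std::lock_guard<std::mutex> lock(mutex);
            tasks.push(t);
        }
        cv.notify_one();
    }

    void loop() {
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            const auto pred = [&] { return !tasks.empty() || !running; };
            if (standby_timeout > 0) {
                // wait_for returns false iff the timeout elapsed with pred still false
                if (!cv.wait_for(lock, std::chrono::seconds(standby_timeout), pred)) {
                    std::printf("stand-by timeout reached\n");
                    running = false;
                    break;
                }
            } else {
                cv.wait(lock, pred); // old behavior: block until work or shutdown
            }
            if (!running) {
                break;
            }
            std::printf("processing task %d\n", tasks.front());
            tasks.pop();
        }
    }
};

int main() {
    task_queue q;
    std::thread worker(&task_queue::loop, &q);
    q.push(1);
    q.push(2);
    worker.join(); // returns roughly standby_timeout seconds after the queue drains
}
```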
@@ -1692,6 +1705,8 @@ struct server_context {

         n_ctx = llama_n_ctx(ctx);

+        queue_tasks.standby_timeout = params.standby_timeout;
+
         add_bos_token = llama_add_bos_token(model);
         has_eos_token = llama_token_eos(model) != LLAMA_TOKEN_NULL;
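This last hunk is the only wiring needed: the parsed `params.standby_timeout` is copied onto the queue right after the llama context is created, so the `wait_for` branch above sees the configured value from the first iteration of the loop.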