diff --git a/common/arg.cpp b/common/arg.cpp index 2a85ad845..7f5c05a34 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1838,9 +1838,23 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.endpoint_metrics = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); + add_opt(llama_arg( + {"--slots"}, + format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), + [](gpt_params & params) { + params.endpoint_slots = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); + add_opt(llama_arg( + {"--props"}, + format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"), + [](gpt_params & params) { + params.endpoint_props = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); add_opt(llama_arg( {"--no-slots"}, - format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), + "disables slots monitoring endpoint", [](gpt_params & params) { params.endpoint_slots = false; } diff --git a/common/common.h b/common/common.h index 8b84cf9ad..65add1f30 100644 --- a/common/common.h +++ b/common/common.h @@ -290,7 +290,10 @@ struct gpt_params { std::string ssl_file_key = ""; // NOLINT std::string ssl_file_cert = ""; // NOLINT - bool endpoint_slots = true; + // "advanced" endpoints are disabled by default for better security + bool webui = true; + bool endpoint_slots = false; + bool endpoint_props = false; // only control POST requests, not GET bool endpoint_metrics = false; bool log_json = false; diff --git a/examples/server/README.md b/examples/server/README.md index 6253de43c..ef1f026df 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -380,8 +380,6 @@ node index.js `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false` - `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) - `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values. **Response format** @@ -519,34 +517,41 @@ Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/B Takes a prefix and a suffix and returns the predicted completion as stream. - *Options:* +*Options:* - `input_prefix`: Set the prefix of the code to infill. +- `input_prefix`: Set the prefix of the code to infill. +- `input_suffix`: Set the suffix of the code to infill. - `input_suffix`: Set the suffix of the code to infill. +It also accepts all the options of `/completion` except `stream` and `prompt`. - It also accepts all the options of `/completion` except `stream` and `prompt`. +### **GET** `/props`: Get server global properties. -- **GET** `/props`: Return current server settings. +This endpoint is public (no API key check). By default, it is read-only. To make POST request to change global properties, you need to start server with `--props` **Response format** ```json { - "assistant_name": "", - "user_name": "", + "system_prompt": "", "default_generation_settings": { ... }, "total_slots": 1, "chat_template": "" } ``` -- `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots. -- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots. +- `system_prompt` - the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will append the prompt at the beginning of formatted prompt. - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint. - `total_slots` - the total number of slots for process requests (defined by `--parallel` option) - `chat_template` - the model's original Jinja2 prompt template +### POST `/props`: Change server global properties. + +To use this endpoint with POST method, you need to start server with `--props` + +*Options:* + +- `system_prompt`: Change the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will append the prompt at the beginning of formatted prompt. + ### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. @@ -813,28 +818,6 @@ To know the `id` of the adapter, use GET `/lora-adapters` ## More examples -### Change system prompt on runtime - -To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt`. This only needs to be used once. - -`prompt`: Specify a context that you want all connecting clients to respect. - -`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint. - -`assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint. - -```json -{ - "system_prompt": { - "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:", - "anti_prompt": "User:", - "assistant_name": "Assistant:" - } -} -``` - -**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`. - ### Interactive mode Check the sample in [chat.mjs](chat.mjs). diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 13e54e501..a5c38970a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1106,12 +1106,7 @@ struct server_context { SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str()); system_prompt = sys_prompt; - - // release all slots - for (server_slot & slot : slots) { - slot.release(); - } - + // update system_tokens and KV cache as soon as all slots are idle system_need_update = true; return true; } @@ -1627,16 +1622,6 @@ struct server_context { break; } - if (task.data.contains("system_prompt")) { - std::string sys_prompt = json_value(task.data, "system_prompt", std::string()); - system_prompt_set(sys_prompt); - - for (server_slot & slot : slots) { - slot.n_past = 0; - slot.n_past_se = 0; - } - } - slot->reset(); slot->id_task = task.id; @@ -1862,10 +1847,6 @@ struct server_context { } void update_slots() { - if (system_need_update) { - system_prompt_update(); - } - // check if all slots are idle { bool all_idle = true; @@ -1878,6 +1859,10 @@ struct server_context { } if (all_idle) { + if (system_need_update) { + system_prompt_update(); + } + SRV_INF("%s", "all slots are idle\n"); if (system_prompt.empty() && clean_kv_cache) { kv_cache_clear(); @@ -2536,20 +2521,11 @@ int main(int argc, char ** argv) { // auto middleware_validate_api_key = [¶ms, &res_error](const httplib::Request & req, httplib::Response & res) { - // TODO: should we apply API key to all endpoints, including "/health" and "/models"? - static const std::unordered_set protected_endpoints = { + static const std::unordered_set public_endpoints = { + "/health", + "/models", + "/v1/models", "/props", - "/completion", - "/completions", - "/v1/completions", - "/chat/completions", - "/v1/chat/completions", - "/infill", - "/tokenize", - "/detokenize", - "/embedding", - "/embeddings", - "/v1/embeddings", }; // If API key is not set, skip validation @@ -2557,8 +2533,8 @@ int main(int argc, char ** argv) { return true; } - // If path is not in protected_endpoints list, skip validation - if (protected_endpoints.find(req.path) == protected_endpoints.end()) { + // If path is public, skip validation + if (public_endpoints.find(req.path) != public_endpoints.end()) { return true; } @@ -2620,7 +2596,7 @@ int main(int argc, char ** argv) { const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) { if (!params.endpoint_slots) { - res_error(res, format_error_response("This server does not support slots endpoint. Start it without `--no-slots`", ERROR_TYPE_NOT_SUPPORTED)); + res_error(res, format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -2869,24 +2845,31 @@ int main(int argc, char ** argv) { }; const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { - std::string template_key = "tokenizer.chat_template", curr_tmpl; - int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0); - if (tlen > 0) { - std::vector curr_tmpl_buf(tlen + 1, 0); - if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) { - curr_tmpl = std::string(curr_tmpl_buf.data(), tlen); - } - } json data = { - { "system_prompt", ctx_server.system_prompt.c_str() }, + { "system_prompt", ctx_server.system_prompt }, { "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "total_slots", ctx_server.params.n_parallel }, - { "chat_template", curr_tmpl.c_str() }, + { "chat_template", llama_get_chat_template(ctx_server.model) }, }; res_ok(res, data); }; + const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { + if (!ctx_server.params.endpoint_props) { + res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); + return; + } + + json data = json::parse(req.body); + if (data.contains("system_prompt")) { + std::string system_prompt = data.at("system_prompt"); + ctx_server.system_prompt_set(system_prompt); + } + + res_ok(res, {{ "success", true }}); + }; + const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_cmpl_type cmpl_type, json & data, httplib::Response & res) { if (ctx_server.params.embedding || ctx_server.params.reranking) { res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); @@ -3265,30 +3248,39 @@ int main(int argc, char ** argv) { svr->set_base_dir(params.public_path); } - // using embedded static files - svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); - svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); - svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); - svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); + if (!params.api_keys.empty()) { + // for now, if API key is set, web UI is unusable + svr->Get("/", [&](const httplib::Request & req, httplib::Response & res) { + return res.set_content("Web UI is disabled because API key is set.", "text/html; charset=utf-8"); + }); + } else { + // using embedded static files + svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); + svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); + svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); + svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); - // add new-ui files - svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8")); - svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8")); - svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8")); - svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8")); - svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8")); + // add new-ui files + svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8")); + svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8")); + svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8")); + svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8")); + svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8")); + } // register API routes - svr->Get ("/health", handle_health); + svr->Get ("/health", handle_health); // public endpoint (no API key check) svr->Get ("/metrics", handle_metrics); - svr->Get ("/props", handle_props); - svr->Get ("/v1/models", handle_models); + svr->Get ("/props", handle_props); // public endpoint (no API key check) + svr->Post("/props", handle_props_change); + svr->Get ("/models", handle_models); // public endpoint (no API key check) + svr->Get ("/v1/models", handle_models); // public endpoint (no API key check) svr->Post("/completion", handle_completions); // legacy svr->Post("/completions", handle_completions); svr->Post("/v1/completions", handle_completions); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 47dfdfde5..701a7851f 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -90,6 +90,19 @@ inline std::string format_chat(const struct llama_model * model, const std::stri return formatted_chat; } +std::string llama_get_chat_template(const struct llama_model * model) { + std::string template_key = "tokenizer.chat_template"; + // call with NULL buffer to get the total size of the string + int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0); + if (res < 0) { + return ""; + } else { + std::vector model_template(res, 0); + llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); + return std::string(model_template.data(), model_template.size()); + } +} + // // base64 utils (TODO: move to common in the future) //