diff --git a/common/arg.cpp b/common/arg.cpp index 3c8101091..eb5f35f34 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1859,9 +1859,23 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.endpoint_metrics = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); + add_opt(llama_arg( + {"--slots"}, + format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), + [](gpt_params & params) { + params.endpoint_slots = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); + add_opt(llama_arg( + {"--props"}, + format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"), + [](gpt_params & params) { + params.endpoint_props = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); add_opt(llama_arg( {"--no-slots"}, - format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), + "disables slots monitoring endpoint", [](gpt_params & params) { params.endpoint_slots = false; } diff --git a/common/common.h b/common/common.h index 7851b4bc4..4e2bbb9de 100644 --- a/common/common.h +++ b/common/common.h @@ -294,7 +294,10 @@ struct gpt_params { std::string ssl_file_key = ""; // NOLINT std::string ssl_file_cert = ""; // NOLINT - bool endpoint_slots = true; + // "advanced" endpoints are disabled by default for better security + bool webui = true; + bool endpoint_slots = false; + bool endpoint_props = false; // only control POST requests, not GET bool endpoint_metrics = false; bool log_json = false; diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 0051a5eb6..644d46a62 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -314,9 +314,9 @@ struct lora_merge_ctx { // optionally dequantize it printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type)); auto nels = ggml_nelements(inp_base); - ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type); + const auto * qtype = ggml_get_type_traits(base->type); std::vector dequant_buf(nels * sizeof(float)); - qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels); + qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels); ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size()); } else { ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base)); diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 498cbbe3c..e372856c6 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) { } static void test_roundtrip_on_chunk( - const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference, + const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, bool use_reference, float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats ) { if (layer->type == GGML_TYPE_F16) { @@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk( // Run quantization function for a single layer and update error stats static void test_roundtrip_on_layer( - std::string & name, bool print_layer_stats, const 
ggml_type_traits_t & qfns, bool use_reference, + std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, bool use_reference, const ggml_tensor * layer, std::vector & input_scratch, std::vector & quantized_scratch, std::vector & output_scratch, error_stats & total_error, int max_thread = 0 ) { @@ -371,8 +371,8 @@ int main(int argc, char ** argv) { if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { continue; } - ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); - if (qfns.from_float && qfns.to_float) { + const auto * qfns = ggml_get_type_traits(type); + if (qfns->from_float && qfns->to_float) { if (params.verbose) { printf("testing %s ...\n", ggml_type_name(type)); } @@ -393,7 +393,7 @@ int main(int argc, char ** argv) { test_roundtrip_on_layer( layer_name, params.per_layer_stats, - qfns, + *qfns, params.reference, kv_tensor.second, input_scratch, diff --git a/examples/server/README.md b/examples/server/README.md index 6253de43c..09d1cf097 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -18,6 +18,8 @@ The project is under active development, and we are [looking for feedback and co ## Usage + + **Common params** | Argument | Explanation | @@ -149,7 +151,9 @@ The project is under active development, and we are [looking for feedback and co | `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) |
 | `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
 | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
-| `--no-slots` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
+| `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
+| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
+| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
| @@ -380,8 +384,6 @@ node index.js `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false` - `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) - `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values. **Response format** @@ -519,34 +521,41 @@ Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/B Takes a prefix and a suffix and returns the predicted completion as stream. - *Options:* +*Options:* - `input_prefix`: Set the prefix of the code to infill. +- `input_prefix`: Set the prefix of the code to infill. +- `input_suffix`: Set the suffix of the code to infill. - `input_suffix`: Set the suffix of the code to infill. +It also accepts all the options of `/completion` except `stream` and `prompt`. - It also accepts all the options of `/completion` except `stream` and `prompt`. +### **GET** `/props`: Get server global properties. -- **GET** `/props`: Return current server settings. +This endpoint is public (no API key check). By default, it is read-only. To make POST requests that change global properties, you need to start the server with `--props`. **Response format** ```json { - "assistant_name": "", - "user_name": "", + "system_prompt": "", "default_generation_settings": { ... }, "total_slots": 1, "chat_template": "" } ``` -- `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots. -- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots. +- `system_prompt` - the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will be prepended to the formatted prompt. - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint. - `total_slots` - the total number of slots for process requests (defined by `--parallel` option) - `chat_template` - the model's original Jinja2 prompt template +### POST `/props`: Change server global properties. + +To use this endpoint with the POST method, you need to start the server with `--props`. + +*Options:* + +- `system_prompt`: Change the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will be prepended to the formatted prompt. + ### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine.
While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. @@ -813,28 +822,6 @@ To know the `id` of the adapter, use GET `/lora-adapters` ## More examples -### Change system prompt on runtime - -To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt`. This only needs to be used once. - -`prompt`: Specify a context that you want all connecting clients to respect. - -`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint. - -`assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint. - -```json -{ - "system_prompt": { - "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:", - "anti_prompt": "User:", - "assistant_name": "Assistant:" - } -} -``` - -**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`. - ### Interactive mode Check the sample in [chat.mjs](chat.mjs). 
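For illustration (this block is not part of the diff): a minimal client sketch of the GET/POST `/props` flow described in the README changes above, assuming a server started locally with `--props`. The host, port, and prompt are placeholders, and cpp-httplib plus nlohmann::json (the headers vendored with the server example) are used as the client libraries.

```cpp
// Illustrative client sketch (not part of this patch), assuming a server started with:
//   llama-server -m model.gguf --props --slots
// Uses the httplib.h / json.hpp headers vendored with the server example.
#include "httplib.h"
#include "json.hpp"

#include <cstdio>

using json = nlohmann::json;

int main() {
    httplib::Client cli("http://localhost:8080"); // host and port are placeholders

    // GET /props is public (no API key check) and read-only by default
    if (auto res = cli.Get("/props")) {
        const json props = json::parse(res->body);
        printf("total_slots: %d\n", props.at("total_slots").get<int>());
    }

    // POST /props is rejected with ERROR_TYPE_NOT_SUPPORTED unless the
    // server was started with --props
    const json body = { { "system_prompt", "You are a helpful assistant." } };
    if (auto res = cli.Post("/props", body.dump(), "application/json")) {
        printf("POST /props -> %d: %s\n", res->status, res->body.c_str());
    }

    return 0;
}
```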
diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 13e54e501..aedfca0d6 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1106,12 +1106,7 @@ struct server_context { SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str()); system_prompt = sys_prompt; - - // release all slots - for (server_slot & slot : slots) { - slot.release(); - } - + // update system_tokens and KV cache as soon as all slots are idle system_need_update = true; return true; } @@ -1627,16 +1622,6 @@ struct server_context { break; } - if (task.data.contains("system_prompt")) { - std::string sys_prompt = json_value(task.data, "system_prompt", std::string()); - system_prompt_set(sys_prompt); - - for (server_slot & slot : slots) { - slot.n_past = 0; - slot.n_past_se = 0; - } - } - slot->reset(); slot->id_task = task.id; @@ -1862,10 +1847,6 @@ struct server_context { } void update_slots() { - if (system_need_update) { - system_prompt_update(); - } - // check if all slots are idle { bool all_idle = true; @@ -1878,6 +1859,10 @@ struct server_context { } if (all_idle) { + if (system_need_update) { + system_prompt_update(); + } + SRV_INF("%s", "all slots are idle\n"); if (system_prompt.empty() && clean_kv_cache) { kv_cache_clear(); @@ -2536,20 +2521,10 @@ int main(int argc, char ** argv) { // auto middleware_validate_api_key = [¶ms, &res_error](const httplib::Request & req, httplib::Response & res) { - // TODO: should we apply API key to all endpoints, including "/health" and "/models"? - static const std::unordered_set protected_endpoints = { - "/props", - "/completion", - "/completions", - "/v1/completions", - "/chat/completions", - "/v1/chat/completions", - "/infill", - "/tokenize", - "/detokenize", - "/embedding", - "/embeddings", - "/v1/embeddings", + static const std::unordered_set public_endpoints = { + "/health", + "/models", + "/v1/models", }; // If API key is not set, skip validation @@ -2557,8 +2532,8 @@ int main(int argc, char ** argv) { return true; } - // If path is not in protected_endpoints list, skip validation - if (protected_endpoints.find(req.path) == protected_endpoints.end()) { + // If path is public, skip validation + if (public_endpoints.find(req.path) != public_endpoints.end()) { return true; } @@ -2620,7 +2595,7 @@ int main(int argc, char ** argv) { const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) { if (!params.endpoint_slots) { - res_error(res, format_error_response("This server does not support slots endpoint. Start it without `--no-slots`", ERROR_TYPE_NOT_SUPPORTED)); + res_error(res, format_error_response("This server does not support slots endpoint. 
Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -2869,24 +2844,31 @@ int main(int argc, char ** argv) { }; const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { - std::string template_key = "tokenizer.chat_template", curr_tmpl; - int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0); - if (tlen > 0) { - std::vector curr_tmpl_buf(tlen + 1, 0); - if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) { - curr_tmpl = std::string(curr_tmpl_buf.data(), tlen); - } - } json data = { - { "system_prompt", ctx_server.system_prompt.c_str() }, + { "system_prompt", ctx_server.system_prompt }, { "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "total_slots", ctx_server.params.n_parallel }, - { "chat_template", curr_tmpl.c_str() }, + { "chat_template", llama_get_chat_template(ctx_server.model) }, }; res_ok(res, data); }; + const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { + if (!ctx_server.params.endpoint_props) { + res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); + return; + } + + json data = json::parse(req.body); + if (data.contains("system_prompt")) { + std::string system_prompt = data.at("system_prompt"); + ctx_server.system_prompt_set(system_prompt); + } + + res_ok(res, {{ "success", true }}); + }; + const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_cmpl_type cmpl_type, json & data, httplib::Response & res) { if (ctx_server.params.embedding || ctx_server.params.reranking) { res_error(res, format_error_response("This server does not support completions. 
Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); @@ -3265,30 +3247,39 @@ int main(int argc, char ** argv) { svr->set_base_dir(params.public_path); } - // using embedded static files - svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); - svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); - svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); - svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); + if (!params.api_keys.empty()) { + // for now, if API key is set, web UI is unusable + svr->Get("/", [&](const httplib::Request &, httplib::Response & res) { + return res.set_content("Web UI is disabled because API key is set.", "text/html; charset=utf-8"); + }); + } else { + // using embedded static files + svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); + svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); + svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); + svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); - // add new-ui files - svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8")); - svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8")); - svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8")); - svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8")); - svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8")); + // add new-ui files + svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8")); + svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-playground.css", 
handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8")); + svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8")); + svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8")); + svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8")); + } // register API routes - svr->Get ("/health", handle_health); + svr->Get ("/health", handle_health); // public endpoint (no API key check) svr->Get ("/metrics", handle_metrics); svr->Get ("/props", handle_props); - svr->Get ("/v1/models", handle_models); + svr->Post("/props", handle_props_change); + svr->Get ("/models", handle_models); // public endpoint (no API key check) + svr->Get ("/v1/models", handle_models); // public endpoint (no API key check) svr->Post("/completion", handle_completions); // legacy svr->Post("/completions", handle_completions); svr->Post("/v1/completions", handle_completions); diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature index eb82e7aca..0a3c5cc77 100644 --- a/examples/server/tests/features/security.feature +++ b/examples/server/tests/features/security.feature @@ -5,7 +5,7 @@ Feature: Security Background: Server startup with an api key defined Given a server listening on localhost:8080 And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models - And a server api key llama.cpp + And a server api key THIS_IS_THE_KEY Then the server is starting Then the server is healthy @@ -16,11 +16,11 @@ Feature: Security And a completion request with api error Examples: Prompts - | api_key | api_error | - | llama.cpp | no | - | llama.cpp | no | - | hackeme | raised | - | | raised | + | api_key | api_error | + | THIS_IS_THE_KEY | no | + | THIS_IS_THE_KEY | no | + | hackeme | raised | + | | raised | Scenario Outline: OAI Compatibility Given a system prompt test @@ -32,10 +32,10 @@ Feature: Security Given an OAI compatible chat completions request with api error Examples: Prompts - | api_key | api_error | - | llama.cpp | no | - | llama.cpp | no | - | hackme | raised | + | api_key | api_error | + | THIS_IS_THE_KEY | no | + | THIS_IS_THE_KEY | no | + | hackme | raised | Scenario Outline: OAI Compatibility (invalid response formats) Given a system prompt test @@ -55,7 +55,7 @@ Feature: Security Scenario Outline: CORS Options - Given a user api key llama.cpp + Given a user api key THIS_IS_THE_KEY When an OPTIONS request is sent from Then CORS header is set to diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 2611614ba..540a2ecd5 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -1299,7 +1299,8 @@ async def wait_for_slots_status(context, async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: while True: - async with await session.get(f'{base_url}/slots', params=params) as slots_response: + headers = {'Authorization': f'Bearer {context.server_api_key}'} + async with await session.get(f'{base_url}/slots', params=params, headers=headers) as 
slots_response: status_code = slots_response.status slots = await slots_response.json() if context.debug: @@ -1387,6 +1388,7 @@ def start_server_background(context): context.server_path = os.environ['LLAMA_SERVER_BIN_PATH'] server_listen_addr = context.server_fqdn server_args = [ + '--slots', # requires to get slot status via /slots endpoint '--host', server_listen_addr, '--port', context.server_port, ] diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 47dfdfde5..452606cca 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -90,6 +90,19 @@ inline std::string format_chat(const struct llama_model * model, const std::stri return formatted_chat; } +static std::string llama_get_chat_template(const struct llama_model * model) { + std::string template_key = "tokenizer.chat_template"; + // call with NULL buffer to get the total size of the string + int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0); + if (res < 0) { + return ""; + } else { + std::vector model_template(res, 0); + llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); + return std::string(model_template.data(), model_template.size()); + } +} + // // base64 utils (TODO: move to common in the future) // diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index e7678d071..4508da4fb 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2535,7 +2535,7 @@ extern "C" { typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y, int nr, int nc); - typedef struct { + struct ggml_type_traits { const char * type_name; int64_t blck_size; int64_t blck_size_interleave; // interleave elements in blocks @@ -2551,9 +2551,9 @@ extern "C" { int64_t ncols; // number of columns to process simultaneously ggml_gemv_t gemv; ggml_gemm_t gemm; - } ggml_type_traits_t; + }; - GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); + GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); #ifdef __cplusplus } diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index fbd49d13d..627b4dbc7 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1177,7 +1177,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st op->type != GGML_TYPE_IQ1_S && op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float case GGML_OP_MUL_MAT: - return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; + return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type; case GGML_OP_ROPE_BACK: return op->src[2] == NULL && (op->op_params[2] & 4) == 0; case GGML_OP_IM2COL_BACK: diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp index 0c6574de5..55f724586 100644 --- a/ggml/src/ggml-blas.cpp +++ b/ggml/src/ggml-blas.cpp @@ -65,8 +65,8 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg // convert src0 to float if (type != GGML_TYPE_F32) { - ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type); - ggml_to_float_t const to_float = type_traits.to_float; + const auto * type_traits = ggml_get_type_traits(type); + ggml_to_float_t const to_float = type_traits->to_float; for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -420,19 +420,21 @@ static bool 
ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const s // TODO: find the optimal value const int64_t min_batch = 32; - return (ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - src1->type == GGML_TYPE_F32 && - (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch)); + return ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && + src1->type == GGML_TYPE_F32 && + (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) && + (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); } case GGML_OP_OUT_PROD: - return (op->src[0]->type == GGML_TYPE_F32 && - op->src[1]->type == GGML_TYPE_F32 && - ggml_is_matrix(src0) && - ggml_is_matrix(src1) && - ggml_is_contiguous(src0) && - (ggml_is_contiguous(src1) || ggml_is_transposed(src1))); + return op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32 && + ggml_is_matrix(src0) && + ggml_is_matrix(src1) && + ggml_is_contiguous(src0) && + (ggml_is_contiguous(src1) || ggml_is_transposed(src1)) && + (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); default: return false; diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 30bd376da..374c6ecd7 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -5287,9 +5287,9 @@ static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, gg return; } - ggml_type_traits_t tt = ggml_internal_get_type_traits(quant); + const auto * tt = ggml_get_type_traits(quant); - ggml_to_float_t dequant_fn = tt.to_float; + ggml_to_float_t dequant_fn = tt->to_float; dequant_fn(from, to, ne); } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 03b832d0f..3f01092d9 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -729,7 +729,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc); static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc); -static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { +static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { .type_name = "i8", .blck_size = 1, @@ -1151,9 +1151,9 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { }; // For internal test use -ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { +const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { GGML_ASSERT(type < GGML_TYPE_COUNT); - return type_traits[type]; + return &type_traits[type]; } // diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp index 1a52ff5e9..131d7c177 100644 --- a/pocs/vdot/q8dot.cpp +++ b/pocs/vdot/q8dot.cpp @@ -136,7 +136,7 @@ int main(int argc, char** argv) { auto ggml_type = type == 0 ? 
GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1; - auto funcs = ggml_internal_get_type_traits(ggml_type); + const auto * funcs = ggml_get_type_traits(ggml_type); Stat simple, ggml; @@ -156,8 +156,8 @@ int main(int argc, char** argv) { t1 = std::chrono::high_resolution_clock::now(); float fs; - if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1); - else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1); + if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1); + else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1); t2 = std::chrono::high_resolution_clock::now(); t = 1e-3*std::chrono::duration_cast(t2-t1).count(); if (iloop > 3) ggml.addResult(fs, t); diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp index 17e9e4482..88e66ea13 100644 --- a/pocs/vdot/vdot.cpp +++ b/pocs/vdot/vdot.cpp @@ -236,7 +236,7 @@ int main(int argc, char** argv) { int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64); int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64); - auto funcs = useQ4_1 ? ggml_internal_get_type_traits(GGML_TYPE_Q4_1) : ggml_internal_get_type_traits(GGML_TYPE_Q4_0); + const auto * funcs = useQ4_1 ? ggml_get_type_traits(GGML_TYPE_Q4_1) : ggml_get_type_traits(GGML_TYPE_Q4_0); std::vector q40; std::vector q41; @@ -261,9 +261,9 @@ int main(int argc, char** argv) { // Note, we do not include this in the timing as in practical application // we already have the quantized model weights. if (useQ4_1) { - funcs.from_float(x1.data(), q41.data(), kVecSize); + funcs->from_float(x1.data(), q41.data(), kVecSize); } else { - funcs.from_float(x1.data(), q40.data(), kVecSize); + funcs->from_float(x1.data(), q40.data(), kVecSize); } // Now measure time the dot product needs using the "scalar" version above @@ -282,10 +282,10 @@ int main(int argc, char** argv) { dot_q4_q8(kVecSize, &result, q40.data(), q8.data()); } else { - auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type); - vdot.from_float(y1.data(), q8.data(), kVecSize); - if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1); - else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1); + const auto * vdot = ggml_get_type_traits(funcs->vec_dot_type); + vdot->from_float(y1.data(), q8.data(), kVecSize); + if (useQ4_1) funcs->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1); + else funcs->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1); } sumq += result; t2 = std::chrono::high_resolution_clock::now(); diff --git a/scripts/debug-test.sh b/scripts/debug-test.sh index 91946c514..c6c1e988a 100755 --- a/scripts/debug-test.sh +++ b/scripts/debug-test.sh @@ -110,7 +110,7 @@ rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir" ########################################################### # Note: test-eval-callback requires -DLLAMA_CURL -cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build enviroment" +cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build environment" pushd "$build_dir" make -j || abort "Failed to compile" popd > /dev/null || exit 1 @@ -127,7 +127,7 @@ printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n" pushd "$build_dir" tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1')) if [ ${#tests[@]} -eq 0 ]; then - abort "No tests avaliable... check your compliation process..." 
+ abort "No tests available... check your compilation process..." fi popd > /dev/null || exit 1 @@ -137,7 +137,7 @@ popd > /dev/null || exit 1 # Select test number if [ -z $test_number ]; then - # List out avaliable tests + # List out available tests printf "Which test would you like to debug?\n" id=0 for s in "${tests[@]}" diff --git a/src/llama.cpp b/src/llama.cpp index 3fb8132f0..01cdf17dc 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -17872,10 +17872,9 @@ static void llama_tensor_dequantize_internal( } float * f32_output = (float *) output.data(); - ggml_type_traits_t qtype; + const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type); if (ggml_is_quantized(tensor->type)) { - qtype = ggml_internal_get_type_traits(tensor->type); - if (qtype.to_float == NULL) { + if (qtype->to_float == NULL) { throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type))); } } else if (tensor->type != GGML_TYPE_F16 && @@ -17889,7 +17888,7 @@ static void llama_tensor_dequantize_internal( } else if (tensor->type == GGML_TYPE_BF16) { ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements); } else if (ggml_is_quantized(tensor->type)) { - qtype.to_float(tensor->data, f32_output, nelements); + qtype->to_float(tensor->data, f32_output, nelements); } else { GGML_ABORT("fatal error"); // unreachable } @@ -17925,7 +17924,7 @@ static void llama_tensor_dequantize_internal( } else if (typ == GGML_TYPE_BF16) { ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels); } else { - qtype.to_float(inbuf, outbuf, nels); + qtype->to_float(inbuf, outbuf, nels); } }; workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems); diff --git a/src/unicode-data.cpp b/src/unicode-data.cpp index 07424bbab..04dcd7fcf 100644 --- a/src/unicode-data.cpp +++ b/src/unicode-data.cpp @@ -2311,7 +2311,7 @@ const std::unordered_set unicode_set_whitespace = { 0x003000, }; -// list is always in ascending order, to enable binary searh +// list is always in ascending order, to enable binary search const std::initializer_list> unicode_map_lowercase = { {0x000041, 0x000061}, {0x000042, 0x000062}, @@ -3748,7 +3748,7 @@ const std::initializer_list> unicode_map_lowercase {0x01E921, 0x01E943}, }; -// list is always in ascending order, to enable binary searh +// list is always in ascending order, to enable binary search const std::initializer_list> unicode_map_uppercase = { {0x000061, 0x000041}, {0x000062, 0x000042}, diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index fa26cc653..ee1a8877e 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -133,7 +133,7 @@ static std::vector tensor_to_float(const ggml_tensor * t) { std::vector buf(ggml_nbytes(t)); ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t)); - ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type); + const auto * tt = ggml_get_type_traits(t->type); size_t bs = ggml_blck_size(t->type); std::vector vq(ggml_blck_size(t->type)); bool quantized = ggml_is_quantized(t->type); @@ -159,7 +159,7 @@ static std::vector tensor_to_float(const ggml_tensor * t) { } else if (t->type == GGML_TYPE_I8) { tv.push_back((float)*(int8_t *) &buf[i]); } else if (quantized) { - tt.to_float(&buf[i], vq.data(), bs); + tt->to_float(&buf[i], vq.data(), bs); tv.insert(tv.end(), vq.begin(), vq.end()); } else { GGML_ABORT("fatal error"); diff --git a/tests/test-quantize-fns.cpp 
b/tests/test-quantize-fns.cpp index ccf5721a3..d50417ba0 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -44,26 +44,26 @@ static float array_rmse(const float * a1, const float * a2, size_t n) { } // Total quantization error on test data -static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { +static float total_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) { std::vector tmp_q(2*test_size); std::vector tmp_out(test_size); - qfns.from_float(test_data, tmp_q.data(), test_size); - qfns.to_float(tmp_q.data(), tmp_out.data(), test_size); + qfns->from_float(test_data, tmp_q.data(), test_size); + qfns->to_float(tmp_q.data(), tmp_out.data(), test_size); return array_rmse(test_data, tmp_out.data(), test_size); } // Total quantization error on test data -static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { +static float reference_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) { std::vector tmp_q(2*test_size); std::vector tmp_out(test_size); std::vector tmp_out_ref(test_size); - qfns.from_float(test_data, tmp_q.data(), test_size); - qfns.to_float(tmp_q.data(), tmp_out.data(), test_size); + qfns->from_float(test_data, tmp_q.data(), test_size); + qfns->to_float(tmp_q.data(), tmp_out.data(), test_size); - qfns.from_float_ref(test_data, tmp_q.data(), test_size); - qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size); + qfns->from_float_ref(test_data, tmp_q.data(), test_size); + qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size); return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size); } @@ -78,18 +78,18 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) { // Total dot product error static float dot_product_error( - ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2 + const ggml_type_traits * qfns, size_t test_size, const float * test_data1, const float *test_data2 ) { std::vector tmp_q1(2*test_size); std::vector tmp_q2(2*test_size); - auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type); + const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type); - qfns.from_float(test_data1, tmp_q1.data(), test_size); - vdot.from_float(test_data2, tmp_q2.data(), test_size); + qfns->from_float(test_data1, tmp_q1.data(), test_size); + vdot->from_float(test_data2, tmp_q2.data(), test_size); float result = INFINITY; - qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1); + qfns->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1); const float dot_ref = dot_product(test_data1, test_data2, test_size); @@ -131,10 +131,10 @@ int main(int argc, char * argv[]) { for (int i = 0; i < GGML_TYPE_COUNT; i++) { ggml_type type = (ggml_type) i; - ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); + const auto * qfns = ggml_get_type_traits(type); // deprecated - skip - if (qfns.blck_size == 0) { + if (qfns->blck_size == 0) { continue; } @@ -143,7 +143,7 @@ int main(int argc, char * argv[]) { printf("Testing %s\n", ggml_type_name((ggml_type) i)); ggml_quantize_init(ei); - if (qfns.from_float && qfns.to_float) { + if (qfns->from_float && qfns->to_float) { const float total_error = total_quantization_error(qfns, test_size, test_data.data()); const float max_quantization_error = type == GGML_TYPE_TQ1_0 ? 
MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index 24e066053..bdbdd90a8 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -122,9 +122,9 @@ static void usage(char * argv[]) { printf(" --type TYPE set test type as"); for (int i = 0; i < GGML_TYPE_COUNT; i++) { ggml_type type = (ggml_type) i; - ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); + const auto * qfns = ggml_get_type_traits(type); if (ggml_type_name(type) != NULL) { - if (qfns.from_float && qfns.to_float) { + if (qfns->from_float && qfns->to_float) { printf(" %s", ggml_type_name(type)); } } @@ -270,12 +270,12 @@ int main(int argc, char * argv[]) { for (int i = 0; i < GGML_TYPE_COUNT; i++) { ggml_type type = (ggml_type) i; - ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); + const auto * qfns = ggml_get_type_traits(type); if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) { continue; } - if (qfns.from_float && qfns.to_float) { + if (qfns->from_float && qfns->to_float) { printf("%s\n", ggml_type_name(type)); ggml_quantize_init(type); @@ -285,7 +285,7 @@ int main(int argc, char * argv[]) { for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void) -> float { - qfns.from_float_ref(test_data1, test_q1, size); + qfns->from_float_ref(test_data1, test_q1, size); return test_q1[0]; }; size_t quantized_size = ggml_row_size(type, size); @@ -299,7 +299,7 @@ int main(int argc, char * argv[]) { for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void) -> float { - qfns.from_float(test_data1, test_q1, size); + qfns->from_float(test_data1, test_q1, size); return test_q1[0]; }; size_t quantized_size = ggml_row_size(type, size); @@ -310,11 +310,11 @@ int main(int argc, char * argv[]) { if (params.op_dequantize_row_q) { printf(" dequantize_row_q\n"); - qfns.from_float(test_data1, test_q1, largest); + qfns->from_float(test_data1, test_q1, largest); for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void) -> float { - qfns.to_float(test_q1, test_out, size); + qfns->to_float(test_q1, test_out, size); return test_out[0]; }; size_t quantized_size = ggml_row_size(type, size); @@ -328,8 +328,8 @@ int main(int argc, char * argv[]) { for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void) -> float { - auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type); - vdot.from_float(test_data1, test_q1, size); + const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type); + vdot->from_float(test_data1, test_q1, size); return test_q1[0]; }; size_t quantized_size = ggml_row_size(type, size); @@ -340,13 +340,13 @@ int main(int argc, char * argv[]) { if (params.op_vec_dot_q) { printf(" vec_dot_q\n"); - qfns.from_float(test_data1, test_q1, largest); - qfns.from_float(test_data2, test_q2, largest); + qfns->from_float(test_data1, test_q1, largest); + qfns->from_float(test_data2, test_q2, largest); for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void) -> float { float result; - qfns.vec_dot(size, &result, 0, test_q1, 0, 
test_q2, 0, 1); + qfns->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1); return result; }; size_t quantized_size = ggml_row_size(type, size);
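For reference (also not part of the diff): a minimal sketch of how a caller migrates from the removed by-value `ggml_internal_get_type_traits()` to the pointer-returning `ggml_get_type_traits()`. It mirrors the round-trip pattern used by the updated `test-quantize-fns.cpp`; the chosen type, element count, and test values are arbitrary assumptions.

```cpp
// Migration sketch (not part of this patch): ggml_get_type_traits() now returns
// a const pointer instead of a ggml_type_traits_t copy. The round trip below
// follows the pattern of the updated call sites in test-quantize-fns.cpp.
#include "ggml.h"

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const ggml_type type = GGML_TYPE_Q8_0;
    const auto * traits = ggml_get_type_traits(type); // was: ggml_internal_get_type_traits(type)

    if (!traits->from_float || !traits->to_float) {
        return 1; // this type cannot be round-tripped
    }

    const int64_t n = 256; // must be a multiple of the block size
    std::vector<float>   src(n, 0.5f), dst(n);
    std::vector<uint8_t> quantized(ggml_row_size(type, n));

    ggml_quantize_init(type); // as done before from_float in the tests
    traits->from_float(src.data(), quantized.data(), n);
    traits->to_float(quantized.data(), dst.data(), n);

    printf("%s round trip: src[0]=%f dst[0]=%f\n", ggml_type_name(type), src[0], dst[0]);
    return 0;
}
```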