diff --git a/README.md b/README.md
index 748def30f..d40309875 100644
--- a/README.md
+++ b/README.md
@@ -423,7 +423,7 @@ To learn more about model quantization, [read this documentation](examples/quant
 
-[^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
+[^1]: [examples/perplexity/README.md](./examples/perplexity/README.md)
 [^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
 
 ## [`llama-bench`](examples/llama-bench)
diff --git a/examples/server/README.md b/examples/server/README.md
index d7b921116..ce1ae8858 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -460,7 +460,7 @@ These words will not be included in the completion, so make sure to add them to
 - Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
 
 - `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has a nested array `top_logprobs`. It contains at **maximum** `n_probs` elements:
-  ```json
+  ```
   {
     "content": "",
     "tokens": [ generated token ids if requested ],
@@ -561,7 +561,7 @@ If `with_pieces` is `true`:
 ```
 
 With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
-```json
+```
 {
   "tokens": [
     {"id": 198, "piece": [195]}, // hex C3
@@ -576,6 +576,18 @@ With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
 
 `tokens`: Set the tokens to detokenize.
 
+### POST `/apply-template`: Apply chat template to a conversation
+
+Applies the server's chat template to the supplied chat messages, producing the single prompt string the model expects as input, but does not perform inference. Instead, the formatted prompt is returned in the `prompt` field of the JSON response. The prompt can then be modified as desired (for example, to insert "Sure!" at the beginning of the model's response) before being sent to `/completion` to generate the chat response.
+
+*Options:*
+
+`messages`: (Required) Chat turns in the same format as `/v1/chat/completions`.
+
+**Response format**
+
+Returns a JSON object with a single field `prompt` containing the input messages formatted according to the model's chat template.
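+
+For example, assuming the server is listening on the default `http://localhost:8080`, the rendered prompt for a short conversation can be previewed with `curl`; the exact string returned in `prompt` depends on the loaded model's chat template:
+
+```shell
+curl --request POST \
+    --url http://localhost:8080/apply-template \
+    --header "Content-Type: application/json" \
+    --data '{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello!"}]}'
+```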
+
 ### POST `/embedding`: Generate embedding of a given text
 
 > [!IMPORTANT]
@@ -768,7 +780,7 @@ Same as the `/v1/embeddings` endpoint.
 
 **Response format**
 
-```json
+```
 [
   {
     "index": 0,
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index accc60124..ab548d541 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -4185,6 +4185,14 @@ int main(int argc, char ** argv) {
         res_ok(res, root);
     };
 
+    const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
+        auto body = json::parse(req.body);
+        const auto & chat_template = body.contains("tools") && ctx_server.chat_templates.template_tool_use ? *ctx_server.chat_templates.template_tool_use : *ctx_server.chat_templates.template_default;
+        json data = oaicompat_completion_params_parse(body, chat_template, params.use_jinja);
+
+        res_ok(res, {{ "prompt", data.at("prompt") }});
+    };
+
     const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
         handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE);
     };
@@ -4361,6 +4369,7 @@ int main(int argc, char ** argv) {
     svr->Post("/v1/reranking",      handle_rerank);
     svr->Post("/tokenize",          handle_tokenize);
     svr->Post("/detokenize",        handle_detokenize);
+    svr->Post("/apply-template",    handle_apply_template);
     // LoRA adapters hotswap
     svr->Get ("/lora-adapters",     handle_lora_adapters_list);
     svr->Post("/lora-adapters",     handle_lora_adapters_apply);
@@ -4439,11 +4448,13 @@ int main(int argc, char ** argv) {
         ctx_server.chat_templates.template_default->source().c_str(),
         common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());
 
-    ctx_server.queue_tasks.on_new_task(std::bind(
-        &server_context::process_single_task, &ctx_server, std::placeholders::_1));
+    ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
+        ctx_server.process_single_task(task);
+    });
 
-    ctx_server.queue_tasks.on_update_slots(std::bind(
-        &server_context::update_slots, &ctx_server));
+    ctx_server.queue_tasks.on_update_slots([&ctx_server]() {
+        ctx_server.update_slots();
+    });
 
     shutdown_handler = [&](int) {
         ctx_server.queue_tasks.terminate();
diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py
index fa6cbeb67..c65fd6c1e 100644
--- a/examples/server/tests/unit/test_chat_completion.py
+++ b/examples/server/tests/unit/test_chat_completion.py
@@ -121,6 +121,21 @@ def test_chat_template():
     assert res.body["__verbose"]["prompt"] == " <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
 
 
+def test_apply_chat_template():
+    global server
+    server.chat_template = "command-r"
+    server.start()
+    res = server.make_request("POST", "/apply-template", data={
+        "messages": [
+            {"role": "system", "content": "You are a test."},
+            {"role": "user", "content": "Hi there"},
+        ]
+    })
+    assert res.status_code == 200
+    assert "prompt" in res.body
+    assert res.body["prompt"] == "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a test.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
+
+
 @pytest.mark.parametrize("response_format,n_predicted,re_content", [
     ({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
     ({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 561f8bdb8..ad9ffe66a 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1692,7 +1692,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
             linefeed_id = ids[0];
         } else {
-            const std::vector<int> ids = tokenize("\xC4\x8A", false); // U+010A
+            const std::vector<int> ids = tokenize("\n", false);
             //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
             if (ids.empty()) {