Merge branch 'master' into gg/llama-kv-cache
commit a40ba49fa6
26 changed files with 1922 additions and 193 deletions
@@ -236,9 +236,13 @@ npm i

# to run the dev server
npm run dev

-# to build the public/index.html
+# to build the public/index.html.gz
npm run build
```

+After `public/index.html.gz` has been generated we need to generate the c++
+headers (like build/examples/server/index.html.gz.hpp) that will be included
+by server.cpp. This is done by building `llama-server` as described in the
+[build](#build) section above.
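To illustrate what that header-generation step amounts to, here is a minimal sketch (not the actual script used by the build; the file and symbol names are assumptions): the gzipped page is embedded as a byte array in a C++ header that `server.cpp` can `#include`.

```python
# Illustrative only: roughly the kind of transformation the build step performs.
# The real header is generated by the llama-server build; names here are assumptions.
from pathlib import Path

def gz_to_hpp(src: str, dst: str, symbol: str) -> None:
    data = Path(src).read_bytes()
    body = ", ".join(f"0x{b:02x}" for b in data)
    Path(dst).write_text(
        f"unsigned char {symbol}[] = {{{body}}};\n"
        f"unsigned int {symbol}_len = {len(data)};\n"
    )

# e.g. gz_to_hpp("public/index.html.gz", "index.html.gz.hpp", "index_html_gz")
```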

NOTE: if you are using the vite dev server, you can change the API base URL to llama.cpp. To do that, run this code snippet in browser's console:
@@ -456,7 +460,7 @@ These words will not be included in the completion, so make sure to add them to

- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.

- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has a nested array `top_logprobs`. It contains at **maximum** `n_probs` elements:

```json
```
{
  "content": "<the generated completion text>",
  "tokens": [ generated token ids if requested ],
@@ -557,7 +561,7 @@ If `with_pieces` is `true`:
```

With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k

```json
```
{
  "tokens": [
    {"id": 198, "piece": [195]}, // hex C3
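To show where a response like the one above comes from, a minimal sketch of calling `/tokenize` with `with_pieces` enabled (the server address and the `content` request field are assumptions):

```python
# Sketch only: request tokenization with per-token byte pieces, as in the
# example above. Server address and the "content" field name are assumptions.
import requests

res = requests.post(
    "http://localhost:8080/tokenize",              # assumed server address
    json={"content": "á", "with_pieces": True},
)
for tok in res.json()["tokens"]:
    # with with_pieces=True, pieces that are not valid UTF-8 come back as byte
    # arrays, e.g. {"id": 198, "piece": [195]} for the first byte (hex C3) of 'á'
    print(tok["id"], tok["piece"])
```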
@@ -572,6 +576,18 @@ With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k

`tokens`: Set the tokens to detokenize.

+### POST `/apply-template`: Apply chat template to a conversation
+
+Uses the server's prompt template formatting functionality to convert chat messages to a single string expected by a chat model as input, but does not perform inference. Instead, the prompt string is returned in the `prompt` field of the JSON response. The prompt can then be modified as desired (for example, to insert "Sure!" at the beginning of the model's response) before sending to `/completion` to generate the chat response.
+
+*Options:*
+
+`messages`: (Required) Chat turns in the same format as `/v1/chat/completions`.
+
+**Response format**
+
+Returns a JSON object with a field `prompt` containing a string of the input messages formatted according to the model's chat template format.
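A minimal usage sketch of the endpoint documented above, assuming a server at `http://localhost:8080`: format the conversation, optionally edit the returned prompt, then pass it to `/completion` as the section describes.

```python
# Sketch only: format a conversation with /apply-template, optionally edit the
# returned prompt, then send it to /completion. Server address is an assumption.
import requests

base = "http://localhost:8080"                      # assumed server address

messages = [
    {"role": "system", "content": "You are a test."},
    {"role": "user", "content": "Hi there"},
]

# 1) ask the server to apply its chat template (no inference happens here)
prompt = requests.post(f"{base}/apply-template", json={"messages": messages}).json()["prompt"]

# 2) optionally steer the response by appending text to the formatted prompt
prompt += "Sure!"

# 3) generate the chat response from the (possibly edited) prompt
out = requests.post(f"{base}/completion", json={"prompt": prompt, "n_predict": 64}).json()
print(out["content"])
```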

### POST `/embedding`: Generate embedding of a given text

> [!IMPORTANT]
@@ -764,7 +780,7 @@ Same as the `/v1/embeddings` endpoint.

**Response format**

```json
```
[
  {
    "index": 0,
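For completeness, a hedged sketch of a request that would produce a response in the format above (the server address, the `content` request field, and the `embedding` response field are assumptions not shown in this hunk):

```python
# Sketch only: request an embedding and read a response shaped like the one above.
# The server address, the "content" request field and the "embedding" response
# field are assumptions for illustration.
import requests

res = requests.post(
    "http://localhost:8080/embedding",             # assumed server address
    json={"content": "Some text to embed"},        # assumed request field
)
item = res.json()[0]                               # response is a list, per the format above
print(item["index"], len(item["embedding"]))
```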
@@ -14,7 +14,7 @@
// mime type for sending response
#define MIMETYPE_JSON "application/json; charset=utf-8"

-// auto generated files (update with ./deps.sh)
+// auto generated files (see README.md for details)
#include "index.html.gz.hpp"
#include "loading.html.hpp"
@@ -4124,6 +4124,14 @@ int main(int argc, char ** argv) {
        res_ok(res, root);
    };

+    const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
+        auto body = json::parse(req.body);
+        const auto & chat_template = body.contains("tools") && ctx_server.chat_templates.template_tool_use ? *ctx_server.chat_templates.template_tool_use : *ctx_server.chat_templates.template_default;
+        json data = oaicompat_completion_params_parse(body, chat_template, params.use_jinja);
+
+        res_ok(res, {{ "prompt", data.at("prompt") }});
+    };
+
    const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
        handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE);
    };
@@ -4300,6 +4308,7 @@ int main(int argc, char ** argv) {
    svr->Post("/v1/reranking", handle_rerank);
    svr->Post("/tokenize", handle_tokenize);
    svr->Post("/detokenize", handle_detokenize);
+    svr->Post("/apply-template", handle_apply_template);
    // LoRA adapters hotswap
    svr->Get ("/lora-adapters", handle_lora_adapters_list);
    svr->Post("/lora-adapters", handle_lora_adapters_apply);
@@ -4378,11 +4387,13 @@ int main(int argc, char ** argv) {
        ctx_server.chat_templates.template_default->source().c_str(),
        common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());

-    ctx_server.queue_tasks.on_new_task(std::bind(
-        &server_context::process_single_task, &ctx_server, std::placeholders::_1));
+    ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
+        ctx_server.process_single_task(task);
+    });

-    ctx_server.queue_tasks.on_update_slots(std::bind(
-        &server_context::update_slots, &ctx_server));
+    ctx_server.queue_tasks.on_update_slots([&ctx_server]() {
+        ctx_server.update_slots();
+    });

    shutdown_handler = [&](int) {
        ctx_server.queue_tasks.terminate();
@@ -121,6 +121,21 @@ def test_chat_template():
    assert res.body["__verbose"]["prompt"] == "<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"


+def test_apply_chat_template():
+    global server
+    server.chat_template = "command-r"
+    server.start()
+    res = server.make_request("POST", "/apply-template", data={
+        "messages": [
+            {"role": "system", "content": "You are a test."},
+            {"role": "user", "content":"Hi there"},
+        ]
+    })
+    assert res.status_code == 200
+    assert "prompt" in res.body
+    assert res.body["prompt"] == "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a test.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"


@pytest.mark.parametrize("response_format,n_predicted,re_content", [
    ({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
    ({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),