From 92be9f12164f18ce845a5bab60cefa5f7fec6836 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 06:11:26 +0300 Subject: [PATCH 01/10] flake.lock: Update (#9870) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flake lock file updates: • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/bc947f541ae55e999ffdb4013441347d83b00feb?narHash=sha256-NOiTvBbRLIOe5F6RbHaAh6%2B%2BBNjsb149fGZd1T4%2BKBg%3D' (2024-10-04) → 'github:NixOS/nixpkgs/5633bcff0c6162b9e4b5f1264264611e950c8ec7?narHash=sha256-9UTxR8eukdg%2BXZeHgxW5hQA9fIKHsKCdOIUycTryeVw%3D' (2024-10-09) Co-authored-by: github-actions[bot] --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index 3fb6ced51..702527028 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1728018373, - "narHash": "sha256-NOiTvBbRLIOe5F6RbHaAh6++BNjsb149fGZd1T4+KBg=", + "lastModified": 1728492678, + "narHash": "sha256-9UTxR8eukdg+XZeHgxW5hQA9fIKHsKCdOIUycTryeVw=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "bc947f541ae55e999ffdb4013441347d83b00feb", + "rev": "5633bcff0c6162b9e4b5f1264264611e950c8ec7", "type": "github" }, "original": { From c7181bd294757dd80a7904e3dd0fea2d0be914e7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 18:52:48 +0300 Subject: [PATCH 02/10] server : reuse cached context chunks (#9866) ggml-ci --- common/arg.cpp | 7 ++++ common/common.h | 3 +- examples/server/README.md | 1 + examples/server/server.cpp | 69 ++++++++++++++++++++++++++++++++++++-- examples/server/utils.hpp | 4 +-- 5 files changed, 78 insertions(+), 6 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 205177d46..8969fc107 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1788,6 +1788,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.n_threads_http = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); + add_opt(common_arg( + {"--cache-reuse"}, "N", + string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse), + [](common_params & params, int value) { + params.n_cache_reuse = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE")); add_opt(common_arg( {"--metrics"}, string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), diff --git a/common/common.h b/common/common.h index 71e686156..5507b1c59 100644 --- a/common/common.h +++ b/common/common.h @@ -277,7 +277,8 @@ struct common_params { int32_t port = 8080; // server listens on this network port int32_t timeout_read = 600; // http read timeout in seconds int32_t timeout_write = timeout_read; // http write timeout in seconds - int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) + int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) + int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting std::string hostname = "127.0.0.1"; std::string public_path = ""; // NOLINT diff --git a/examples/server/README.md b/examples/server/README.md index b5feeb77b..cd0eaf847 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -147,6 +147,7 @@ The project is under active development, and we are [looking for feedback and co | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate
(env: LLAMA_ARG_SSL_CERT_FILE) | | `-to, --timeout N` | server read/write timeout in seconds (default: 600)
(env: LLAMA_ARG_TIMEOUT) | | `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) | +| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)
(env: LLAMA_ARG_CACHE_REUSE) | | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) | | `--slots` | enable slots monitoring endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_SLOTS) | | `--props` | enable changing global properties via POST /props (default: disabled)
(env: LLAMA_ARG_ENDPOINT_PROPS) | diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f809c46d5..015b3b2c5 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -800,7 +800,7 @@ struct server_context { int slot_prompt_len = slot_prompt.size(); // length of the Longest Common Prefix between the current slot's prompt and the input prompt - int lcp_len = common_part(slot_prompt, prompt); + int lcp_len = longest_common_prefix(slot_prompt, prompt); // fraction of the common substring length compared to the current slot's prompt length similarity = static_cast(lcp_len) / slot_prompt_len; @@ -2012,7 +2012,7 @@ struct server_context { } slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); - // if input prompt is too big, truncate it (if group attention self-extend is disabled) + // if input prompt is too big, truncate it if (slot.n_prompt_tokens >= slot.n_ctx) { const int n_left = slot.n_ctx - slot.params.n_keep; @@ -2042,12 +2042,74 @@ struct server_context { if (slot.params.cache_prompt) { // reuse any previously computed tokens that are common with the new prompt - slot.n_past = common_part(slot.cache_tokens, prompt_tokens); + slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens); // push the prompt into the sampling context (do not apply grammar) for (int i = 0; i < slot.n_past; ++i) { common_sampler_accept(slot.smpl, slot.cache_tokens[i], false); } + + // reuse chunks from the cached prompt by shifting their KV cache in the new position + if (params.n_cache_reuse > 0) { + size_t head_c = slot.n_past; // cache + size_t head_p = slot.n_past; // current prompt + + SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past); + + while (head_c < slot.cache_tokens.size() && + head_p < prompt_tokens.size()) { + if (llama_token_is_control(model, slot.cache_tokens[head_c])) { + break; + } + + if (llama_token_is_control(model, prompt_tokens[head_p])) { + break; + } + + size_t n_match = 0; + + while (head_c + n_match < slot.cache_tokens.size() && + head_p + n_match < prompt_tokens.size() && + slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) { + if (llama_token_is_control(model, slot.cache_tokens[head_c + n_match])) { + break; + } + + if (llama_token_is_control(model, prompt_tokens[head_p + n_match])) { + break; + } + + n_match++; + } + + if (n_match >= (size_t) params.n_cache_reuse) { + SLT_DBG(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); + //for (size_t i = head_p; i < head_p + n_match; i++) { + // SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); + //} + + const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; + + llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c); + llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift); + + for (size_t i = 0; i < n_match; i++) { + slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; + + common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false); + + slot.n_past++; + } + + head_c += n_match; + head_p += n_match; + } else { + head_c += 1; + } + } + + SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past); + } } } @@ -3257,6 +3319,7 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.on_new_task(std::bind( &server_context::process_single_task, &ctx_server, std::placeholders::_1)); + ctx_server.queue_tasks.on_update_slots(std::bind( &server_context::update_slots, &ctx_server)); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index ad99e9574..37999604d 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -195,14 +195,14 @@ static std::string gen_chatcmplid() { // other common utils // -static size_t common_part(const std::vector & a, const std::vector & b) { +static size_t longest_common_prefix(const std::vector & a, const std::vector & b) { size_t i; for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} return i; } -static size_t common_part(const std::string & a, const std::string & b) { +static size_t longest_common_prefix(const std::string & a, const std::string & b) { size_t i; for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} From d4c19c0f5cdb1e512573e8c86c79e8d0238c73c4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 21:31:35 +0300 Subject: [PATCH 03/10] server : accept extra_context for the infill endpoint (#9874) * server : accept extra_context for the infill endpoint ggml-ci * server : update readme [no ci] * server : use repo-level FIM pattern if possible ggml-ci --- examples/server/README.md | 21 ++++++++ examples/server/server.cpp | 102 ++++++++++++++++++++++++++++++++++--- src/llama.cpp | 56 +++++++++++++------- 3 files changed, 153 insertions(+), 26 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index cd0eaf847..eb0a7b32e 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -524,9 +524,30 @@ Takes a prefix and a suffix and returns the predicted completion as stream. - `input_prefix`: Set the prefix of the code to infill. - `input_suffix`: Set the suffix of the code to infill. +- `prompt`: Added after the `FIM_MID` token +- `extra_context`: Additional context inserted before the FIM prefix. See https://github.com/ggerganov/llama.cpp/pull/9874 It also accepts all the options of `/completion`. +If the model has `FIM_REPO` and `FIM_FILE_SEP` tokens, the [repo-level pattern](https://arxiv.org/pdf/2409.12186) is used: + +```txt +myproject +{chunk 0 filename} +{chunk 0 text} +{chunk 1 filename} +{chunk 1 text} +... +filename +[input_prefix][input_suffix][prompt] +``` + +If the tokens are missing, then the extra context is simply prefixed at the start: + +```txt +[extra_context][input_prefix][input_suffix][prompt] +``` + ### **GET** `/props`: Get server global properties. This endpoint is public (no API key check). By default, it is read-only. To make POST request to change global properties, you need to start server with `--props` diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 015b3b2c5..18bcad3f0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -139,6 +139,7 @@ struct slot_params { json input_prefix; json input_suffix; + json extra_context; }; struct server_slot { @@ -170,6 +171,7 @@ struct server_slot { // when a task is submitted, we first tokenize the prompt and store it here std::vector prompt_tokens; + std::vector extra_tokens; std::string generated_text; std::vector cache_tokens; @@ -906,8 +908,26 @@ struct server_context { } // infill - slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix); - slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix); + slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix); + slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix); + slot.params.extra_context = json_value(data, "extra_context", default_params.extra_context); + + SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.params.extra_context.size()); + for (const auto & chunk : slot.params.extra_context) { + // { "text": string, "filename": string } + if (!chunk.contains("text") || !chunk["text"].is_string()) { + send_error(task, "extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST); + return false; + } + + // filename is optional + if (chunk.contains("filename") && !chunk["filename"].is_string()) { + send_error(task, "extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST); + return false; + } + + SLT_DBG(slot, "extra_context chunk in file '%s':\n%s\n", chunk.value("filename", "").c_str(), chunk.value("text", "").c_str()); + } // get prompt if (task.cmpl_type != SERVER_TASK_CMPL_TYPE_INFILL) { @@ -1934,13 +1954,66 @@ struct server_context { } break; case SERVER_TASK_CMPL_TYPE_INFILL: { + // use FIM repo-level pattern: + // ref: https://arxiv.org/pdf/2409.12186 + // + // [FIM_REP]myproject + // [FIM_SEP]filename0 + // extra chunk 0 + // [FIM_SEP]filename1 + // extra chunk 1 + // ... + // [FIM_SEP]filename + // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID] + // auto prefix_tokens = tokenize(slot.params.input_prefix, false, false); auto suffix_tokens = tokenize(slot.params.input_suffix, false, false); - // for now pick context to fit in a single batch (ratio prefix:suffix = 3:1, TODO: configurable?) - const int n_suffix_take = std::min(suffix_tokens.size(), n_batch/4); + slot.extra_tokens.clear(); + if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) { + static const auto k_fim_repo = tokenize("myproject\n", false, false); + + slot.extra_tokens.push_back(llama_token_fim_rep(model)); + slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end()); + } + + for (const auto & chunk : slot.params.extra_context) { + // { "text": string, "filename": string } + const std::string text = chunk.value("text", ""); + const std::string filename = chunk.value("filename", "tmp"); + + if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) { + const auto k_fim_file = tokenize(filename + "\n", false, false); + + slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model)); + slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); + } else { + // chunk separator in binary form to avoid confusing the AI + static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00}; + static const auto k_chunk_prefix_tokens = tokenize(k_chunk_prefix_str, false, false); + + slot.extra_tokens.insert(slot.extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end()); + } + + const auto chunk_tokens = tokenize(text, false, false); + slot.extra_tokens.insert(slot.extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end()); + } + + if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) { + // TODO: current filename + static const auto k_fim_file = tokenize("filename\n", false, false); + + slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model)); + slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); + } + + // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?) + const int n_suffix_take = std::min(suffix_tokens.size(), (n_batch)/4); const int n_prefix_take = std::min(prefix_tokens.size(), (n_batch - 3) - n_suffix_take); + // fill the rest of the context with extra chunks + const int n_extra_take = std::min(std::max(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size()); + prefix_tokens.erase(prefix_tokens.begin(), prefix_tokens.begin() + prefix_tokens.size() - n_prefix_take); suffix_tokens.resize(n_suffix_take); @@ -1954,6 +2027,11 @@ struct server_context { embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); } + SLT_DBG(slot, "extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", slot.n_ctx, n_extra_take, (int) slot.extra_tokens.size()); + + // put the extra context before the FIM prefix + embd_inp.insert(embd_inp.begin(), slot.extra_tokens.end() - n_extra_take, slot.extra_tokens.end()); + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); embd_inp.push_back(llama_token_fim_mid(model)); @@ -2058,11 +2136,15 @@ struct server_context { while (head_c < slot.cache_tokens.size() && head_p < prompt_tokens.size()) { - if (llama_token_is_control(model, slot.cache_tokens[head_c])) { + if (llama_token_is_control(model, slot.cache_tokens[head_c]) && + slot.cache_tokens[head_c] != llama_token_fim_rep(model) && + slot.cache_tokens[head_c] != llama_token_fim_sep(model)) { break; } - if (llama_token_is_control(model, prompt_tokens[head_p])) { + if (llama_token_is_control(model, prompt_tokens[head_p]) && + prompt_tokens[head_p] != llama_token_fim_rep(model) && + prompt_tokens[head_p] != llama_token_fim_sep(model)) { break; } @@ -2071,11 +2153,15 @@ struct server_context { while (head_c + n_match < slot.cache_tokens.size() && head_p + n_match < prompt_tokens.size() && slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) { - if (llama_token_is_control(model, slot.cache_tokens[head_c + n_match])) { + if (llama_token_is_control(model, slot.cache_tokens[head_c + n_match]) && + slot.cache_tokens[head_c + n_match] != llama_token_fim_rep(model) && + slot.cache_tokens[head_c + n_match] != llama_token_fim_sep(model)) { break; } - if (llama_token_is_control(model, prompt_tokens[head_p + n_match])) { + if (llama_token_is_control(model, prompt_tokens[head_p + n_match]) && + prompt_tokens[head_p + n_match] != llama_token_fim_rep(model) && + prompt_tokens[head_p + n_match] != llama_token_fim_sep(model)) { break; } diff --git a/src/llama.cpp b/src/llama.cpp index f68024f5b..511f91802 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6596,8 +6596,8 @@ static void llm_load_vocab( ) { vocab.special_eot_id = t.second; if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { - LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", - __func__, t.first.c_str()); + LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", + __func__, t.second, t.first.c_str()); vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; } } @@ -6610,8 +6610,8 @@ static void llm_load_vocab( ) { vocab.special_eom_id = t.second; if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { - LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", - __func__, t.first.c_str()); + LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", + __func__, t.second, t.first.c_str()); vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; } } @@ -6627,8 +6627,8 @@ static void llm_load_vocab( ) { vocab.special_fim_pre_id = t.second; if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { - LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", - __func__, t.first.c_str()); + LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", + __func__, t.second, t.first.c_str()); vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; } } @@ -6644,8 +6644,8 @@ static void llm_load_vocab( ) { vocab.special_fim_suf_id = t.second; if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { - LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", - __func__, t.first.c_str()); + LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", + __func__, t.second, t.first.c_str()); vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; } } @@ -6661,8 +6661,8 @@ static void llm_load_vocab( ) { vocab.special_fim_mid_id = t.second; if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { - LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", - __func__, t.first.c_str()); + LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", + __func__, t.second, t.first.c_str()); vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; } } @@ -6677,8 +6677,8 @@ static void llm_load_vocab( ) { vocab.special_fim_pad_id = t.second; if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { - LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", - __func__, t.first.c_str()); + LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", + __func__, t.second, t.first.c_str()); vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; } } @@ -6694,8 +6694,8 @@ static void llm_load_vocab( ) { vocab.special_fim_rep_id = t.second; if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { - LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", - __func__, t.first.c_str()); + LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", + __func__, t.second, t.first.c_str()); vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; } } @@ -6708,8 +6708,8 @@ static void llm_load_vocab( ) { vocab.special_fim_sep_id = t.second; if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { - LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", - __func__, t.first.c_str()); + LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", + __func__, t.second, t.first.c_str()); vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; } } @@ -6720,6 +6720,19 @@ static void llm_load_vocab( // this is currently determined based on the token text, which is obviously not ideal // ref: https://github.com/ggerganov/llama.cpp/issues/9606 vocab.special_eog_ids.clear(); + + if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) { + vocab.special_eog_ids.insert(vocab.special_fim_pad_id); + } + + if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) { + vocab.special_eog_ids.insert(vocab.special_fim_rep_id); + } + + if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) { + vocab.special_eog_ids.insert(vocab.special_fim_sep_id); + } + for (const auto & t : vocab.token_to_id) { if (false || t.first == "<|eot_id|>" @@ -6732,13 +6745,20 @@ static void llm_load_vocab( ) { vocab.special_eog_ids.insert(t.second); if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { - LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", - __func__, t.first.c_str()); + LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", + __func__, t.second, t.first.c_str()); vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; } + } else { + // token is control, but not marked as EOG -> print a warning + if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) { + LLAMA_LOG_WARN("%s: control token: %6d '%s' is not marked as EOG\n", + __func__, t.second, t.first.c_str()); + } } } + // sanity checks if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) { vocab.special_eog_ids.insert(vocab.special_eos_id); LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__); From 13dca2a54a394757d56fdd652b9f0df08f44ea22 Mon Sep 17 00:00:00 2001 From: agray3 Date: Mon, 14 Oct 2024 01:49:08 +0100 Subject: [PATCH 04/10] Vectorize load instructions in dmmv f16 CUDA kernel (#9816) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Vectorize load instructions in dmmv f16 CUDA kernel Replaces scalar with vector load instructions, which substantially improves performance on NVIDIA HBM GPUs, e.g. gives a 1.27X overall speedup for Meta-Llama-3-8B-Instruct-F16 BS1 inference evaluation on H100 SXM 80GB HBM3. On GDDR GPUs, there is a slight (1.01X) speedup. * addressed comment * Update ggml/src/ggml-cuda/dmmv.cu Co-authored-by: Johannes Gäßler --------- Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/dmmv.cu | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu index 96a5adef5..00e21b5d7 100644 --- a/ggml/src/ggml-cuda/dmmv.cu +++ b/ggml/src/ggml-cuda/dmmv.cu @@ -416,10 +416,11 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ const half * x = (const half *) vx; - + // load 2 halfs into register in a single instruction + const half2 x_reg = *((half2 *) &(x[ib + iqs])); // automatic half -> float type cast if dfloat == float - v.x = x[ib + iqs + 0]; - v.y = x[ib + iqs + 1]; + v.x = __low2float(x_reg); + v.y = __high2float(x_reg); } static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) { @@ -476,13 +477,28 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons // matrix multiplication // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 #ifdef GGML_CUDA_F16 - tmp += __hmul2(v, { - y[iybs + iqs + j/qr + 0], - y[iybs + iqs + j/qr + y_offset] - }); + if ( y_offset == 1 ) { + // load 2 dfloats into register in a single instruction + const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr])); + tmp += __hmul2(v, y_reg); + } + else { + tmp += __hmul2(v, { + y[iybs + iqs + j/qr + 0], + y[iybs + iqs + j/qr + y_offset] + }); + } #else - tmp += v.x * y[iybs + iqs + j/qr + 0]; - tmp += v.y * y[iybs + iqs + j/qr + y_offset]; + if ( y_offset == 1 ) { + // load 2 dfloats into register in a single instruction + const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr])); + tmp += v.x * y_reg.x; + tmp += v.y * y_reg.y; + } + else { + tmp += v.x * y[iybs + iqs + j/qr + 0]; + tmp += v.y * y[iybs + iqs + j/qr + y_offset]; + } #endif // GGML_CUDA_F16 } } From a89f75e1b7b90cb2d4d4c52ca53ef9e9b466aa45 Mon Sep 17 00:00:00 2001 From: VoidIsVoid <343750470@qq.com> Date: Mon, 14 Oct 2024 15:04:36 +0800 Subject: [PATCH 05/10] server : handle "logprobs" field with false value (#9871) Co-authored-by: Gimling --- examples/server/utils.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 37999604d..69519ef95 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -360,9 +360,9 @@ static json oaicompat_completion_params_parse( // Handle "logprobs" field // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future - if (body.contains("logprobs")) { + if (json_value(body, "logprobs", false)) { llama_params["n_probs"] = json_value(body, "top_logprobs", 20); - } else if (body.contains("top_logprobs")) { + } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) { throw std::runtime_error("top_logprobs requires logprobs to be set to true"); } From 4c42f93b22146c83b763d8cbee5fafc512746649 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Tuszy=C5=84ski?= Date: Tue, 15 Oct 2024 10:20:34 +0200 Subject: [PATCH 06/10] readme : update bindings list (#9889) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index dd4927b04..08fe8cc92 100644 --- a/README.md +++ b/README.md @@ -130,6 +130,7 @@ Typically finetunes of the base models below are supported as well. - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart) - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326) - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp) +- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift) **UI:** From dcdd535302fc9702a4709be25f56540d65163a44 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Oct 2024 12:48:44 +0300 Subject: [PATCH 07/10] server : update preact (#9895) --- examples/server/public/index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/public/index.js b/examples/server/public/index.js index fe615ca25..32ec6e9e1 100644 --- a/examples/server/public/index.js +++ b/examples/server/public/index.js @@ -1 +1 @@ -const t=Symbol.for("preact-signals");function n(){if(r>1){r--;return}let t,n=!1;while(void 0!==i){let _=i;i=void 0;u++;while(void 0!==_){const i=_.o;_.o=void 0;_.f&=-3;if(!(8&_.f)&&h(_))try{_.c()}catch(e){if(!n){t=e;n=!0}}_=i}}u=0;r--;if(n)throw t}function e(t){if(r>0)return t();r++;try{return t()}finally{n()}}let _,i;function o(t){const n=_;_=void 0;try{return t()}finally{_=n}}let r=0,u=0,l=0;function f(t){if(void 0===_)return;let n=t.n;if(void 0===n||n.t!==_){n={i:0,S:t,p:_.s,n:void 0,t:_,e:void 0,x:void 0,r:n};if(void 0!==_.s)_.s.n=n;_.s=n;t.n=n;if(32&_.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=_.s;n.n=void 0;_.s.n=n;_.s=n}return n}}function s(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}s.prototype.brand=t;s.prototype.h=function(){return!0};s.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};s.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};s.prototype.subscribe=function(t){return k(()=>{const n=this.value,e=_;_=void 0;try{t(n)}finally{_=e}})};s.prototype.valueOf=function(){return this.value};s.prototype.toString=function(){return this.value+""};s.prototype.toJSON=function(){return this.value};s.prototype.peek=function(){const t=_;_=void 0;try{return this.value}finally{_=t}};Object.defineProperty(s.prototype,"value",{get(){const t=f(this);if(void 0!==t)t.i=this.i;return this.v},set(t){if(t!==this.v){if(u>100)throw new Error("Cycle detected");this.v=t;this.i++;l++;r++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function c(t){return new s(t)}function h(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function a(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function p(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function d(t){s.call(this,void 0);this.x=t;this.s=void 0;this.g=l-1;this.f=4}(d.prototype=new s).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===l)return!0;this.g=l;this.f|=1;if(this.i>0&&!h(this)){this.f&=-2;return!0}const t=_;try{a(this);_=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}_=t;p(this);this.f&=-2;return!0};d.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}s.prototype.S.call(this,t)};d.prototype.U=function(t){if(void 0!==this.t){s.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};d.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};Object.defineProperty(d.prototype,"value",{get(){if(1&this.f)throw new Error("Cycle detected");const t=f(this);this.h();if(void 0!==t)t.i=this.i;if(16&this.f)throw this.v;return this.v}});function v(t){return new d(t)}function y(t){const e=t.u;t.u=void 0;if("function"==typeof e){r++;const i=_;_=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;m(t);throw n}finally{_=i;n()}}}function m(t){for(let n=t.s;void 0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;y(t)}function g(t){if(_!==this)throw new Error("Out-of-order effect");p(this);_=t;this.f&=-2;if(8&this.f)m(this);n()}function b(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}b.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};b.prototype.S=function(){if(1&this.f)throw new Error("Cycle detected");this.f|=1;this.f&=-9;y(this);a(this);r++;const t=_;_=this;return g.bind(this,t)};b.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=i;i=this}};b.prototype.d=function(){this.f|=8;if(!(1&this.f))m(this)};function k(t){const n=new b(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var w,S,x,C,U,E,H,P,N,$,T,D,M={},F=[],A=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,W=Array.isArray;function L(t,n){for(var e in n)t[e]=n[e];return t}function O(t){var n=t.parentNode;n&&n.removeChild(t)}function R(t,n,e){var _,i,o,r={};for(o in n)"key"==o?_=n[o]:"ref"==o?i=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?w.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return I(t,r,_,i,null)}function I(t,n,e,_,i){var o={type:t,props:n,key:e,ref:_,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,constructor:void 0,__v:null==i?++x:i,__i:-1,__u:0};return null==i&&null!=S.vnode&&S.vnode(o),o}function V(){return{current:null}}function j(t){return t.children}function q(t,n){this.props=t,this.context=n}function B(t,n){if(null==n)return t.__?B(t.__,t.__i+1):null;for(var e;nn&&U.sort(P));J.__r=0}function K(t,n,e,_,i,o,r,u,l,f,s){var c,h,a,p,d,v=_&&_.__k||F,y=n.length;for(e.__d=l,Q(e,n,v),l=e.__d,c=0;c0?I(i.type,i.props,i.key,i.ref?i.ref:null,i.__v):i)?(i.__=t,i.__b=t.__b+1,u=Z(i,e,r,s),i.__i=u,o=null,-1!==u&&(s--,(o=e[u])&&(o.__u|=131072)),null==o||null===o.__v?(-1==u&&c--,"function"!=typeof i.type&&(i.__u|=65536)):u!==r&&(u==r-1?c--:u==r+1?c++:u>r?s>l-r?c+=u-r:c--:u(null!=l&&0==(131072&l.__u)?1:0))for(;r>=0||u=0){if((l=n[r])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return r;r--}if(u2&&(u.children=arguments.length>3?w.call(arguments,2):e),I(t.type,u,_||t.key,i||t.ref,null)}function ht(t,n){var e={__c:n="__cC"+D++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,_;return this.getChildContext||(e=[],(_={})[n]=this,this.getChildContext=function(){return _},this.componentWillUnmount=function(){e=null},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.some((function(t){t.__e=!0,G(t)}))},this.sub=function(t){e.push(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e&&e.splice(e.indexOf(t),1),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}w=F.slice,S={__e:function(t,n,e,_){for(var i,o,r;n=n.__;)if((i=n.__c)&&!i.__)try{if((o=i.constructor)&&null!=o.getDerivedStateFromError&&(i.setState(o.getDerivedStateFromError(t)),r=i.__d),null!=i.componentDidCatch&&(i.componentDidCatch(t,_||{}),r=i.__d),r)return i.__E=i}catch(n){t=n}throw t}},x=0,C=function(t){return null!=t&&null==t.constructor},q.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=L({},this.state),"function"==typeof t&&(t=t(L({},e),this.props)),t&&L(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),G(this))},q.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),G(this))},q.prototype.render=j,U=[],H="function"==typeof Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,P=function(t,n){return t.__v.__b-n.__v.__b},J.__r=0,N=0,$=et(!1),T=et(!0),D=0;var at,pt,dt,vt,yt=0,mt=[],gt=S,bt=gt.__b,kt=gt.__r,wt=gt.diffed,St=gt.__c,xt=gt.unmount,Ct=gt.__;function Ut(t,n){gt.__h&>.__h(pt,t,yt||n),yt=0;var e=pt.__H||(pt.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({}),e.__[t]}function Et(t){return yt=1,Ht(Bt,t)}function Ht(t,n,e){var _=Ut(at++,2);if(_.t=t,!_.__c&&(_.__=[e?e(n):Bt(void 0,n),function(t){var n=_.__N?_.__N[0]:_.__[0],e=_.t(n,t);n!==e&&(_.__N=[e,_.__[1]],_.__c.setState({}))}],_.__c=pt,!pt.u)){var i=function(t,n,e){if(!_.__c.__H)return!0;var i=_.__c.__H.__.filter((function(t){return!!t.__c}));if(i.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return i.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&_.__c.props===t)&&(!o||o.call(this,t,n,e))};pt.u=!0;var o=pt.shouldComponentUpdate,r=pt.componentWillUpdate;pt.componentWillUpdate=function(t,n,e){if(this.__e){var _=o;o=void 0,i(t,n,e),o=_}r&&r.call(this,t,n,e)},pt.shouldComponentUpdate=i}return _.__N||_.__}function Pt(t,n){var e=Ut(at++,3);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__H.__h.push(e))}function Nt(t,n){var e=Ut(at++,4);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__h.push(e))}function $t(t){return yt=5,Dt((function(){return{current:t}}),[])}function Tt(t,n,e){yt=6,Nt((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Dt(t,n){var e=Ut(at++,7);return qt(e.__H,n)&&(e.__=t(),e.__H=n,e.__h=t),e.__}function Mt(t,n){return yt=8,Dt((function(){return t}),n)}function Ft(t){var n=pt.context[t.__c],e=Ut(at++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(pt)),n.props.value):t.__}function At(t,n){gt.useDebugValue&>.useDebugValue(n?n(t):t)}function Wt(t){var n=Ut(at++,10),e=Et();return n.__=t,pt.componentDidCatch||(pt.componentDidCatch=function(t,_){n.__&&n.__(t,_),e[1](t)}),[e[0],function(){e[1](void 0)}]}function Lt(){var t=Ut(at++,11);if(!t.__){for(var n=pt.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Ot(){for(var t;t=mt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(Vt),t.__H.__h.forEach(jt),t.__H.__h=[]}catch(n){t.__H.__h=[],gt.__e(n,t.__v)}}gt.__b=function(t){pt=null,bt&&bt(t)},gt.__=function(t,n){t&&n.__k&&n.__k.__m&&(t.__m=n.__k.__m),Ct&&Ct(t,n)},gt.__r=function(t){kt&&kt(t),at=0;var n=(pt=t.__c).__H;n&&(dt===pt?(n.__h=[],pt.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.i=t.__N=void 0}))):(n.__h.forEach(Vt),n.__h.forEach(jt),n.__h=[],at=0)),dt=pt},gt.diffed=function(t){wt&&wt(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==mt.push(n)&&vt===gt.requestAnimationFrame||((vt=gt.requestAnimationFrame)||It)(Ot)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.i=void 0}))),dt=pt=null},gt.__c=function(t,n){n.some((function(t){try{t.__h.forEach(Vt),t.__h=t.__h.filter((function(t){return!t.__||jt(t)}))}catch(r){n.some((function(t){t.__h&&(t.__h=[])})),n=[],gt.__e(r,t.__v)}})),St&&St(t,n)},gt.unmount=function(t){xt&&xt(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{Vt(t)}catch(t){n=t}})),e.__H=void 0,n&>.__e(n,e.__v))};var Rt="function"==typeof requestAnimationFrame;function It(t){var n,e=function(){clearTimeout(_),Rt&&cancelAnimationFrame(n),setTimeout(t)},_=setTimeout(e,100);Rt&&(n=requestAnimationFrame(e))}function Vt(t){var n=pt,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),pt=n}function jt(t){var n=pt;t.__c=t.__(),pt=n}function qt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function Bt(t,n){return"function"==typeof n?n(t):n}function zt(t,n){S[t]=n.bind(null,S[t]||(()=>{}))}let Gt,Jt;function Kt(t){if(Jt)Jt();Jt=t&&t.S()}function Qt({data:t}){const n=Yt(t);n.value=t;const e=Dt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{var t;if(!C(e.peek())&&3===(null==(t=this.base)?void 0:t.nodeType))this.base.data=e.peek();else{this.__$f|=1;this.setState({})}};return v(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}Qt.displayName="_st";Object.defineProperties(s.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:Qt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});zt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let _ in e){if("children"===_)continue;let i=e[_];if(i instanceof s){if(!t)n.__np=t={};t[_]=i;e[_]=i.peek()}}}t(n)});zt("__r",(t,n)=>{Kt();let e,_=n.__c;if(_){_.__$f&=-2;e=_.__$u;if(void 0===e)_.__$u=e=function(t){let n;k((function(){n=this}));n.c=()=>{_.__$f|=1;_.setState({})};return n}()}Gt=_;Kt(e);t(n)});zt("__e",(t,n,e,_)=>{Kt();Gt=void 0;t(n,e,_)});zt("diffed",(t,n)=>{Kt();Gt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,_=n.props;if(t){let n=e.U;if(n)for(let e in n){let _=n[e];if(void 0!==_&&!(e in t)){_.d();n[e]=void 0}}else{n={};e.U=n}for(let i in t){let o=n[i],r=t[i];if(void 0===o){o=Xt(e,i,r,_);n[i]=o}else o.o(r,_)}}}t(n)});function Xt(t,n,e,_){const i=n in t&&void 0===t.ownerSVGElement,o=c(e);return{o:(t,n)=>{o.value=t;_=n},d:k(()=>{const e=o.value.value;if(_[n]!==e){_[n]=e;if(i)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}zt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});zt("__h",(t,n,e,_)=>{if(_<3||9===_)n.__$f|=2;t(n,e,_)});q.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let _ in n)return!0;for(let _ in t)if("__source"!==_&&t[_]!==this.props[_])return!0;for(let _ in this.props)if(!(_ in t))return!0;return!1};function Yt(t){return Dt(()=>c(t),[])}function Zt(t){const n=$t(t);n.current=t;Gt.__$f|=4;return Dt(()=>v(()=>n.current()),[])}function tn(t){const n=$t(t);n.current=t;Pt(()=>k(()=>n.current()),[])}var nn=function(t,n,e,_){var i;n[0]=0;for(var o=1;o=5&&((i||!t&&5===_)&&(r.push(_,0,i,e),_=6),t&&(r.push(_,t,0,e),_=6)),i=""},l=0;l"===n?(_=1,i=""):i=n+i[0]:o?n===o?o="":i+=n:'"'===n||"'"===n?o=n:">"===n?(u(),_=1):_&&("="===n?(_=5,e=i,i=""):"/"===n&&(_<5||">"===t[l][f+1])?(u(),3===_&&(r=r[0]),_=r,(r=r[0]).push(2,0,_),_=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),_=2):i+=n),3===_&&"!--"===i&&(_=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var on=_n.bind(R);export{q as Component,j as Fragment,s as Signal,e as batch,ct as cloneElement,v as computed,ht as createContext,R as createElement,V as createRef,k as effect,R as h,on as html,st as hydrate,C as isValidElement,S as options,ft as render,c as signal,Y as toChildArray,o as untracked,Mt as useCallback,Zt as useComputed,Ft as useContext,At as useDebugValue,Pt as useEffect,Wt as useErrorBoundary,Lt as useId,Tt as useImperativeHandle,Nt as useLayoutEffect,Dt as useMemo,Ht as useReducer,$t as useRef,Yt as useSignal,tn as useSignalEffect,Et as useState}; +const t=Symbol.for("preact-signals");function n(){if(r>1){r--;return}let t,n=!1;while(void 0!==i){let _=i;i=void 0;u++;while(void 0!==_){const i=_.o;_.o=void 0;_.f&=-3;if(!(8&_.f)&&h(_))try{_.c()}catch(e){if(!n){t=e;n=!0}}_=i}}u=0;r--;if(n)throw t}function e(t){if(r>0)return t();r++;try{return t()}finally{n()}}let _,i;function o(t){const n=_;_=void 0;try{return t()}finally{_=n}}let r=0,u=0,l=0;function s(t){if(void 0===_)return;let n=t.n;if(void 0===n||n.t!==_){n={i:0,S:t,p:_.s,n:void 0,t:_,e:void 0,x:void 0,r:n};if(void 0!==_.s)_.s.n=n;_.s=n;t.n=n;if(32&_.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=_.s;n.n=void 0;_.s.n=n;_.s=n}return n}}function f(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}f.prototype.brand=t;f.prototype.h=function(){return!0};f.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};f.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};f.prototype.subscribe=function(t){return k(()=>{const n=this.value,e=_;_=void 0;try{t(n)}finally{_=e}})};f.prototype.valueOf=function(){return this.value};f.prototype.toString=function(){return this.value+""};f.prototype.toJSON=function(){return this.value};f.prototype.peek=function(){const t=_;_=void 0;try{return this.value}finally{_=t}};Object.defineProperty(f.prototype,"value",{get(){const t=s(this);if(void 0!==t)t.i=this.i;return this.v},set(t){if(t!==this.v){if(u>100)throw new Error("Cycle detected");this.v=t;this.i++;l++;r++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function c(t){return new f(t)}function h(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function a(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function p(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function d(t){f.call(this,void 0);this.x=t;this.s=void 0;this.g=l-1;this.f=4}(d.prototype=new f).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===l)return!0;this.g=l;this.f|=1;if(this.i>0&&!h(this)){this.f&=-2;return!0}const t=_;try{a(this);_=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}_=t;p(this);this.f&=-2;return!0};d.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}f.prototype.S.call(this,t)};d.prototype.U=function(t){if(void 0!==this.t){f.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};d.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};Object.defineProperty(d.prototype,"value",{get(){if(1&this.f)throw new Error("Cycle detected");const t=s(this);this.h();if(void 0!==t)t.i=this.i;if(16&this.f)throw this.v;return this.v}});function v(t){return new d(t)}function y(t){const e=t.u;t.u=void 0;if("function"==typeof e){r++;const i=_;_=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;m(t);throw n}finally{_=i;n()}}}function m(t){for(let n=t.s;void 0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;y(t)}function g(t){if(_!==this)throw new Error("Out-of-order effect");p(this);_=t;this.f&=-2;if(8&this.f)m(this);n()}function b(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}b.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};b.prototype.S=function(){if(1&this.f)throw new Error("Cycle detected");this.f|=1;this.f&=-9;y(this);a(this);r++;const t=_;_=this;return g.bind(this,t)};b.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=i;i=this}};b.prototype.d=function(){this.f|=8;if(!(1&this.f))m(this)};function k(t){const n=new b(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var w,S,x,C,U,E,H,P,N,$,T,D,M={},A=[],F=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,W=Array.isArray;function L(t,n){for(var e in n)t[e]=n[e];return t}function O(t){t&&t.parentNode&&t.parentNode.removeChild(t)}function R(t,n,e){var _,i,o,r={};for(o in n)"key"==o?_=n[o]:"ref"==o?i=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?w.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return I(t,r,_,i,null)}function I(t,n,e,_,i){var o={type:t,props:n,key:e,ref:_,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,constructor:void 0,__v:null==i?++x:i,__i:-1,__u:0};return null==i&&null!=S.vnode&&S.vnode(o),o}function V(){return{current:null}}function j(t){return t.children}function q(t,n){this.props=t,this.context=n}function B(t,n){if(null==n)return t.__?B(t.__,t.__i+1):null;for(var e;nn&&U.sort(P));J.__r=0}function K(t,n,e,_,i,o,r,u,l,s,f){var c,h,a,p,d,v=_&&_.__k||A,y=n.length;for(e.__d=l,Q(e,n,v),l=e.__d,c=0;c0?I(i.type,i.props,i.key,i.ref?i.ref:null,i.__v):i).__=t,i.__b=t.__b+1,o=null,-1!==(u=i.__i=Z(i,e,r,f))&&(f--,(o=e[u])&&(o.__u|=131072)),null==o||null===o.__v?(-1==u&&c--,"function"!=typeof i.type&&(i.__u|=65536)):u!==r&&(u==r-1?c--:u==r+1?c++:(u>r?c--:c++,i.__u|=65536))):i=t.__k[_]=null;if(f)for(_=0;_(null!=l&&0==(131072&l.__u)?1:0))for(;r>=0||u=0){if((l=n[r])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return r;r--}if(u2&&(u.children=arguments.length>3?w.call(arguments,2):e),I(t.type,u,_||t.key,i||t.ref,null)}function ht(t,n){var e={__c:n="__cC"+D++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,_;return this.getChildContext||(e=new Set,(_={})[n]=this,this.getChildContext=function(){return _},this.componentWillUnmount=function(){e=null},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.forEach((function(t){t.__e=!0,G(t)}))},this.sub=function(t){e.add(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e&&e.delete(t),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}w=A.slice,S={__e:function(t,n,e,_){for(var i,o,r;n=n.__;)if((i=n.__c)&&!i.__)try{if((o=i.constructor)&&null!=o.getDerivedStateFromError&&(i.setState(o.getDerivedStateFromError(t)),r=i.__d),null!=i.componentDidCatch&&(i.componentDidCatch(t,_||{}),r=i.__d),r)return i.__E=i}catch(n){t=n}throw t}},x=0,C=function(t){return null!=t&&null==t.constructor},q.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=L({},this.state),"function"==typeof t&&(t=t(L({},e),this.props)),t&&L(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),G(this))},q.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),G(this))},q.prototype.render=j,U=[],H="function"==typeof Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,P=function(t,n){return t.__v.__b-n.__v.__b},J.__r=0,N=0,$=et(!1),T=et(!0),D=0;var at,pt,dt,vt,yt=0,mt=[],gt=S,bt=gt.__b,kt=gt.__r,wt=gt.diffed,St=gt.__c,xt=gt.unmount,Ct=gt.__;function Ut(t,n){gt.__h&>.__h(pt,t,yt||n),yt=0;var e=pt.__H||(pt.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({}),e.__[t]}function Et(t){return yt=1,Ht(Bt,t)}function Ht(t,n,e){var _=Ut(at++,2);if(_.t=t,!_.__c&&(_.__=[e?e(n):Bt(void 0,n),function(t){var n=_.__N?_.__N[0]:_.__[0],e=_.t(n,t);n!==e&&(_.__N=[e,_.__[1]],_.__c.setState({}))}],_.__c=pt,!pt.u)){var i=function(t,n,e){if(!_.__c.__H)return!0;var i=_.__c.__H.__.filter((function(t){return!!t.__c}));if(i.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return i.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&_.__c.props===t)&&(!o||o.call(this,t,n,e))};pt.u=!0;var o=pt.shouldComponentUpdate,r=pt.componentWillUpdate;pt.componentWillUpdate=function(t,n,e){if(this.__e){var _=o;o=void 0,i(t,n,e),o=_}r&&r.call(this,t,n,e)},pt.shouldComponentUpdate=i}return _.__N||_.__}function Pt(t,n){var e=Ut(at++,3);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__H.__h.push(e))}function Nt(t,n){var e=Ut(at++,4);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__h.push(e))}function $t(t){return yt=5,Dt((function(){return{current:t}}),[])}function Tt(t,n,e){yt=6,Nt((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Dt(t,n){var e=Ut(at++,7);return qt(e.__H,n)&&(e.__=t(),e.__H=n,e.__h=t),e.__}function Mt(t,n){return yt=8,Dt((function(){return t}),n)}function At(t){var n=pt.context[t.__c],e=Ut(at++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(pt)),n.props.value):t.__}function Ft(t,n){gt.useDebugValue&>.useDebugValue(n?n(t):t)}function Wt(t){var n=Ut(at++,10),e=Et();return n.__=t,pt.componentDidCatch||(pt.componentDidCatch=function(t,_){n.__&&n.__(t,_),e[1](t)}),[e[0],function(){e[1](void 0)}]}function Lt(){var t=Ut(at++,11);if(!t.__){for(var n=pt.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Ot(){for(var t;t=mt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(Vt),t.__H.__h.forEach(jt),t.__H.__h=[]}catch(n){t.__H.__h=[],gt.__e(n,t.__v)}}gt.__b=function(t){pt=null,bt&&bt(t)},gt.__=function(t,n){t&&n.__k&&n.__k.__m&&(t.__m=n.__k.__m),Ct&&Ct(t,n)},gt.__r=function(t){kt&&kt(t),at=0;var n=(pt=t.__c).__H;n&&(dt===pt?(n.__h=[],pt.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.i=t.__N=void 0}))):(n.__h.forEach(Vt),n.__h.forEach(jt),n.__h=[],at=0)),dt=pt},gt.diffed=function(t){wt&&wt(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==mt.push(n)&&vt===gt.requestAnimationFrame||((vt=gt.requestAnimationFrame)||It)(Ot)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.i=void 0}))),dt=pt=null},gt.__c=function(t,n){n.some((function(t){try{t.__h.forEach(Vt),t.__h=t.__h.filter((function(t){return!t.__||jt(t)}))}catch(r){n.some((function(t){t.__h&&(t.__h=[])})),n=[],gt.__e(r,t.__v)}})),St&&St(t,n)},gt.unmount=function(t){xt&&xt(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{Vt(t)}catch(t){n=t}})),e.__H=void 0,n&>.__e(n,e.__v))};var Rt="function"==typeof requestAnimationFrame;function It(t){var n,e=function(){clearTimeout(_),Rt&&cancelAnimationFrame(n),setTimeout(t)},_=setTimeout(e,100);Rt&&(n=requestAnimationFrame(e))}function Vt(t){var n=pt,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),pt=n}function jt(t){var n=pt;t.__c=t.__(),pt=n}function qt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function Bt(t,n){return"function"==typeof n?n(t):n}function zt(t,n){S[t]=n.bind(null,S[t]||(()=>{}))}let Gt,Jt;function Kt(t){if(Jt)Jt();Jt=t&&t.S()}function Qt({data:t}){const n=Yt(t);n.value=t;const e=Dt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{var t;if(!C(e.peek())&&3===(null==(t=this.base)?void 0:t.nodeType))this.base.data=e.peek();else{this.__$f|=1;this.setState({})}};return v(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}Qt.displayName="_st";Object.defineProperties(f.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:Qt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});zt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let _ in e){if("children"===_)continue;let i=e[_];if(i instanceof f){if(!t)n.__np=t={};t[_]=i;e[_]=i.peek()}}}t(n)});zt("__r",(t,n)=>{Kt();let e,_=n.__c;if(_){_.__$f&=-2;e=_.__$u;if(void 0===e)_.__$u=e=function(t){let n;k((function(){n=this}));n.c=()=>{_.__$f|=1;_.setState({})};return n}()}Gt=_;Kt(e);t(n)});zt("__e",(t,n,e,_)=>{Kt();Gt=void 0;t(n,e,_)});zt("diffed",(t,n)=>{Kt();Gt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,_=n.props;if(t){let n=e.U;if(n)for(let e in n){let _=n[e];if(void 0!==_&&!(e in t)){_.d();n[e]=void 0}}else{n={};e.U=n}for(let i in t){let o=n[i],r=t[i];if(void 0===o){o=Xt(e,i,r,_);n[i]=o}else o.o(r,_)}}}t(n)});function Xt(t,n,e,_){const i=n in t&&void 0===t.ownerSVGElement,o=c(e);return{o:(t,n)=>{o.value=t;_=n},d:k(()=>{const e=o.value.value;if(_[n]!==e){_[n]=e;if(i)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}zt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});zt("__h",(t,n,e,_)=>{if(_<3||9===_)n.__$f|=2;t(n,e,_)});q.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let _ in n)return!0;for(let _ in t)if("__source"!==_&&t[_]!==this.props[_])return!0;for(let _ in this.props)if(!(_ in t))return!0;return!1};function Yt(t){return Dt(()=>c(t),[])}function Zt(t){const n=$t(t);n.current=t;Gt.__$f|=4;return Dt(()=>v(()=>n.current()),[])}function tn(t){const n=$t(t);n.current=t;Pt(()=>k(()=>n.current()),[])}var nn=function(t,n,e,_){var i;n[0]=0;for(var o=1;o=5&&((i||!t&&5===_)&&(r.push(_,0,i,e),_=6),t&&(r.push(_,t,0,e),_=6)),i=""},l=0;l"===n?(_=1,i=""):i=n+i[0]:o?n===o?o="":i+=n:'"'===n||"'"===n?o=n:">"===n?(u(),_=1):_&&("="===n?(_=5,e=i,i=""):"/"===n&&(_<5||">"===t[l][s+1])?(u(),3===_&&(r=r[0]),_=r,(r=r[0]).push(2,0,_),_=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),_=2):i+=n),3===_&&"!--"===i&&(_=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var on=_n.bind(R);export{q as Component,j as Fragment,f as Signal,e as batch,ct as cloneElement,v as computed,ht as createContext,R as createElement,V as createRef,k as effect,R as h,on as html,ft as hydrate,C as isValidElement,S as options,st as render,c as signal,Y as toChildArray,o as untracked,Mt as useCallback,Zt as useComputed,At as useContext,Ft as useDebugValue,Pt as useEffect,Wt as useErrorBoundary,Lt as useId,Tt as useImperativeHandle,Nt as useLayoutEffect,Dt as useMemo,Ht as useReducer,$t as useRef,Yt as useSignal,tn as useSignalEffect,Et as useState}; From fbc98b748e7b075e327bcf13237057f647678049 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Tue, 15 Oct 2024 15:54:55 +0500 Subject: [PATCH 08/10] sampling : add XTC sampler (#9742) * Initial XTC commit Adds XTC sampler, not activated by default, but recommended settings by default. * Cleanup * Simplified chances calculation To be more inline with the original implementation, chance is calculated once at the beginning. * First fixes by comments Still need to look into sorting * Fixed trailing backspaces * Fixed RNG to be reproduceable Thanks to @slaren for directions * Fixed forgotten header * Moved `min_keep` Moved from conditions to a simple check at the end. * Fixed broken randomization Thanks to @slaren for explanation * Swapped sorting for a custom algorithm Shifts tokens to remove the penalized ones, then puts the penalized at the back. Should make `min_keep` still viable. * Algorithm rework 1. Scan token from top till the first non-penalizable 2. Remove the last captured token (the least probable above threshold) 3. Shift all tokens to override the remaining penalizable 4. Penalize and put them at the the bottom. * Added XTC to `test-sampling` * Simplified algorithm and more tests * Updated info in common and args * Merged back lost commits in common and arg * Update dump info in common * Fixed incorrect min_keep check * Added XTC to README * Renamed parameters, fixed info and defaults * probability is at 0 by default, but XTC is included in sampling queue * threshold higher than 0.5 switches XTC off * Initial server support * Added XTC to server UIs * Fixed labels in old server UI * Made algorithm safer and more readable * Removed xtc_threshold_max * Fixed arg after update * Quick fixes by comments * Simplified algorithm since threshold_max is removed * Renamed random distribution * Fixed tests and outdated README * Small fixes --- common/arg.cpp | 14 ++++ common/common.cpp | 2 + common/common.h | 6 ++ common/sampling.cpp | 13 +++- examples/main/README.md | 13 ++++ examples/server/public/index-new.html | 6 ++ examples/server/public/index.html | 4 ++ examples/server/server.cpp | 4 ++ include/llama.h | 3 + src/llama-sampling.cpp | 95 +++++++++++++++++++++++++++ tests/test-sampling.cpp | 45 +++++++++++-- 11 files changed, 195 insertions(+), 10 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 8969fc107..d6a8e1f6f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -947,6 +947,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sparams.tfs_z = std::stof(value); } ).set_sparam()); + add_opt(common_arg( + {"--xtc-probability"}, "N", + string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability), + [](common_params & params, const std::string & value) { + params.sparams.xtc_probability = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--xtc-threshold"}, "N", + string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold), + [](common_params & params, const std::string & value) { + params.sparams.xtc_threshold = std::stof(value); + } + ).set_sparam()); add_opt(common_arg( {"--typical"}, "N", string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), diff --git a/common/common.cpp b/common/common.cpp index 451307b55..c08f01b42 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2104,6 +2104,8 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); + fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability); + fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold); fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); diff --git a/common/common.h b/common/common.h index 5507b1c59..df2ee6bd4 100644 --- a/common/common.h +++ b/common/common.h @@ -90,6 +90,8 @@ enum common_sampler_type { COMMON_SAMPLER_TYPE_TFS_Z = 4, COMMON_SAMPLER_TYPE_TYPICAL_P = 5, COMMON_SAMPLER_TYPE_TEMPERATURE = 6, + COMMON_SAMPLER_TYPE_XTC = 7, + }; // dimensionality reduction methods, used by cvector-generator @@ -108,6 +110,8 @@ struct common_sampler_params { int32_t top_k = 40; // <= 0 to use vocab size float top_p = 0.95f; // 1.0 = disabled float min_p = 0.05f; // 0.0 = disabled + float xtc_probability = 0.00f; // 0.0 = disabled + float xtc_threshold = 0.10f; // > 0.5 disables XTC float tfs_z = 1.00f; // 1.0 = disabled float typ_p = 1.00f; // typical_p, 1.0 = disabled float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities @@ -124,12 +128,14 @@ struct common_sampler_params { bool ignore_eos = false; bool no_perf = false; // disable performance metrics + std::vector samplers = { COMMON_SAMPLER_TYPE_TOP_K, COMMON_SAMPLER_TYPE_TFS_Z, COMMON_SAMPLER_TYPE_TYPICAL_P, COMMON_SAMPLER_TYPE_TOP_P, COMMON_SAMPLER_TYPE_MIN_P, + COMMON_SAMPLER_TYPE_XTC, COMMON_SAMPLER_TYPE_TEMPERATURE }; diff --git a/common/sampling.cpp b/common/sampling.cpp index cd49ade69..fb95bcd3b 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -130,10 +130,10 @@ std::string common_sampler_params::print() const { snprintf(result, sizeof(result), "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" - "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n" + "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n" "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", penalty_last_n, penalty_repeat, penalty_freq, penalty_present, - top_k, tfs_z, top_p, min_p, typ_p, temp, + top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp, mirostat, mirostat_eta, mirostat_tau); return std::string(result); @@ -184,6 +184,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co case COMMON_SAMPLER_TYPE_MIN_P: llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); break; + case COMMON_SAMPLER_TYPE_XTC: + llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); + break; case COMMON_SAMPLER_TYPE_TFS_Z: llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); break; @@ -372,6 +375,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_TOP_P: return 'p'; case COMMON_SAMPLER_TYPE_MIN_P: return 'm'; case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't'; + case COMMON_SAMPLER_TYPE_XTC: return 'x'; default : return '?'; } } @@ -384,6 +388,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_TOP_P: return "top_p"; case COMMON_SAMPLER_TYPE_MIN_P: return "min_p"; case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature"; + case COMMON_SAMPLER_TYPE_XTC: return "xtc"; default : return ""; } } @@ -396,6 +401,7 @@ std::vector common_sampler_types_from_names(const std::vect { "min_p", COMMON_SAMPLER_TYPE_MIN_P }, { "tfs_z", COMMON_SAMPLER_TYPE_TFS_Z }, { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE }, + { "xtc", COMMON_SAMPLER_TYPE_XTC }, }; // since samplers names are written multiple ways @@ -441,7 +447,8 @@ std::vector common_sampler_types_from_chars(const std::stri { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P }, - { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE } + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC } }; std::vector samplers; diff --git a/examples/main/README.md b/examples/main/README.md index f0c3031ab..620934dad 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -241,6 +241,19 @@ The `--mirostat-ent` option sets the Mirostat target entropy (tau), which repres Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0` +### XTC Sampling + +- `--xtc-probability N`: Sets the chance for token removal (checked once on sampler start) (default: 0.0). +- `--xtc-threshold N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1). + +Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-probability` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one. + +By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models. + +Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`. + +Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1` + ### Logit Bias - `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion. diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html index c87dd8f1e..ad4183cd9 100644 --- a/examples/server/public/index-new.html +++ b/examples/server/public/index-new.html @@ -43,6 +43,8 @@ top_k: 0, // <= 0 to use vocab size top_p: 1.0, // 1.0 = disabled min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4 + xtc_probability: 0.0, // 0 = disabled; + xtc_threshold: 0.1, // > 0.5 disables XTC; tfs_z: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled presence_penalty: 0.0, // 0.0 = disabled @@ -836,6 +838,8 @@ return html` ${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })} ${FloatField({ label: "Frequency Penalty", title: "A penalty that is applied based on the frequency with which certain tokens occur in the training data set. A higher value results in rare tokens being favoured.", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} ${FloatField({ label: "Typical-P", title: "Activates local typical sampling, a method used to limit the prediction of tokens that are atypical in the current context. The parameter p controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} + ${FloatField({ label: "XTC probability", title: "Sets the chance for token removal (checked once on sampler start)", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })} + ${FloatField({ label: "XTC threshold", title: "Sets a minimum probability threshold for tokens to be removed", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })} ${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })} @@ -1132,6 +1136,8 @@ document.addEventListener('DOMContentLoaded', (event) => { const snapSettings = { temperature: { snapValue: 1.0, snapRangeMultiplier: 6 }, min_p: { snapValue: 0.05, snapRangeMultiplier: 2 }, + xtc_probability: { snapValue: 0.0, snapRangeMultiplier: 4 }, + xtc_threshold: { snapValue: 0.5, snapRangeMultiplier: 4 }, top_p: { snapValue: 1.0, snapRangeMultiplier: 4 }, tfs_z: { snapValue: 1.0, snapRangeMultiplier: 4 }, typical_p: { snapValue: 1.0, snapRangeMultiplier: 4 }, diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 07fec6a38..88065705f 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -307,6 +307,8 @@ top_k: 40, // <= 0 to use vocab size top_p: 0.95, // 1.0 = disabled min_p: 0.05, // 0 = disabled + xtc_probability: 0.0, // 0 = disabled; + xtc_threshold: 0.1, // > 0.5 disables XTC; tfs_z: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled presence_penalty: 0.0, // 0.0 = disabled @@ -1013,6 +1015,8 @@ ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })} ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} + ${FloatField({ label: "XTC probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })} + ${FloatField({ label: "XTC threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 18bcad3f0..8d4380e12 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -863,6 +863,8 @@ struct server_context { slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k); slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p); + slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability); + slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold); slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p); slot.sparams.temp = json_value(data, "temperature", default_sparams.temp); @@ -1196,6 +1198,8 @@ struct server_context { {"top_k", slot.sparams.top_k}, {"top_p", slot.sparams.top_p}, {"min_p", slot.sparams.min_p}, + {"xtc_probability", slot.sparams.xtc_probability}, + {"xtc_threshold", slot.sparams.xtc_threshold}, {"tfs_z", slot.sparams.tfs_z}, {"typical_p", slot.sparams.typ_p}, {"repeat_last_n", slot.sparams.penalty_last_n}, diff --git a/include/llama.h b/include/llama.h index 9110b5956..92d4c70c1 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1101,6 +1101,9 @@ extern "C" { /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772. LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent); + /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 + LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed); + /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index e255a8fc4..67a78c3ac 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1059,6 +1059,101 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, floa }; } +// xtc + +struct llama_sampler_xtc { + const float probability; + const float threshold; + const size_t min_keep; + + const uint32_t seed; + uint32_t seed_cur; + + std::mt19937 rng; +}; + +static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) { + return "xtc"; +} + +static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_xtc *) smpl->ctx; + + if (ctx->probability <= 0.0f + || ctx->threshold > 0.5f + || cur_p->size < 2) { + return; + } + + std::uniform_real_distribution distribution(0.0f, 1.0f); + float chance = distribution(ctx->rng); + if (chance > ctx->probability) return; + + // in case it's not sorted/recalculated yet + llama_sampler_softmax_impl(cur_p); + + int pos_last = 0; + + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].p >= ctx->threshold) { + pos_last = i; + } else break; + } + + if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) { + cur_p->data += pos_last; + cur_p->size -= pos_last; + } +} + +static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_xtc *) smpl->ctx; + auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->min_keep, ctx->seed); + + // copy the state + { + auto * result_ctx = (llama_sampler_xtc *) result->ctx; + + result_ctx->rng = ctx->rng; + } + + return result; +} + +static void llama_sampler_xtc_free(struct llama_sampler * smpl) { + delete (llama_sampler_xtc *) smpl->ctx; +} + +static void llama_sampler_xtc_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_xtc *) smpl->ctx; + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); +} + +static struct llama_sampler_i llama_sampler_xtc_i = { + /* .name = */ llama_sampler_xtc_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sample_xtc_apply, + /* .reset = */ llama_sampler_xtc_reset, + /* .clone = */ llama_sampler_xtc_clone, + /* .free = */ llama_sampler_xtc_free, +}; + +struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) { + auto seed_cur = get_rng_seed(seed); + return new llama_sampler { + /* .iface = */ &llama_sampler_xtc_i, + /* .ctx = */ new llama_sampler_xtc { + /* .probability = */ p, + /* .threshold = */ t, + /* .min_keep = */ min_keep, + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .rng = */ std::mt19937(seed_cur), + }, + }; +} + // mirostat struct llama_sampler_mirostat { diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 6e021c4c7..1372bdf13 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -111,6 +111,28 @@ static void test_min_p(const std::vector & probs, const std::vector & probs, const std::vector & expected_probs, float p, float t) { + const size_t n_vocab = probs.size(); + + std::vector cur; + cur.reserve(n_vocab); + for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { + const float logit = logf(probs[token_id]); + cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); + } + + llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; + APPLY(llama_sampler_init_softmax(), &cur_p); + DUMP(&cur_p); + APPLY(llama_sampler_init_xtc(p, t, 0, 0), &cur_p); + DUMP(&cur_p); + + GGML_ASSERT(cur_p.size == expected_probs.size()); + for (size_t i = 0; i < cur_p.size; i++) { + GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5); + } +} + static void test_typical(const std::vector & probs, const std::vector & expected_probs, float p) { const size_t n_vocab = probs.size(); @@ -263,7 +285,7 @@ static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vec } const int64_t t_end = ggml_time_us(); llama_sampler_free(cnstr); - printf("%-42s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter); + printf("%-43s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter); } #define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter)) @@ -279,12 +301,13 @@ static void test_perf() { data.emplace_back(llama_token_data{i, logit, 0.0f}); } - BENCH(llama_sampler_init_top_k (40), data, 32); - BENCH(llama_sampler_init_top_p (0.8f, 1), data, 32); - BENCH(llama_sampler_init_min_p (0.2f, 1), data, 32); - BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32); - BENCH(llama_sampler_init_typical (0.5f, 1), data, 32); - BENCH(llama_sampler_init_softmax (), data, 32); + BENCH(llama_sampler_init_top_k (40), data, 32); + BENCH(llama_sampler_init_top_p (0.8f, 1), data, 32); + BENCH(llama_sampler_init_min_p (0.2f, 1), data, 32); + BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32); + BENCH(llama_sampler_init_typical (0.5f, 1), data, 32); + BENCH(llama_sampler_init_xtc (1.0f, 0.1f, 1, 1), data, 32); + BENCH(llama_sampler_init_softmax (), data, 32); } int main(void) { @@ -309,6 +332,14 @@ int main(void) { test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f); test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f); + printf("XTC should:\n"); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.1f}, 0.99f, 0.09f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.2f, 0.1f}, 0.99f, 0.19f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.3f, 0.2f, 0.1f}, 0.99f, 0.29f); + + printf("XTC should not:\n"); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.39f); + test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f); From 223c25a72fcc3f65cdfd7f5d57edd5b44b550e18 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Oct 2024 16:28:55 +0300 Subject: [PATCH 09/10] server : improve infill context reuse (#9894) ggml-ci --- examples/server/README.md | 10 +++--- examples/server/server.cpp | 73 ++++++++++++++------------------------ 2 files changed, 33 insertions(+), 50 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index eb0a7b32e..fcdb02afd 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -524,10 +524,12 @@ Takes a prefix and a suffix and returns the predicted completion as stream. - `input_prefix`: Set the prefix of the code to infill. - `input_suffix`: Set the suffix of the code to infill. -- `prompt`: Added after the `FIM_MID` token -- `extra_context`: Additional context inserted before the FIM prefix. See https://github.com/ggerganov/llama.cpp/pull/9874 +- `input_extra`: Additional context inserted before the FIM prefix. +- `prompt`: Added after the `FIM_MID` token -It also accepts all the options of `/completion`. +`input_extra` is array of `{"filename": string, "text": string}` objects. + +The endpoint also accepts all the options of `/completion`. If the model has `FIM_REPO` and `FIM_FILE_SEP` tokens, the [repo-level pattern](https://arxiv.org/pdf/2409.12186) is used: @@ -545,7 +547,7 @@ If the model has `FIM_REPO` and `FIM_FILE_SEP` tokens, the [repo-level pattern]( If the tokens are missing, then the extra context is simply prefixed at the start: ```txt -[extra_context][input_prefix][input_suffix][prompt] +[input_extra][input_prefix][input_suffix][prompt] ``` ### **GET** `/props`: Get server global properties. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8d4380e12..d53cca84c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -136,10 +136,6 @@ struct slot_params { int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit std::vector antiprompt; - - json input_prefix; - json input_suffix; - json extra_context; }; struct server_slot { @@ -169,6 +165,10 @@ struct server_slot { json prompt; // can be either a string, array of strings or array of token ids + json input_prefix; + json input_suffix; + json input_extra; + // when a task is submitted, we first tokenize the prompt and store it here std::vector prompt_tokens; std::vector extra_tokens; @@ -910,12 +910,12 @@ struct server_context { } // infill - slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix); - slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix); - slot.params.extra_context = json_value(data, "extra_context", default_params.extra_context); + slot.input_prefix = json_value(data, "input_prefix", json()); + slot.input_suffix = json_value(data, "input_suffix", json()); + slot.input_extra = json_value(data, "input_extra", json()); - SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.params.extra_context.size()); - for (const auto & chunk : slot.params.extra_context) { + SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.input_extra.size()); + for (const auto & chunk : slot.input_extra) { // { "text": string, "filename": string } if (!chunk.contains("text") || !chunk["text"].is_string()) { send_error(task, "extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST); @@ -932,7 +932,7 @@ struct server_context { } // get prompt - if (task.cmpl_type != SERVER_TASK_CMPL_TYPE_INFILL) { + { const auto & prompt = data.find("prompt"); if (prompt == data.end()) { send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST); @@ -1958,6 +1958,8 @@ struct server_context { } break; case SERVER_TASK_CMPL_TYPE_INFILL: { + // TODO: optimize this block by reducing memory allocations and movement + // use FIM repo-level pattern: // ref: https://arxiv.org/pdf/2409.12186 // @@ -1968,10 +1970,11 @@ struct server_context { // extra chunk 1 // ... // [FIM_SEP]filename - // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID] + // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt // - auto prefix_tokens = tokenize(slot.params.input_prefix, false, false); - auto suffix_tokens = tokenize(slot.params.input_suffix, false, false); + auto tokens_prefix = tokenize(slot.input_prefix, false, false); + auto tokens_suffix = tokenize(slot.input_suffix, false, false); + auto tokens_prompt = tokenize(slot.prompt, false, false); slot.extra_tokens.clear(); if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) { @@ -1981,7 +1984,7 @@ struct server_context { slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end()); } - for (const auto & chunk : slot.params.extra_context) { + for (const auto & chunk : slot.input_extra) { // { "text": string, "filename": string } const std::string text = chunk.value("text", ""); const std::string filename = chunk.value("filename", "tmp"); @@ -2012,20 +2015,21 @@ struct server_context { } // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?) - const int n_suffix_take = std::min(suffix_tokens.size(), (n_batch)/4); - const int n_prefix_take = std::min(prefix_tokens.size(), (n_batch - 3) - n_suffix_take); + const int n_suffix_take = std::min(tokens_suffix.size(), (n_batch/4)); + const int n_prefix_take = std::min(tokens_prefix.size(), 3*(n_batch/4) - 3); // fill the rest of the context with extra chunks const int n_extra_take = std::min(std::max(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size()); - prefix_tokens.erase(prefix_tokens.begin(), prefix_tokens.begin() + prefix_tokens.size() - n_prefix_take); - suffix_tokens.resize(n_suffix_take); + tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take); + tokens_suffix.resize(n_suffix_take); - prefix_tokens.insert(prefix_tokens.begin(), llama_token_fim_pre(model)); - suffix_tokens.insert(suffix_tokens.begin(), llama_token_fim_suf(model)); + tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model)); + tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end()); + tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model)); - auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens; - auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens; + auto embd_inp = params.spm_infill ? tokens_suffix : tokens_prefix; + auto embd_end = params.spm_infill ? tokens_prefix : tokens_suffix; if (llama_add_bos_token(model)) { embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); @@ -2140,40 +2144,17 @@ struct server_context { while (head_c < slot.cache_tokens.size() && head_p < prompt_tokens.size()) { - if (llama_token_is_control(model, slot.cache_tokens[head_c]) && - slot.cache_tokens[head_c] != llama_token_fim_rep(model) && - slot.cache_tokens[head_c] != llama_token_fim_sep(model)) { - break; - } - - if (llama_token_is_control(model, prompt_tokens[head_p]) && - prompt_tokens[head_p] != llama_token_fim_rep(model) && - prompt_tokens[head_p] != llama_token_fim_sep(model)) { - break; - } size_t n_match = 0; - while (head_c + n_match < slot.cache_tokens.size() && head_p + n_match < prompt_tokens.size() && slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) { - if (llama_token_is_control(model, slot.cache_tokens[head_c + n_match]) && - slot.cache_tokens[head_c + n_match] != llama_token_fim_rep(model) && - slot.cache_tokens[head_c + n_match] != llama_token_fim_sep(model)) { - break; - } - - if (llama_token_is_control(model, prompt_tokens[head_p + n_match]) && - prompt_tokens[head_p + n_match] != llama_token_fim_rep(model) && - prompt_tokens[head_p + n_match] != llama_token_fim_sep(model)) { - break; - } n_match++; } if (n_match >= (size_t) params.n_cache_reuse) { - SLT_DBG(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); + SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); //for (size_t i = head_p; i < head_p + n_match; i++) { // SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); //} From 755a9b2bf00fbae988e03a47e852b66eaddd113a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Oct 2024 16:35:33 +0300 Subject: [PATCH 10/10] llama : add infill sampler (#9896) ggml-ci --- common/common.h | 4 +- common/sampling.cpp | 9 +- examples/main/main.cpp | 46 +++++----- include/llama.h | 28 ++++++ src/llama-sampling.cpp | 201 +++++++++++++++++++++++++++++++++++++++++ src/llama-sampling.h | 5 +- src/llama-vocab.cpp | 17 ++++ src/llama-vocab.h | 8 +- src/llama.cpp | 11 +++ 9 files changed, 300 insertions(+), 29 deletions(-) diff --git a/common/common.h b/common/common.h index df2ee6bd4..5ca8fd391 100644 --- a/common/common.h +++ b/common/common.h @@ -91,7 +91,7 @@ enum common_sampler_type { COMMON_SAMPLER_TYPE_TYPICAL_P = 5, COMMON_SAMPLER_TYPE_TEMPERATURE = 6, COMMON_SAMPLER_TYPE_XTC = 7, - + COMMON_SAMPLER_TYPE_INFILL = 8, }; // dimensionality reduction methods, used by cvector-generator @@ -136,7 +136,7 @@ struct common_sampler_params { COMMON_SAMPLER_TYPE_TOP_P, COMMON_SAMPLER_TYPE_MIN_P, COMMON_SAMPLER_TYPE_XTC, - COMMON_SAMPLER_TYPE_TEMPERATURE + COMMON_SAMPLER_TYPE_TEMPERATURE, }; std::string grammar; // optional BNF-like grammar to constrain sampling diff --git a/common/sampling.cpp b/common/sampling.cpp index fb95bcd3b..56cd0df6b 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -196,6 +196,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co case COMMON_SAMPLER_TYPE_TEMPERATURE: llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); break; + case COMMON_SAMPLER_TYPE_INFILL: + llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model)); + break; default: GGML_ASSERT(false && "unknown sampler type"); } @@ -376,6 +379,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_MIN_P: return 'm'; case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't'; case COMMON_SAMPLER_TYPE_XTC: return 'x'; + case COMMON_SAMPLER_TYPE_INFILL: return 'i'; default : return '?'; } } @@ -389,6 +393,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_MIN_P: return "min_p"; case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature"; case COMMON_SAMPLER_TYPE_XTC: return "xtc"; + case COMMON_SAMPLER_TYPE_INFILL: return "infill"; default : return ""; } } @@ -402,6 +407,7 @@ std::vector common_sampler_types_from_names(const std::vect { "tfs_z", COMMON_SAMPLER_TYPE_TFS_Z }, { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE }, { "xtc", COMMON_SAMPLER_TYPE_XTC }, + { "infill", COMMON_SAMPLER_TYPE_INFILL }, }; // since samplers names are written multiple ways @@ -448,7 +454,8 @@ std::vector common_sampler_types_from_chars(const std::stri { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }, - { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC } + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL }, }; std::vector samplers; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index fb10c20c5..65483c45f 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -569,30 +569,30 @@ int main(int argc, char ** argv) { if (!params.ctx_shift){ LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__); break; - } else { - if (params.n_predict == -2) { - LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); - break; - } - - const int n_left = n_past - params.n_keep; - const int n_discard = n_left/2; - - LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", - n_past, n_left, n_ctx, params.n_keep, n_discard); - - llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); - - n_past -= n_discard; - - LOG_DBG("after swap: n_past = %d\n", n_past); - - LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); - - LOG_DBG("clear session path\n"); - path_session.clear(); } + + if (params.n_predict == -2) { + LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + break; + } + + const int n_left = n_past - params.n_keep; + const int n_discard = n_left/2; + + LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); + + llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + + n_past -= n_discard; + + LOG_DBG("after swap: n_past = %d\n", n_past); + + LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); + + LOG_DBG("clear session path\n"); + path_session.clear(); } } else { // context extension via Self-Extend diff --git a/include/llama.h b/include/llama.h index 92d4c70c1..02bc7f087 100644 --- a/include/llama.h +++ b/include/llama.h @@ -953,6 +953,12 @@ extern "C" { int32_t lstrip, bool special); + // check if token0 is contained as a prefix in token1 + LLAMA_API bool llama_token_is_prefix( + const struct llama_model * model, + llama_token token0, + llama_token token1); + /// @details Convert the provided tokens into text (inverse of llama_tokenize()). /// @param text The char pointer must be large enough to hold the resulting text. /// @return Returns the number of chars/bytes on success, no more than text_len_max. @@ -1148,6 +1154,28 @@ extern "C" { int32_t n_logit_bias, const llama_logit_bias * logit_bias); + // this sampler is meant to be used for fill-in-the-middle infilling + // it's supposed to be used after top_k + top_p sampling + // + // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG + // 2. combine probs of tokens that have the same prefix + // + // example: + // + // - before: + // "hel": 0.5 + // "hell": 0.2 + // "hello": 0.1 + // "dummy": 0.1 + // + // - after: + // "hel": 0.8 + // "dummy": 0.1 + // + // 3. discard non-EOG tokens with low prob + // 4. if no tokens are left -> pick EOT + // + LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model); // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 67a78c3ac..2e6550682 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1739,6 +1739,207 @@ struct llama_sampler * llama_sampler_init_logit_bias( }; } +// infill + +//#define GGML_DEBUG_SAMPLER_INFILL + +struct llama_sampler_infill { + const struct llama_vocab * vocab; +}; + +static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) { + return "infill"; +} + +static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_infill *) smpl->ctx; + + llama_sampler_softmax_impl(cur_p); + +#if defined(GGML_DEBUG_SAMPLER_INFILL) +#define LOG_DBG_CUR LLAMA_LOG_DEBUG +#else +#define LOG_DBG_CUR(...) +#endif + + for (size_t i = 0; i < cur_p->size; ++i) { + LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); + } + + float p_txt_sum = 0.0f; + float p_eog_sum = 0.0f; + + for (size_t i = 0; i < cur_p->size; ++i) { + if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) { + p_eog_sum += cur_p->data[i].p; + } else { + p_txt_sum += cur_p->data[i].p; + } + } + + const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat); + + LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size); + + if (3*p_eog_sum*cur_p->size > p_txt_sum) { + LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum); + + // keep just the EOG tokens + const auto size_org = cur_p->size; + + cur_p->size = 0; + + float p_sum = 0.0f; + + for (size_t i = 0; i < size_org; ++i) { + if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) { + p_sum += cur_p->data[i].p; + + cur_p->data[cur_p->size++] = cur_p->data[i]; + } + } + + // normalize probs + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= p_sum; + } + + return; + } + + size_t n_combined = 0; GGML_UNUSED(n_combined); + + // combine tokens with common prefix + for (size_t i = 0; i < cur_p->size; ++i) { + for (size_t j = 0; j < cur_p->size; ++j) { + if (cur_p->data[i].logit == -INFINITY) { + break; + } + + if (i == j || cur_p->data[j].logit == -INFINITY) { + continue; + } + + if (llama_token_is_prefix_impl(*ctx->vocab, cur_p->data[i].id, cur_p->data[j].id)) { + if (cur_p->data[i].p > cur_p->data[j].p) { + cur_p->data[i].p += cur_p->data[j].p; + cur_p->data[j].logit = -INFINITY; + cur_p->data[j].p = 0.0f; + } else { + cur_p->data[j].p += cur_p->data[i].p; + cur_p->data[i].logit = -INFINITY; + cur_p->data[i].p = 0.0f; + } + + n_combined++; + } + } + } + + size_t n_non_eog = 0; + + size_t size_org = cur_p->size; + + float p_sum = 0.0f; + float thold = 0.2f; + + cur_p->size = 0; + + LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold); + + for (size_t i = 0; i < size_org; ++i) { + const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id); + + if (cur_p->data[i].p < thold && !is_eog) { + continue; + } + + if (!is_eog) { + ++n_non_eog; + } + + p_sum += cur_p->data[i].p; + + // keep this token + cur_p->data[cur_p->size++] = cur_p->data[i]; + } + + LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog); + + // if no non-EOG tokens are left -> reduce cur_p to single EOT token + if (n_non_eog == 0) { + cur_p->size = 1; + cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab); + cur_p->data[0].logit = 1.0f; + + return; + } + + // normalize probs + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= p_sum; + + LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); + } + + size_org = cur_p->size; + p_sum = 0.0f; + thold = 1.0/(n_non_eog + 1); + + cur_p->size = 0; + + LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold); + + for (size_t i = 0; i < size_org; ++i) { + const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id); + + if (cur_p->data[i].p < thold && !is_eog) { + continue; + } + + p_sum += cur_p->data[i].p; + + cur_p->data[cur_p->size++] = cur_p->data[i]; + } + + // normalize probs + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= p_sum; + + LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); + } + +#undef LOG_DBG_CUR +} + +static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_infill *) smpl->ctx; + return llama_sampler_init_infill_impl(*ctx->vocab); +} + +static void llama_sampler_infill_free(struct llama_sampler * smpl) { + delete (llama_sampler_infill *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_infill_i = { + /* .name = */ llama_sampler_infill_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_infill_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_infill_clone, + /* .free = */ llama_sampler_infill_free, +}; + +struct llama_sampler * llama_sampler_init_infill_impl( + const struct llama_vocab & vocab) { + return new llama_sampler { + /* .iface = */ &llama_sampler_infill_i, + /* .ctx = */ new llama_sampler_infill { + /* .vocab = */ &vocab, + }, + }; +} + // utils uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) { diff --git a/src/llama-sampling.h b/src/llama-sampling.h index d90b14713..2683f1b92 100644 --- a/src/llama-sampling.h +++ b/src/llama-sampling.h @@ -4,8 +4,6 @@ #include "llama-grammar.h" -#include - struct llama_vocab; struct llama_grammar; @@ -27,3 +25,6 @@ struct llama_sampler * llama_sampler_init_grammar_impl( const struct llama_vocab & vocab, const char * grammar_str, const char * grammar_root); + +struct llama_sampler * llama_sampler_init_infill_impl( + const struct llama_vocab & vocab); diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index a27394a37..070de9365 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1858,6 +1858,23 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token return 0; } +bool llama_token_is_prefix_impl( + const struct llama_vocab & vocab, + llama_token token0, + llama_token token1) { + char text_buf_0[128]; + char text_buf_1[128]; + + const int32_t len0 = llama_token_to_piece_impl(vocab, token0, text_buf_0, sizeof(text_buf_0) - 1, 0, false); + const int32_t len1 = llama_token_to_piece_impl(vocab, token1, text_buf_1, sizeof(text_buf_1) - 1, 0, false); + + if (len0 <= 0 || len1 <= 0) { + return false; + } + + return len0 <= len1 && memcmp(text_buf_0, text_buf_1, len0) == 0; +} + int32_t llama_detokenize_impl( const struct llama_vocab & vocab, const llama_token * tokens, diff --git a/src/llama-vocab.h b/src/llama-vocab.h index 17e14488a..d958d0073 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -48,7 +48,7 @@ struct llama_vocab { id special_cls_id = LLAMA_TOKEN_NULL; id special_mask_id = LLAMA_TOKEN_NULL; - id linefeed_id = 13; + id linefeed_id = 13; // fim tokens id special_fim_pre_id = LLAMA_TOKEN_NULL; @@ -149,6 +149,12 @@ int32_t llama_token_to_piece_impl( int32_t lstrip, bool special); +// check if token0 is contained as a prefix in token1 +bool llama_token_is_prefix_impl( + const struct llama_vocab & vocab, + llama_token token0, + llama_token token1); + int32_t llama_detokenize_impl( const struct llama_vocab & vocab, const llama_token * tokens, diff --git a/src/llama.cpp b/src/llama.cpp index 511f91802..8d44c73c8 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -21500,6 +21500,13 @@ int32_t llama_token_to_piece( return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special); } +bool llama_token_is_prefix( + const struct llama_model * model, + llama_token token0, + llama_token token1) { + return llama_token_is_prefix_impl(model->vocab, token0, token1); +} + int32_t llama_detokenize( const struct llama_model * model, const llama_token * tokens, @@ -21830,6 +21837,10 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * mod return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root); } +struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model) { + return llama_sampler_init_infill_impl(model->vocab); +} + // // model split //