From 4a471b12d6026335787b22d88615370da5de509d Mon Sep 17 00:00:00 2001 From: Paulo Date: Thu, 2 May 2024 21:28:47 -0300 Subject: [PATCH] don't shift if there's no truncation --- examples/server/server.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index de76f482a..071ae2463 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -103,7 +103,7 @@ struct slot_params { uint32_t seed = -1; // RNG seed int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half - int32_t n_truncate = 0; + int32_t n_truncate = 0; // number of tokens after n_keep that will be discarded when the prompt is bigger than the context int32_t n_predict = -1; // new tokens to predict std::vector antiprompt; @@ -192,7 +192,7 @@ struct server_slot { int32_t ga_n = 1; // group-attention factor int32_t ga_w = 512; // group-attention width - int32_t n_past_se = 0; // self-extend + int32_t n_past_se = 0; // self-extend // stats size_t n_sent_text = 0; // number of sent text character @@ -2057,7 +2057,12 @@ struct server_context { {"new_cache_size", new_cache_size}, {"cache_tokens", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cend())}, }); - } // else somebody trying to use n_truncate w/o previous cache + } else { + LOG_ERROR("n_truncate needs to be used with cache_prompt", { + {"id_slot", slot.id}, + {"id_task", slot.id_task}, + }); + } } GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); @@ -2074,7 +2079,7 @@ struct server_context { // reuse any previously computed tokens that are common with the new prompt slot.n_past = common_part(slot.cache_tokens, prompt_tokens); - LOG_INFO("[cached_tokens, prompt_tokens]", { + LOG_INFO("[cache_tokens, prompt_tokens]", { { "id_slot", slot.id }, { "id_task", slot.id_task }, { "common_part", slot.n_past} @@ -2113,7 +2118,7 @@ struct server_context { // shift KV cache if needed const int n_keep = slot.params.n_keep + add_bos_token; const int n_truncate = slot.params.n_truncate; - if (n_truncate && slot.params.cache_prompt) { + if (n_truncate && slot.params.cache_prompt && slot.truncated) { llama_kv_cache_seq_rm(ctx, slot.id + 1, n_keep, n_keep + n_truncate); LOG_INFO("kv cache rm", {