From 7cd5a1f986e9821b14bf40f04d24153ca2339185 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Thu, 7 Mar 2024 13:52:58 -0500
Subject: [PATCH] server : fix cache_tokens not getting correctly resized

Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.

For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
---
 examples/server/server.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index a920f2d92..23c5189a5 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1797,9 +1797,6 @@ struct server_context {
                     // reuse any previously computed tokens that are common with the new prompt
                     slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
 
-                    // remove the non-common part from the cache
-                    slot.cache_tokens.resize(slot.n_past);
-
                     // push the prompt into the sampling context (do not apply grammar)
                     for (int i = 0; i < slot.n_past; ++i) {
                         llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false);
@@ -1846,11 +1843,13 @@ struct server_context {
                     slot.n_past    = 0;
                     slot.n_past_se = 0;
                     slot.ga_i      = 0;
-                    slot.cache_tokens.clear();
                     // TODO: is the system prompt ever in the sampling context?
                     llama_sampling_reset(slot.ctx_sampling);
                 }
 
+                // remove the non-common part from the cache
+                slot.cache_tokens.resize(slot.n_past);
+
                 LOG_INFO("kv cache rm [p0, end)", {
                     { "id_slot", slot.id },
                     { "id_task", slot.id_task },
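
For illustration, a minimal standalone sketch of the bookkeeping issue described
in the commit message. It is not code from server.cpp: the Slot struct, the
common_part helper, and the token values below are simplified stand-ins, and only
the relative order of the n_past decrement and the cache_tokens resize mirrors
the real logic.

    // Sketch: if cache_tokens is resized *before* the "evaluate at least 1 token"
    // adjustment (the old order), it keeps one more token than the KV cache holds.
    // Resizing *after* the adjustment (as in this patch) keeps the two in sync.
    #include <cstdio>
    #include <vector>

    using llama_token = int;

    struct Slot {
        std::vector<llama_token> cache_tokens; // tokens the server believes are cached
        int n_past = 0;                        // tokens actually kept in the KV cache
    };

    // length of the shared prefix of two token sequences (stand-in for common_part)
    static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
        size_t i = 0;
        while (i < a.size() && i < b.size() && a[i] == b[i]) { i++; }
        return i;
    }

    int main() {
        Slot slot;
        slot.cache_tokens = {1, 2, 3, 4};               // previous request, fully cached
        std::vector<llama_token> prompt = {1, 2, 3, 4}; // identical prompt arrives

        slot.n_past = (int) common_part(slot.cache_tokens, prompt);

        if (slot.n_past == (int) prompt.size()) {
            // "we have to evaluate at least 1 token" special case: the last prompt
            // token is dropped from the KV cache so it can be re-evaluated for logits
            slot.n_past--;
        }

        // doing this resize only after the adjustment keeps cache_tokens consistent
        // with the KV cache; with the old order, size() would stay at 4 while n_past is 3
        slot.cache_tokens.resize((size_t) slot.n_past);

        printf("n_past = %d, cache_tokens.size() = %zu\n", slot.n_past, slot.cache_tokens.size());
        return 0;
    }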