diff --git a/examples/server/server.cpp b/examples/server/server.cpp index e049927d0..fc382c68d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1955,11 +1955,6 @@ struct server_context { // reuse any previously computed tokens that are common with the new prompt slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens); - // push the prompt into the sampling context (do not apply grammar) - for (int i = 0; i < slot.n_past; ++i) { - common_sampler_accept(slot.smpl, slot.cache_tokens[i], false); - } - // reuse chunks from the cached prompt by shifting their KV cache in the new position if (params.n_cache_reuse > 0) { size_t head_c = slot.n_past; // cache @@ -1991,9 +1986,6 @@ struct server_context { for (size_t i = 0; i < n_match; i++) { slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; - - common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false); - slot.n_past++; } @@ -2074,6 +2066,11 @@ struct server_context { GGML_ASSERT(batch.n_tokens > 0); + // Process all prompt tokens through sampler system + for (int i = 0; i < slot.n_prompt_tokens; ++i) { + common_sampler_accept(slot.smpl, slot.prompt_tokens[i], false); + } + // extract the logits only for the last token batch.logits[batch.n_tokens - 1] = true;