server : fix cache_tokens not getting correctly resized
Previously, when the "we have to evaluate at least 1 token" special case was triggered, an extra token was kept in cache_tokens even though it had been removed from the KV cache. For Mamba, this caused unnecessary prompt reprocessing on the next request whenever the previous request had triggered that special case.
This commit is contained in:
parent
916b586386
commit
7cd5a1f986
1 changed file with 3 additions and 4 deletions
|
@ -1797,9 +1797,6 @@ struct server_context {
|
||||||
// reuse any previously computed tokens that are common with the new prompt
|
// reuse any previously computed tokens that are common with the new prompt
|
||||||
slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
|
slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
|
||||||
|
|
||||||
// remove the non-common part from the cache
|
|
||||||
slot.cache_tokens.resize(slot.n_past);
|
|
||||||
|
|
||||||
// push the prompt into the sampling context (do not apply grammar)
|
// push the prompt into the sampling context (do not apply grammar)
|
||||||
for (int i = 0; i < slot.n_past; ++i) {
|
for (int i = 0; i < slot.n_past; ++i) {
|
||||||
llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false);
|
llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false);
|
||||||
|
@ -1846,11 +1843,13 @@ struct server_context {
|
||||||
slot.n_past = 0;
|
slot.n_past = 0;
|
||||||
slot.n_past_se = 0;
|
slot.n_past_se = 0;
|
||||||
slot.ga_i = 0;
|
slot.ga_i = 0;
|
||||||
slot.cache_tokens.clear();
|
|
||||||
// TODO: is the system prompt ever in the sampling context?
|
// TODO: is the system prompt ever in the sampling context?
|
||||||
llama_sampling_reset(slot.ctx_sampling);
|
llama_sampling_reset(slot.ctx_sampling);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// remove the non-common part from the cache
|
||||||
|
slot.cache_tokens.resize(slot.n_past);
|
||||||
|
|
||||||
LOG_INFO("kv cache rm [p0, end)", {
|
LOG_INFO("kv cache rm [p0, end)", {
|
||||||
{ "id_slot", slot.id },
|
{ "id_slot", slot.id },
|
||||||
{ "id_task", slot.id_task },
|
{ "id_task", slot.id_task },
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue