don't shift if there's no truncation
This commit is contained in:
parent
0c115da251
commit
4a471b12d6
1 changed file with 10 additions and 5 deletions
@@ -103,7 +103,7 @@ struct slot_params {
     uint32_t seed = -1; // RNG seed
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
-    int32_t n_truncate = 0;
+    int32_t n_truncate = 0; // number of tokens after n_keep that will be discarded when the prompt is bigger than the context
     int32_t n_predict = -1; // new tokens to predict

     std::vector<std::string> antiprompt;
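
The new comment pins down what n_truncate means: when the prompt is longer than the context, the server keeps the first n_keep tokens and drops a fixed n_truncate tokens right after them (unlike n_discard, which defaults to half the overflow). A minimal standalone sketch of that arithmetic, assuming the prompt is a std::vector<llama_token>; the truncate_prompt helper is hypothetical and not part of this change:

    #include <cstdint>
    #include <vector>

    using llama_token = int32_t;

    // Hypothetical helper: keep the first n_keep tokens, skip the next
    // n_truncate tokens, keep the rest of the prompt.
    // Assumes n_keep + n_truncate <= prompt.size().
    static std::vector<llama_token> truncate_prompt(
            const std::vector<llama_token> & prompt,
            int32_t n_keep, int32_t n_truncate) {
        std::vector<llama_token> out(prompt.begin(), prompt.begin() + n_keep);
        out.insert(out.end(), prompt.begin() + n_keep + n_truncate, prompt.end());
        return out;
    }
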
@@ -2057,7 +2057,12 @@ struct server_context {
                     {"new_cache_size", new_cache_size},
                     {"cache_tokens", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cend())},
                 });
-            } // else somebody trying to use n_truncate w/o previous cache
+            } else {
+                LOG_ERROR("n_truncate needs to be used with cache_prompt", {
+                    {"id_slot", slot.id},
+                    {"id_task", slot.id_task},
+                });
+            }
         }

         GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
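
The replaced comment becomes a real error path: n_truncate is only meaningful against a previously cached prompt, so the server now logs an error instead of silently ignoring the parameter. A condensed, self-contained sketch of that guard (standalone types for illustration, not the server's own):

    #include <cstdint>
    #include <cstdio>

    struct params_sketch {
        bool    cache_prompt = false;
        int32_t n_truncate   = 0;
    };

    // Mirrors the new else branch: n_truncate without cache_prompt is rejected.
    static bool validate(const params_sketch & p) {
        if (p.n_truncate != 0 && !p.cache_prompt) {
            std::fprintf(stderr, "n_truncate needs to be used with cache_prompt\n");
            return false;
        }
        return true;
    }
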
@@ -2074,7 +2079,7 @@ struct server_context {
                 // reuse any previously computed tokens that are common with the new prompt
                 slot.n_past = common_part(slot.cache_tokens, prompt_tokens);

-                LOG_INFO("[cached_tokens, prompt_tokens]", {
+                LOG_INFO("[cache_tokens, prompt_tokens]", {
                     { "id_slot", slot.id },
                     { "id_task", slot.id_task },
                     { "common_part", slot.n_past}
@@ -2113,7 +2118,7 @@ struct server_context {
                 // shift KV cache if needed
                 const int n_keep = slot.params.n_keep + add_bos_token;
                 const int n_truncate = slot.params.n_truncate;
-                if (n_truncate && slot.params.cache_prompt) {
+                if (n_truncate && slot.params.cache_prompt && slot.truncated) {
                     llama_kv_cache_seq_rm(ctx, slot.id + 1, n_keep, n_keep + n_truncate);

                     LOG_INFO("kv cache rm", {
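
This is the change the commit message describes: the eviction is now also gated on slot.truncated, so a slot whose prompt never overflowed the context is not shifted at all. llama_kv_cache_seq_rm(ctx, seq, p0, p1) drops the cells of sequence seq in [p0, p1); a context shift then normally slides the remaining tail left so positions stay contiguous. A minimal sketch of that pairing, assuming the llama.cpp C API of this era (in some trees the second call is named llama_kv_cache_seq_shift rather than llama_kv_cache_seq_add):

    #include "llama.h"

    // Evict the n_truncate cells after n_keep, then move the tail
    // [n_keep + n_truncate, n_past) left by n_truncate positions.
    static void rm_and_shift(llama_context * ctx, llama_seq_id seq,
                             int n_keep, int n_truncate, int n_past) {
        llama_kv_cache_seq_rm (ctx, seq, n_keep, n_keep + n_truncate);
        llama_kv_cache_seq_add(ctx, seq, n_keep + n_truncate, n_past, -n_truncate);
    }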