Merge branch 'master' into xsn/refactor_server_slot_input

This commit is contained in:
Xuan Son Nguyen 2024-10-23 23:43:42 +02:00
commit 3abc33962e

View file

@ -1955,11 +1955,6 @@ struct server_context {
// reuse any previously computed tokens that are common with the new prompt
slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
// push the prompt into the sampling context (do not apply grammar)
for (int i = 0; i < slot.n_past; ++i) {
common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
}
// reuse chunks from the cached prompt by shifting their KV cache in the new position
if (params.n_cache_reuse > 0) {
size_t head_c = slot.n_past; // cache
@ -1991,9 +1986,6 @@ struct server_context {
for (size_t i = 0; i < n_match; i++) {
slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false);
slot.n_past++;
}
@ -2074,6 +2066,11 @@ struct server_context {
GGML_ASSERT(batch.n_tokens > 0);
// Process all prompt tokens through sampler system
for (int i = 0; i < slot.n_prompt_tokens; ++i) {
common_sampler_accept(slot.smpl, slot.prompt_tokens[i], false);
}
// extract the logits only for the last token
batch.logits[batch.n_tokens - 1] = true;