Merge branch 'master' into xsn/refactor_server_slot_input

2024-10-23 23:43:42 +02:00 · 2024-10-23 23:43:42 +02:00 · 3abc33962e
commit 3abc33962e
parent 5c749bea00 0a1c750c80
1 changed file with 5 additions and 8 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -1955,11 +1955,6 @@ struct server_context {
                            // reuse any previously computed tokens that are common with the new prompt
                            slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);

-                            // push the prompt into the sampling context (do not apply grammar)
-                            for (int i = 0; i < slot.n_past; ++i) {
-                                common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
-                            }
-
                            // reuse chunks from the cached prompt by shifting their KV cache in the new position
                            if (params.n_cache_reuse > 0) {
                                size_t head_c = slot.n_past; // cache
@ -1991,9 +1986,6 @@ struct server_context {

                                        for (size_t i = 0; i < n_match; i++) {
                                            slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
-
-                                            common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false);
-
                                            slot.n_past++;
                                        }

@ -2074,6 +2066,11 @@ struct server_context {

                    GGML_ASSERT(batch.n_tokens > 0);

+                    // push the entire prompt into the sampling context (do not apply grammar)
+                    for (int i = 0; i < slot.n_prompt_tokens; ++i) {
+                        common_sampler_accept(slot.smpl, slot.prompt_tokens[i], false);
+                    }
+
                    // extract the logits only for the last token
                    batch.logits[batch.n_tokens - 1] = true;