From 2387dbea7d3417218faf7507eb9ff4eece396717 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 7 Sep 2024 14:50:43 +0300
Subject: [PATCH] sampling : fix repeat penalty out-of-bounds access

The penalties sampler iterated over the last penalty_last_n tokens even when
fewer tokens had been pushed into the prev ring buffer, reading past the end
of the buffer. Clamp the loop to the number of tokens actually stored.

Also simplify the slot state transition in the server example and remove the
TODO note above the n_probs loop.

ggml-ci
---
 examples/server/server.cpp | 8 +++-----
 src/llama-sampling.cpp     | 2 +-
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 1095f43b2..f45b59983 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2323,10 +2323,10 @@ struct server_context {
                     slot.release();
                     slot.i_batch = -1;
                     continue; // continue loop of slots
-                } else {
-                    // prompt evaluated for next-token prediction
-                    slot.state = SLOT_STATE_GENERATING;
                 }
+
+                // prompt evaluated for next-token prediction
+                slot.state = SLOT_STATE_GENERATING;
             } else if (slot.state != SLOT_STATE_GENERATING) {
                 continue; // continue loop of slots
             }
@@ -2347,8 +2347,6 @@ struct server_context {
 
             const auto * cur_p = gpt_sampler_get_candidates(slot.smpl);
 
-            // TODO: this logic might have been broken during https://github.com/ggerganov/llama.cpp/pull/8643
-            // fix if necessary
             for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
                 result.probs.push_back({
                     cur_p->data[i].id,
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 02b93b64c..61f4cbb92 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1280,7 +1280,7 @@ static struct llama_sampler_i llama_sampler_penalties_i = {
     // Create a frequency map to count occurrences of each token in last_tokens
     // TODO: optimize this by maintaining the token count in the sampler context
     llama_token_cnt token_count;
-    for (int i = 0; i < ctx->penalty_last_n; ++i) {
+    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, (int) ctx->prev.size()); ++i) {
         token_count[ctx->prev.rat(i)]++;
     }
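
For context, a minimal standalone sketch of the out-of-bounds scenario that the
llama-sampling.cpp hunk guards against. It is illustrative only: a plain
std::vector and a local rat() lambda stand in for llama.cpp's ring_buffer and
its reverse-index accessor, and the token values are made up; only the std::min
clamp mirrors the actual change.

    // sketch: repeat-penalty counting with fewer stored tokens than the window
    #include <algorithm>
    #include <cstdio>
    #include <unordered_map>
    #include <vector>

    using llama_token = int;

    int main() {
        // Only three tokens have been sampled so far...
        std::vector<llama_token> prev = { 10, 20, 20 };
        // ...but the configured penalty window is much larger.
        int penalty_last_n = 64;

        // "i-th most recent token" -- stands in for ring_buffer::rat(i).
        auto rat = [&](int i) { return prev[prev.size() - 1 - i]; };

        std::unordered_map<llama_token, int> token_count;

        // Without the clamp, i would run up to 63 and index far past the three
        // stored tokens -- the out-of-bounds read the patch fixes.
        for (int i = 0; i < std::min<int>(penalty_last_n, (int) prev.size()); ++i) {
            token_count[rat(i)]++;
        }

        for (const auto & kv : token_count) {
            std::printf("token %d repeated %d time(s)\n", kv.first, kv.second);
        }

        return 0;
    }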