sampling : fix repeat penalty out-of-bounds access
ggml-ci
parent 8a82f388cd
commit 2387dbea7d
2 changed files with 4 additions and 6 deletions
@@ -2323,10 +2323,10 @@ struct server_context {
                     slot.release();
                     slot.i_batch = -1;
                     continue; // continue loop of slots
-                } else {
-                    // prompt evaluated for next-token prediction
-                    slot.state = SLOT_STATE_GENERATING;
                 }
+
+                // prompt evaluated for next-token prediction
+                slot.state = SLOT_STATE_GENERATING;
             } else if (slot.state != SLOT_STATE_GENERATING) {
                 continue; // continue loop of slots
             }
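The first hunk appears to be a control-flow simplification rather than the bug fix itself: because the release branch ends with `continue`, the surrounding `else` is unnecessary and the state update can follow the branch at the same nesting level. A minimal sketch of that pattern, using simplified stand-in types rather than the server's real slot structures:

#include <cstdio>
#include <vector>

// Simplified stand-ins for the server's slot states and slot struct (illustrative only).
enum slot_state { SLOT_STATE_PROCESSING_PROMPT, SLOT_STATE_GENERATING, SLOT_STATE_IDLE };

struct slot_t {
    slot_state state   = SLOT_STATE_PROCESSING_PROMPT;
    bool       release = false; // pretend this slot failed and must be released
    int        i_batch = 0;
};

int main() {
    std::vector<slot_t> slots(3);
    slots[1].release = true;

    for (auto & slot : slots) {
        if (slot.release) {
            slot.state   = SLOT_STATE_IDLE;
            slot.i_batch = -1;
            continue; // continue loop of slots
        }

        // prompt evaluated for next-token prediction: no `else` is needed,
        // because the branch above always continues
        slot.state = SLOT_STATE_GENERATING;
    }

    for (size_t i = 0; i < slots.size(); ++i) {
        printf("slot %zu state %d\n", i, (int) slots[i].state);
    }

    return 0;
}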
@@ -2347,8 +2347,6 @@ struct server_context {
 
                 const auto * cur_p = gpt_sampler_get_candidates(slot.smpl);
 
-                // TODO: this logic might have been broken during https://github.com/ggerganov/llama.cpp/pull/8643
-                // fix if necessary
                 for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
                     result.probs.push_back({
                         cur_p->data[i].id,
@@ -1280,7 +1280,7 @@ static struct llama_sampler_i llama_sampler_penalties_i = {
     // Create a frequency map to count occurrences of each token in last_tokens
     // TODO: optimize this by maintaining the token count in the sampler context
     llama_token_cnt token_count;
-    for (int i = 0; i < ctx->penalty_last_n; ++i) {
+    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
         token_count[ctx->prev.rat(i)]++;
     }
 
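The last hunk carries the fix named in the commit message: the frequency-count loop previously ran `penalty_last_n` iterations even when `ctx->prev` held fewer tokens, so `ctx->prev.rat(i)` could read past what had been recorded; clamping the bound with `std::min` removes the out-of-bounds access. A minimal sketch of the same clamp, assuming a simplified stand-in ring buffer (the sampler's actual ring-buffer and `llama_token_cnt` types are not reproduced here):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <vector>

using llama_token = int32_t;

// Simplified stand-in for the sampler's ring buffer of previous tokens:
// rat(i) returns the i-th most recent token.
struct prev_tokens {
    std::vector<llama_token> data;

    void        push_back(llama_token t) { data.push_back(t); }
    size_t      size() const             { return data.size(); }
    llama_token rat(size_t i) const      { return data[data.size() - 1 - i]; }
};

int main() {
    prev_tokens prev;
    prev.push_back(3);
    prev.push_back(7);
    prev.push_back(3); // only 3 tokens recorded so far

    const int penalty_last_n = 64; // penalty window larger than what was recorded

    std::unordered_map<llama_token, int> token_count;

    // Before the fix the loop ran penalty_last_n times regardless of how many
    // tokens were actually stored, so rat(i) was called with i >= size().
    // Clamping the bound to the buffer size avoids the out-of-bounds read.
    for (int i = 0; i < std::min<int>(penalty_last_n, prev.size()); ++i) {
        token_count[prev.rat(i)]++;
    }

    for (const auto & [id, cnt] : token_count) {
        printf("token %d seen %d time(s)\n", (int) id, cnt);
    }

    return 0;
}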