perplexity : make hellaswag and multiple-choice outputs identical to master

Due to how the KV cache is updated, the logprobs of the tokens in a batch
are very slightly affected by the other tokens present in the batch.
To make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence therefore needs to be
evaluated, even though its output is not used at all.
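
In practice the batch then contains every token of every continuation,
exactly as on master, and only the logits flag differs: it is set only
for the tokens whose predictions are actually scored. A minimal sketch
of that pattern, using the llama_batch_add() helper from llama.cpp's
common code (the wrapper function below is hypothetical; the real code
inlines this loop per task):

#include <vector>
#include "llama.h"
#include "common.h"   // llama_batch_add()

// Hypothetical helper illustrating the batching pattern used by this commit.
static int add_choice_sequence(llama_batch & batch,
                               const std::vector<llama_token> & seq_tokens,
                               size_t common_prefix,
                               llama_seq_id seq_id) {
    int n_logits = 0;
    const size_t seq_tokens_size = seq_tokens.size();
    for (size_t i = common_prefix; i < seq_tokens_size; ++i) {
        // the last token is still evaluated so that the batch contents match
        // master, but its logits are never read, so they are not requested
        const bool needs_logits = i < seq_tokens_size - 1;
        llama_batch_add(batch, seq_tokens[i], i, { seq_id }, needs_logits);
        n_logits += needs_logits;
    }
    return n_logits;
}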

This will probably be changed back in the future to make these benchmarks
a tiny bit faster.

* perplexity : fix division by zero when using fewer than 100 multiple-choice tasks
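
With integer division, n_task/100 is 0 whenever there are fewer than
100 tasks, and the progress-dot printing (assumed here to use an
i_task % n_dot check, as suggested by the hunk below) then divides by
zero. A standalone sketch of the guard, with made-up surrounding code:

#include <algorithm>
#include <cstdio>

static void print_progress_dots(int n_task) {
    // was: int n_dot = n_task/100;  -- evaluates to 0 for n_task < 100
    const int n_dot = std::max(n_task/100, 1);
    for (int i_task = 1; i_task <= n_task; ++i_task) {
        if (i_task % n_dot == 0) {   // safe even when n_task < 100
            printf(".");
            fflush(stdout);
        }
    }
    printf("\n");
}
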
author Francis Couture-Harpin 2024-03-20 22:48:19 -04:00
parent 7d8d6b589f
commit 5f33a675ca

@@ -832,9 +832,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             hs_cur.seq_tokens[0].size() - hs_cur.common_prefix +
             hs_cur.seq_tokens[1].size() - hs_cur.common_prefix +
             hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
-            hs_cur.seq_tokens[3].size() - hs_cur.common_prefix
-            // the last tokens don't need to be evaluated
-            - 4;
+            hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;
         //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, add_bos).size());
@@ -895,10 +893,12 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         n_logits += 1;
         for (int s = 0; s < 4; ++s) {
-            // end before the last token, no need to predict past the end of the sequences
-            for (size_t i = hs_cur.common_prefix; i < hs_cur.seq_tokens[s].size() - 1; ++i) {
-                llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, true);
-                n_logits += 1;
+            const size_t seq_tokens_size = hs_cur.seq_tokens[s].size();
+            // TODO: don't evaluate the last token of each sequence
+            for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) {
+                const bool needs_logits = i < seq_tokens_size - 1;
+                llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
+                n_logits += needs_logits;
             }
         }
@@ -1359,8 +1359,6 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos,
     for (auto& seq : task.seq_tokens) {
         task.required_tokens += seq.size() - task.common_prefix;
     }
-    // the last tokens don't need to be evaluated
-    task.required_tokens -= task.seq_tokens.size();
     return true;
 }
@@ -1474,7 +1472,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
             return;
         }
     } else {
-        int n_dot = n_task/100;
+        int n_dot = std::max((int) n_task/100, 1);
         int i_task = 0;
         for (auto& task : tasks) {
             ++i_task;
@@ -1549,10 +1547,12 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         n_logits += 1;
         for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
-            // end before the last token, no need to predict past the end of the sequences
-            for (size_t i = cur_task.common_prefix; i < cur_task.seq_tokens[s].size() - 1; ++i) {
-                llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, true);
-                n_logits += 1;
+            const size_t seq_tokens_size = cur_task.seq_tokens[s].size();
+            // TODO: don't evaluate the last token of each sequence
+            for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) {
+                const bool needs_logits = i < seq_tokens_size - 1;
+                llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
+                n_logits += needs_logits;
             }
         }