From 5f33a675ca426c03adecad2f76099ee3217221d1 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Wed, 20 Mar 2024 22:48:19 -0400
Subject: [PATCH] perplexity : make hellaswag and multiple-choice outputs
 identical to master

Due to how the KV cache is updated, the logprobs for tokens in a batch
are very slightly affected by the other tokens present in the batch,
so to make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence needs to be evaluated
even though its output is not used at all.

This will probably be changed back in the future to make these
benchmarks a tiny bit faster.

* perplexity : fix division by zero when using less than 100
  multiple-choice tasks

---
 examples/perplexity/perplexity.cpp | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index d83777411..a65d9cd0b 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -832,9 +832,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
                 hs_cur.seq_tokens[0].size() - hs_cur.common_prefix +
                 hs_cur.seq_tokens[1].size() - hs_cur.common_prefix +
                 hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
-                hs_cur.seq_tokens[3].size() - hs_cur.common_prefix
-                // the last tokens don't need to be evaluated
-                - 4;
+                hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;
 
             //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, add_bos).size());
 
@@ -895,10 +893,12 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             n_logits += 1;
 
             for (int s = 0; s < 4; ++s) {
-                // end before the last token, no need to predict past the end of the sequences
-                for (size_t i = hs_cur.common_prefix; i < hs_cur.seq_tokens[s].size() - 1; ++i) {
-                    llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, true);
-                    n_logits += 1;
+                const size_t seq_tokens_size = hs_cur.seq_tokens[s].size();
+                // TODO: don't evaluate the last token of each sequence
+                for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) {
+                    const bool needs_logits = i < seq_tokens_size - 1;
+                    llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
+                    n_logits += needs_logits;
                 }
             }
 
@@ -1359,8 +1359,6 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos,
     for (auto& seq : task.seq_tokens) {
         task.required_tokens += seq.size() - task.common_prefix;
     }
-    // the last tokens don't need to be evaluated
-    task.required_tokens -= task.seq_tokens.size();
 
     return true;
 }
@@ -1474,7 +1472,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
            return;
        }
    } else {
-        int n_dot = n_task/100;
+        int n_dot = std::max((int) n_task/100, 1);
        int i_task = 0;
        for (auto& task : tasks) {
            ++i_task;
@@ -1549,10 +1547,12 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
             n_logits += 1;
 
             for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
-                // end before the last token, no need to predict past the end of the sequences
-                for (size_t i = cur_task.common_prefix; i < cur_task.seq_tokens[s].size() - 1; ++i) {
-                    llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, true);
-                    n_logits += 1;
+                const size_t seq_tokens_size = cur_task.seq_tokens[s].size();
+                // TODO: don't evaluate the last token of each sequence
+                for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) {
+                    const bool needs_logits = i < seq_tokens_size - 1;
+                    llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
+                    n_logits += needs_logits;
                 }
             }
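
Editor's note, a minimal standalone sketch (not part of the patch) of why the
std::max guard in the n_dot hunk is needed: when fewer than 100 multiple-choice
tasks are requested, n_task/100 truncates to 0, and a progress check of the form
i_task % n_dot then divides by zero. The loop below is a simplified stand-in for
the reporting loop in perplexity.cpp, not a verbatim copy of it.

    #include <algorithm>
    #include <cstdio>

    int main() {
        const int n_task = 42;  // fewer than 100 multiple-choice tasks
        // Before the fix, n_task/100 evaluates to 0 here, and any
        // "i_task % n_dot" progress check then divides by zero.
        const int n_dot = std::max((int) n_task/100, 1);
        for (int i_task = 1; i_task <= n_task; ++i_task) {
            // ... evaluate one task ...
            if (i_task % n_dot == 0) {  // safe: n_dot is at least 1
                printf(".");
                fflush(stdout);
            }
        }
        printf("\n");
        return 0;
    }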