perplexity : make hellaswag and multiple-choice outputs identical to master

Due to how the KV cache is updated, the logprobs of the tokens in a batch
are very slightly affected by the other tokens present in the batch.
To make hellaswag and multiple-choice return exactly the same results
as on master, the last token of each sequence therefore needs to be
evaluated, even though its output is not used at all.
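
In practice the batch then contains every token of every continuation,
exactly as on master, and only the logits flag differs: it is set only
for the tokens whose predictions are actually scored. A minimal sketch
of that pattern, using the llama_batch_add() helper from llama.cpp's
common code (the wrapper function below is hypothetical; the real code
inlines this loop per task):

#include <vector>
#include "llama.h"
#include "common.h"   // llama_batch_add()

// Hypothetical helper illustrating the batching pattern used by this commit.
static int add_choice_sequence(llama_batch & batch,
                               const std::vector<llama_token> & seq_tokens,
                               size_t common_prefix,
                               llama_seq_id seq_id) {
    int n_logits = 0;
    const size_t seq_tokens_size = seq_tokens.size();
    for (size_t i = common_prefix; i < seq_tokens_size; ++i) {
        // the last token is still evaluated so that the batch contents match
        // master, but its logits are never read, so they are not requested
        const bool needs_logits = i < seq_tokens_size - 1;
        llama_batch_add(batch, seq_tokens[i], i, { seq_id }, needs_logits);
        n_logits += needs_logits;
    }
    return n_logits;
}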

This will probably be changed back in the future to make these benchmarks
a tiny bit faster.

* perplexity : fix division by zero when using fewer than 100 multiple-choice tasks
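
With integer division, n_task/100 is 0 whenever there are fewer than
100 tasks, and the progress-dot printing (assumed here to use an
i_task % n_dot check, as suggested by the hunk below) then divides by
zero. A standalone sketch of the guard, with made-up surrounding code:

#include <algorithm>
#include <cstdio>

static void print_progress_dots(int n_task) {
    // was: int n_dot = n_task/100;  -- evaluates to 0 for n_task < 100
    const int n_dot = std::max(n_task/100, 1);
    for (int i_task = 1; i_task <= n_task; ++i_task) {
        if (i_task % n_dot == 0) {   // safe even when n_task < 100
            printf(".");
            fflush(stdout);
        }
    }
    printf("\n");
}
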
author Francis Couture-Harpin 2024-03-20 22:48:19 -04:00
parent 7d8d6b589f
commit 5f33a675ca

@@ -832,9 +832,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             hs_cur.seq_tokens[0].size() - hs_cur.common_prefix +
             hs_cur.seq_tokens[1].size() - hs_cur.common_prefix +
             hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
-            hs_cur.seq_tokens[3].size() - hs_cur.common_prefix
-            // the last tokens don't need to be evaluated
-            - 4;
+            hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;
         //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, add_bos).size());
@@ -895,10 +893,12 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         n_logits += 1;
         for (int s = 0; s < 4; ++s) {
-            // end before the last token, no need to predict past the end of the sequences
-            for (size_t i = hs_cur.common_prefix; i < hs_cur.seq_tokens[s].size() - 1; ++i) {
-                llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, true);
-                n_logits += 1;
+            const size_t seq_tokens_size = hs_cur.seq_tokens[s].size();
+            // TODO: don't evaluate the last token of each sequence
+            for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) {
+                const bool needs_logits = i < seq_tokens_size - 1;
+                llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
+                n_logits += needs_logits;
             }
         }
@@ -1359,8 +1359,6 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos,
     for (auto& seq : task.seq_tokens) {
         task.required_tokens += seq.size() - task.common_prefix;
     }
-    // the last tokens don't need to be evaluated
-    task.required_tokens -= task.seq_tokens.size();
     return true;
 }
@@ -1474,7 +1472,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
             return;
         }
     } else {
-        int n_dot = n_task/100;
+        int n_dot = std::max((int) n_task/100, 1);
         int i_task = 0;
         for (auto& task : tasks) {
             ++i_task;
@@ -1549,10 +1547,12 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         n_logits += 1;
         for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
-            // end before the last token, no need to predict past the end of the sequences
-            for (size_t i = cur_task.common_prefix; i < cur_task.seq_tokens[s].size() - 1; ++i) {
-                llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, true);
-                n_logits += 1;
+            const size_t seq_tokens_size = cur_task.seq_tokens[s].size();
+            // TODO: don't evaluate the last token of each sequence
+            for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) {
+                const bool needs_logits = i < seq_tokens_size - 1;
+                llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
+                n_logits += needs_logits;
             }
         }