diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 046220fe4..8825149c8 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -540,14 +540,14 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { // This is needed as usual for LLaMA models const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + // The tasks should be randomized so the score stabilizes quickly. + bool randomize_tasks = true; + // Number of tasks to use when computing the score if (params.hellaswag_tasks < hs_task_count) { hs_task_count = params.hellaswag_tasks; } - // The tasks should be randomized so the score stabilizes quickly. - bool randomize_tasks = true; - // The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now std::mt19937 rng(1); @@ -1079,6 +1079,8 @@ static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { // Calculates TruthFulQA score (multiple choice with single correct answer) from prompt // // Data extracted from https://huggingface.co/datasets/truthful_qa + // The validation dataset in the binary format that is being used can be found at + // https://huggingface.co/datasets/ikawrakow/validation-datasets-for-llama.cpp // std::istringstream strstream(params.prompt); @@ -1207,6 +1209,7 @@ static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { int n_done = 0; int n_correct = 0; + int n_tot_answers = 0; for (size_t i0 = 0; i0 < tasks.size(); i0++) { int n_cur = 0; @@ -1246,6 +1249,8 @@ static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { } } + s0 += num_answers; + cur_task.i_batch = i_batch; i_batch += cur_task.required_tokens; @@ -1289,6 +1294,13 @@ static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { // compute the logprobs for each ending of the decoded tasks for (size_t i = i0; i < i1; ++i) { auto & cur_task = tasks[i]; + //printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str()); + //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) { + // if (cur_task.mc1.labels[j] == 1) { + // printf("%d", j+1); + // } + //} + //printf("\n common_prefix: %zu\n", cur_task.common_prefix); std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(cur_task.i_batch + cur_task.common_prefix - 1), n_vocab*sizeof(float)); @@ -1298,11 +1310,29 @@ static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) { size_t count = 1; float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]); + //printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob); + //for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) { + // printf(" %zu %g\n", ir, eval_results[ir]); + // ++count; + // log_prob += eval_results[ir++]; + //} + //size_t count = 0; + //float log_prob = 0; + //printf(" <%s>\n", cur_task.mc1.answers[s].c_str()); + //float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]); + //printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob); for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) { + //printf(" %zu %g\n", ir, eval_results[ir]); ++count; log_prob += eval_results[ir++]; } + //if (!count) { + // ++count; + // log_prob += std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]); + //} cur_task.log_probs[s] = log_prob / count; + //printf(" Final: %g\n", log_prob / count); + //printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count); } // Find the ending with maximum logprob @@ -1315,13 +1345,14 @@ static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { } } + n_tot_answers += cur_task.log_probs.size(); if (cur_task.mc1.labels[logprob_max_idx] == 1) { ++n_correct; } ++n_done; // Print the accumulated accuracy mean x 100 - printf("%zu\t%.8lf\n", i + 1, 100.*n_correct/n_done); + printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done); fflush(stdout); } @@ -1330,6 +1361,15 @@ static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { llama_batch_free(batch); + if (n_done < 100) return; + + float p = 1.f*n_correct/n_done; + float sigma = sqrt(p*(1-p)/(n_done-1)); + printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); + p = 1.f*n_done/n_tot_answers; + sigma = sqrt(p*(1-p)/(n_done-1)); + printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); + printf("\n"); }