diff --git a/common/common.cpp b/common/common.cpp index b0128c86d..0e4b8bab2 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -708,14 +708,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.winogrande_tasks = std::stoi(argv[i]); - } else if (arg == "--truthful-qa") { - params.truthful_qa = true; - } else if (arg == "--truthful-qa-tasks") { + } else if (arg == "--multiple-choice") { + params.multiple_choice = true; + } else if (arg == "--multiple-choice-tasks") { if (++i >= argc) { invalid_param = true; break; } - params.thruthful_qa_tasks = std::stoi(argv[i]); + params.multiple_choice_tasks = std::stoi(argv[i]); } else if (arg == "--ignore-eos") { params.ignore_eos = true; } else if (arg == "--no-penalize-nl") { @@ -915,6 +915,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); printf(" -f FNAME, --file FNAME\n"); printf(" prompt file to start generation.\n"); + printf(" -bf FNAME, --binary-file FNAME\n"); + printf(" binary file containing multiple choice tasks.\n"); printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); @@ -963,8 +965,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n"); printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks); - printf(" --truthful-qa compute 
TruthFullQA multiple choice score over random tasks from datafile supplied with -f\n"); - printf(" --truthful-qa-tasks N number of tasks to use when computing the TruthFullQA score (default: %zu)\n", params.winogrande_tasks); + printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n"); + printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.multiple_choice_tasks); printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); diff --git a/common/common.h b/common/common.h index 82c1e5014..c69ad7e94 100644 --- a/common/common.h +++ b/common/common.h @@ -108,8 +108,8 @@ struct gpt_params { bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed - bool truthful_qa = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt - size_t thruthful_qa_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed + bool multiple_choice = false; // compute multiple choice score over random tasks from datafile supplied in prompt + size_t multiple_choice_tasks = 0; // number of tasks to use when computing the multiple choice score.
If 0, all tasks will be computed bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS bool random_prompt = false; // do not randomize prompt if none provided diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index e32f86130..ed0305fb5 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1040,7 +1040,7 @@ static bool deserialize_string(std::istream& in, std::string& str) { return false; } -struct truthful_qa_answer { +struct multiple_choice_answers { std::vector answers; std::vector labels; bool deserialize(std::istream& in) { @@ -1057,10 +1057,10 @@ struct truthful_qa_answer { } }; -struct truthful_qa_task { - std::string question; - truthful_qa_answer mc1; - truthful_qa_answer mc2; +struct multiple_choice_task { + std::string question; // the question (or context that needs to be continued) + multiple_choice_answers mc1; // possible answers (continuations) with a single correct answer + multiple_choice_answers mc2; // possible answers (continuations) with multiple correct answers - not handled yet bool deserialize(std::istream& in) { if (!deserialize_string(in, question)) return false; return mc1.deserialize(in) && mc2.deserialize(in); @@ -1074,7 +1074,7 @@ struct truthful_qa_task { std::vector log_probs; }; -static bool truthful_qa_prepare_one_task(llama_context * ctx, bool add_bos, truthful_qa_task& task, bool log_error) { +static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) { if (task.question.empty() || task.mc1.answers.empty()) { if (log_error) { printf("%s: found bad task with empty question and/or answers\n", __func__); @@ -1117,13 +1117,23 @@ static bool truthful_qa_prepare_one_task(llama_context * ctx, bool add_bos, trut return true; } -static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { - // Calculates TruthFulQA score (multiple choice with single correct 
answer) from prompt - // - // Data extracted from https://huggingface.co/datasets/truthful_qa - // The validation dataset in the binary format that is being used can be found at - // https://huggingface.co/datasets/ikawrakow/validation-datasets-for-llama.cpp - // +// +// Calculates score for multiple choice tasks with single correct answer from prompt. +// Commonly used LLM evaluation metrics of this type are +// * ARC +// * HellaSwag +// * MMLU +// * TruthfulQA +// +// Validation datasets for these 4 tests can be found at +// https://huggingface.co/datasets/ikawrakow/validation-datasets-for-llama.cpp +// The data for these datasets was extracted from +// git@hf.co:datasets/allenai/ai2_arc +// https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl +// git@hf.co:datasets/Stevross/mmlu +// https://huggingface.co/datasets/truthful_qa +// +static void multiple_choice_score(llama_context * ctx, const gpt_params & params) { std::istringstream strstream(params.prompt); uint32_t n_task; @@ -1140,8 +1150,8 @@ static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { return; } - std::vector tasks; - if (params.thruthful_qa_tasks == 0 || params.thruthful_qa_tasks >= (size_t)n_task) { + std::vector tasks; + if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) { // Use all tasks tasks.resize(n_task); printf("%s: reading tasks", __func__); @@ -1158,12 +1168,12 @@ static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { printf("done\n"); } else { - printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.thruthful_qa_tasks, n_task); + printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task); std::mt19937 rng(1); std::vector aux(n_task); for (uint32_t i = 0; i < n_task; ++i) aux[i] = i; float scale = 1.f/(1.f + (float)std::mt19937::max()); - tasks.resize(params.thruthful_qa_tasks); + 
tasks.resize(params.multiple_choice_tasks); for (auto& task : tasks) { int j = (int)(scale * rng() * aux.size()); int idx = aux[j]; @@ -1175,7 +1185,7 @@ static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { return; } } - n_task = params.thruthful_qa_tasks; + n_task = params.multiple_choice_tasks; } // This is needed as usual for LLaMA models @@ -1200,7 +1210,7 @@ static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { } int last = std::min(first + k_chunk, num_tasks); for (int i = first; i < last; ++i) { - if (!truthful_qa_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local; + if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local; } } }; @@ -1222,7 +1232,7 @@ static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { int i_task = 0; for (auto& task : tasks) { ++i_task; - if (!truthful_qa_prepare_one_task(ctx, add_bos, task, true)) { + if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) { return; } if (i_task%n_dot == 0) { @@ -1465,8 +1475,8 @@ int main(int argc, char ** argv) { hellaswag_score(ctx, params); } else if (params.winogrande) { winogrande_score(ctx, params); - } else if (params.truthful_qa) { - truthful_qa_score(ctx, params); + } else if (params.multiple_choice) { + multiple_choice_score(ctx, params); } else { results = perplexity(ctx, params); }