diff --git a/common/common.cpp b/common/common.cpp index b0128c86d..0e4b8bab2 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -708,14 +708,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.winogrande_tasks = std::stoi(argv[i]); - } else if (arg == "--truthful-qa") { - params.truthful_qa = true; - } else if (arg == "--truthful-qa-tasks") { + } else if (arg == "--multiple-choice") { + params.multiple_choice = true; + } else if (arg == "--multiple-choice-tasks") { if (++i >= argc) { invalid_param = true; break; } - params.thruthful_qa_tasks = std::stoi(argv[i]); + params.multiple_choice_tasks = std::stoi(argv[i]); } else if (arg == "--ignore-eos") { params.ignore_eos = true; } else if (arg == "--no-penalize-nl") { @@ -915,6 +915,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); printf(" -f FNAME, --file FNAME\n"); printf(" prompt file to start generation.\n"); + printf(" -bf FNAME, --binary-file FNAME\n"); + printf(" binary file containing multiple choice tasks.\n"); printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); @@ -963,8 +965,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n"); printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks); - printf(" --truthful-qa compute 
TruthFullQA multiple choice score over random tasks from datafile supplied with -f\n"); - printf(" --truthful-qa-tasks N number of tasks to use when computing the TruthFullQA score (default: %zu)\n", params.winogrande_tasks); + printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n"); + printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.multiple_choice_tasks); printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); diff --git a/common/common.h b/common/common.h index 82c1e5014..c69ad7e94 100644 --- a/common/common.h +++ b/common/common.h @@ -108,8 +108,8 @@ struct gpt_params { bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed - bool truthful_qa = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt - size_t thruthful_qa_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed + bool multiple_choice = false; // compute multiple choice score over random tasks from datafile supplied in prompt + size_t multiple_choice_tasks = 0; // number of tasks to use when computing the multiple choice score.
If 0, all tasks will be computed bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS bool random_prompt = false; // do not randomize prompt if none provided diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index e32f86130..ed0305fb5 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1040,7 +1040,7 @@ static bool deserialize_string(std::istream& in, std::string& str) { return false; } -struct truthful_qa_answer { +struct multiple_choice_answers { std::vector answers; std::vector labels; bool deserialize(std::istream& in) { @@ -1057,10 +1057,10 @@ struct truthful_qa_answer { } }; -struct truthful_qa_task { - std::string question; - truthful_qa_answer mc1; - truthful_qa_answer mc2; +struct multiple_choice_task { + std::string question; // the question (or context that needs to be continued) + multiple_choice_answers mc1; // possible answers (continuations) with a single correct answer + multiple_choice_answers mc2; // possible answers (continuations) with multiple correct answers - not handled yet bool deserialize(std::istream& in) { if (!deserialize_string(in, question)) return false; return mc1.deserialize(in) && mc2.deserialize(in); @@ -1074,7 +1074,7 @@ struct truthful_qa_task { std::vector log_probs; }; -static bool truthful_qa_prepare_one_task(llama_context * ctx, bool add_bos, truthful_qa_task& task, bool log_error) { +static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) { if (task.question.empty() || task.mc1.answers.empty()) { if (log_error) { printf("%s: found bad task with empty question and/or answers\n", __func__); @@ -1117,13 +1117,23 @@ static bool truthful_qa_prepare_one_task(llama_context * ctx, bool add_bos, trut return true; } -static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { - // Calculates TruthFulQA score (multiple choice with single correct 
answer) from prompt - // - // Data extracted from https://huggingface.co/datasets/truthful_qa - // The validation dataset in the binary format that is being used can be found at - // https://huggingface.co/datasets/ikawrakow/validation-datasets-for-llama.cpp - // +// +// Calculates score for multiple choice tasks with single correct answer from prompt. +// Commonly used LLM evaluation metrics of this type are +// * ARC +// * HellaSwag +// * MMLU +// * TruthfulQA +// +// Validation datasets for these 4 tests can be found at +// https://huggingface.co/datasets/ikawrakow/validation-datasets-for-llama.cpp +// The data for these datasets was extracted from +// git@hf.co:datasets/allenai/ai2_arc +// https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl +// git@hf.co:datasets/Stevross/mmlu +// https://huggingface.co/datasets/truthful_qa +// +static void multiple_choice_score(llama_context * ctx, const gpt_params & params) { std::istringstream strstream(params.prompt); uint32_t n_task; @@ -1140,8 +1150,8 @@ static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { return; } - std::vector tasks; - if (params.thruthful_qa_tasks == 0 || params.thruthful_qa_tasks >= (size_t)n_task) { + std::vector tasks; + if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) { // Use all tasks tasks.resize(n_task); printf("%s: reading tasks", __func__); @@ -1158,12 +1168,12 @@ static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { printf("done\n"); } else { - printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.thruthful_qa_tasks, n_task); + printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task); std::mt19937 rng(1); std::vector aux(n_task); for (uint32_t i = 0; i < n_task; ++i) aux[i] = i; float scale = 1.f/(1.f + (float)std::mt19937::max()); - tasks.resize(params.thruthful_qa_tasks); + 
tasks.resize(params.multiple_choice_tasks); for (auto& task : tasks) { int j = (int)(scale * rng() * aux.size()); int idx = aux[j]; @@ -1175,7 +1185,7 @@ static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { return; } } - n_task = params.thruthful_qa_tasks; + n_task = params.multiple_choice_tasks; } // This is needed as usual for LLaMA models @@ -1200,7 +1210,7 @@ static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { } int last = std::min(first + k_chunk, num_tasks); for (int i = first; i < last; ++i) { - if (!truthful_qa_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local; + if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local; } } }; @@ -1222,7 +1232,7 @@ static void truthful_qa_score(llama_context * ctx, const gpt_params & params) { int i_task = 0; for (auto& task : tasks) { ++i_task; - if (!truthful_qa_prepare_one_task(ctx, add_bos, task, true)) { + if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) { return; } if (i_task%n_dot == 0) { @@ -1465,8 +1475,8 @@ int main(int argc, char ** argv) { hellaswag_score(ctx, params); } else if (params.winogrande) { winogrande_score(ctx, params); - } else if (params.truthful_qa) { - truthful_qa_score(ctx, params); + } else if (params.multiple_choice) { + multiple_choice_score(ctx, params); } else { results = perplexity(ctx, params); }