From 09db8bd59843c12fb0bfe060cf7a0f2ce7dc6f41 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Wed, 17 Jan 2024 21:40:52 +0200
Subject: [PATCH] winogrande: simple implementation

It doesn't look like it is working - why?
For Mistral-7B it is barely better than random chance
(score ~60% for 1267 tasks), while I see Mistral-7B scoring
78.4% on the HF leaderboard. The 1-sigma statistical
uncertainty for 1267 tasks is ~1.4%, so there is no way the
difference is due to statistics.
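(Quick sanity check on that uncertainty, assuming the score is a
binomial proportion over N independent tasks:
sigma = 100 * sqrt(p*(1-p)/N) = 100 * sqrt(0.6*0.4/1267) ~ 1.4
percentage points, so the ~18 point gap to the leaderboard is
roughly 13 sigma - far outside noise.)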
---
 common/common.cpp                  |  10 ++
 common/common.h                    |   3 +
 examples/perplexity/perplexity.cpp | 191 +++++++++++++++++++++++++++++
 3 files changed, 204 insertions(+)

diff --git a/common/common.cpp b/common/common.cpp
index 2b0865fff..ce20360a4 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -681,6 +681,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.hellaswag_tasks = std::stoi(argv[i]);
+        } else if (arg == "--winogrande") {
+            params.winogrande = true;
+        } else if (arg == "--winogrande-tasks") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.winogrande_tasks = std::stoi(argv[i]);
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--no-penalize-nl") {
@@ -926,6 +934,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --logits-all          return logits for all tokens in the batch (default: disabled)\n");
     printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
     printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
+    printf("  --winogrande          compute Winogrande score over random tasks from datafile supplied with -f\n");
+    printf("  --winogrande-tasks N  number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
     printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
     printf("  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
     printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
diff --git a/common/common.h b/common/common.h
index 1f43e6282..0ae9c18b3 100644
--- a/common/common.h
+++ b/common/common.h
@@ -105,6 +105,9 @@ struct gpt_params {
     bool   hellaswag        = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
     size_t hellaswag_tasks  = 400;   // number of tasks to use when computing the HellaSwag score
 
+    bool   winogrande       = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+    size_t winogrande_tasks = 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+
     bool   mul_mat_q        = true;  // if true, use mul_mat_q kernels instead of cuBLAS
     bool   random_prompt    = false; // do not randomize prompt if none provided
     bool   use_color        = false; // use color to distinguish generations and inputs
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index b4fedf803..d5280f2d0 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -9,6 +9,9 @@
 #include <thread>
 #include <mutex>
 #include <vector>
+#include <array>
+#include <random>
+#include <utility>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -676,6 +679,192 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     printf("\n");
 }
 
+struct winogrande_entry {
+    std::string first;
+    std::string second;
+    std::array<std::string, 2> choices;
+    int answer;
+};
+
+static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string& prompt) {
+    std::vector<winogrande_entry> result;
+    std::istringstream in(prompt);
+    std::string line;
+    std::array<int, 4> comma_pos;
+    while (true) {
+        std::getline(in, line);
+        if (in.fail() || in.eof()) break;
+        int ipos = 0;
+        bool quote_open = false;
+        for (int i = 0; i < int(line.size()); ++i) {
+            if (!quote_open) {
+                if (line[i] == ',') {
+                    comma_pos[ipos++] = i;
+                    if (ipos == 4) break;
+                }
+                else if (line[i] == '"') {
+                    quote_open = true;
+                }
+            }
+            else {
+                if (line[i] == '"') {
+                    quote_open = false;
+                }
+            }
+        }
+        if (ipos != 4) {
+            printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
+            continue;
+        }
+        auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
+                                                    : line.substr(comma_pos[0]+1, comma_pos[1] - comma_pos[0] - 1);
+        auto choice1 = line.substr(comma_pos[1]+1, comma_pos[2] - comma_pos[1] - 1);
+        auto choice2 = line.substr(comma_pos[2]+1, comma_pos[3] - comma_pos[2] - 1);
+        auto answer  = line.substr(comma_pos[3]+1, line.size() - comma_pos[3] - 1);
+        auto index   = line.substr(0, comma_pos[0]);
+        int where = 0;
+        for ( ; where < int(sentence.size()); ++where) {
+            if (sentence[where] == '_') break;
+        }
+        if (where == int(sentence.size())) {
+            printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
+            continue;
+        }
+        std::istringstream stream(answer.c_str());
+        int i_answer; stream >> i_answer;
+        if (stream.fail() || i_answer < 1 || i_answer > 2) {
+            printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
+            continue;
+        }
+        result.emplace_back();
+        auto& wg = result.back();
+        wg.first = sentence.substr(0, where);
+        wg.second = sentence.substr(where + 1, sentence.size() - where - 1);
+        wg.choices[0] = std::move(choice1);
+        wg.choices[1] = std::move(choice2);
+        wg.answer = i_answer;
+    }
+    return result;
+}
+
+static void winogrande_score(llama_context * ctx, const gpt_params & params) {
+
+    auto data = load_winogrande_from_csv(params.prompt);
+    if (data.empty()) {
+        fprintf(stderr, "%s: no tasks\n", __func__);
+        return;
+    }
+
+    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size());
+
+    if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
+        fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
+        std::mt19937 rng(1);
+        std::vector<int> aux(data.size());
+        for (int i = 0; i < int(data.size()); ++i) {
+            aux[i] = i;
+        }
+        float scale = 1/(1.f + (float)rng.max());
+        std::vector<winogrande_entry> selected;
+        selected.resize(params.winogrande_tasks);
+        for (int i = 0; i < int(params.winogrande_tasks); ++i) {
+            int j = int(scale*rng()*aux.size());
+            selected[i] = std::move(data[aux[j]]);
+            aux[j] = aux.back();
+            aux.pop_back();
+        }
+        data = std::move(selected);
+    }
+
+    const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
+    fprintf(stderr, "================================= is_spm = %d\n", is_spm);
+
+    // This is needed as usual for LLaMA models
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+
+    fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
+    //printf("\ntask\tacc_norm\n");
+
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_ctx   = llama_n_ctx(ctx);
+
+    std::vector<float> tok_logits(n_vocab);
+
+    int n_correct = 0;
+
+    for (size_t task_idx = 0; task_idx < data.size(); task_idx++) {
+        const auto& task = data[task_idx];
+
+        auto base_context = ::llama_tokenize(ctx, task.first, add_bos);
+        //auto base_ctx_1st = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos);
+        //auto base_ctx_2nd = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos);
+
+        auto query_1st = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, add_bos);
+        auto query_2nd = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, add_bos);
+
+        if (query_1st.size() > (size_t)n_ctx || query_2nd.size() > (size_t)n_ctx) {
+            fprintf(stderr, "%s : number of tokens in queries %zu, %zu > n_ctx\n", __func__, query_1st.size(), query_2nd.size());
+            return;
+        }
+
+        // Speed up small evaluations by evaluating at least 32 tokens
+        if (query_1st.size() < 32) query_1st.resize(32);
+        if (query_2nd.size() < 32) query_2nd.resize(32);
+
+        llama_kv_cache_clear(ctx);
+        auto logits_1st = hellaswag_evaluate_tokens(ctx, query_1st, 0, params.n_batch, n_vocab);
+
+        llama_kv_cache_clear(ctx);
+        auto logits_2nd = hellaswag_evaluate_tokens(ctx, query_2nd, 0, params.n_batch, n_vocab);
+
+        if (logits_1st.empty() || logits_2nd.empty()) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return;
+        }
+
+        float score_1st = 0;
+        //for (size_t j = base_ctx_1st.size()-1; j < query_1st.size()-1; ++j) {
+        //    std::memcpy(tok_logits.data(), logits_1st.data() + j*n_vocab, n_vocab*sizeof(float));
+        //    const float prob = softmax(tok_logits)[query_1st[j+1]];
+        //    score_1st += std::log(prob);
+        //}
+        //score_1st /= (query_1st.size() - base_ctx_1st.size());
+        for (size_t j = base_context.size(); j < query_1st.size()-1; ++j) {
+            std::memcpy(tok_logits.data(), logits_1st.data() + j*n_vocab, n_vocab*sizeof(float));
+            const float prob = softmax(tok_logits)[query_1st[j+1]];
+            score_1st += std::log(prob);
+        }
+        score_1st /= (query_1st.size() - base_context.size() - 1);
+
+        float score_2nd = 0;
+        //for (size_t j = base_ctx_2nd.size(); j < query_2nd.size()-1; ++j) {
+        //    std::memcpy(tok_logits.data(), logits_2nd.data() + j*n_vocab, n_vocab*sizeof(float));
+        //    const float prob = softmax(tok_logits)[query_2nd[j+1]];
+        //    score_2nd += std::log(prob);
+        //}
+        //score_2nd /= (query_2nd.size() - base_ctx_2nd.size());
+        for (size_t j = base_context.size(); j < query_2nd.size()-1; ++j) {
+            std::memcpy(tok_logits.data(), logits_2nd.data() + j*n_vocab, n_vocab*sizeof(float));
+            const float prob = softmax(tok_logits)[query_2nd[j+1]];
+            score_2nd += std::log(prob);
+        }
+        score_2nd /= (query_2nd.size() - base_context.size() - 1);
+
+        int result = score_1st > score_2nd ? 1 : 2;
+
+        if (result == task.answer) {
+            ++n_correct;
+        }
+
+        // Print the accumulated accuracy mean x 100
+        printf("%zu\t%.8lf\t%10.6f %10.6f %d %d\n", task_idx+1, 100.0 * n_correct/(task_idx+1), score_1st, score_2nd, result, task.answer);
+        fflush(stdout);
+    }
+
+    printf("\n");
+}
+
+
 int main(int argc, char ** argv) {
     gpt_params params;
 
@@ -733,6 +922,8 @@
     struct results_perplexity results;
     if (params.hellaswag) {
        hellaswag_score(ctx, params);
+    } else if (params.winogrande) {
+        winogrande_score(ctx, params);
     } else {
         results = perplexity(ctx, params);
     }
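Usage sketch for review (the model and dataset file names below are
illustrative, not part of the patch): build the perplexity example,
pass a Winogrande CSV via -f, and enable the new flags, e.g.

  ./perplexity -m mistral-7b-v0.1.Q8_0.gguf -f winogrande-debiased-eval.csv --winogrande --winogrande-tasks 100

load_winogrande_from_csv() expects each line of that CSV to look like

  index,"sentence with a _ placeholder",choice1,choice2,answer

where answer is 1 or 2.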