From 0652b4209fd05424bee48d0cb25f2ec5d9715719 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 3 May 2023 20:25:55 +0300
Subject: [PATCH] llama : require first token to be BOS

---
 examples/common.cpp                |  4 ++--
 examples/main/main.cpp             |  4 ++--
 examples/perplexity/perplexity.cpp | 34 +++++++++++++++++++-----------
 llama.cpp                          | 12 ++++++++++-
 4 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 1a2f4743a..5ee574004 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -424,8 +424,8 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
 // TODO: not great allocating this every time
 std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
     // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int)add_bos);
-    int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
+    std::vector<llama_token> res(text.size() + (int) add_bos);
+    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
     assert(n >= 0);
     res.resize(n);
 
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 125c189a3..79a69a9ed 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -309,7 +309,8 @@ int main(int argc, char ** argv) {
             if (n_past + (int) embd.size() > n_ctx) {
                 const int n_left = n_past - params.n_keep;
 
-                n_past = params.n_keep;
+                // always keep the first token - BOS
+                n_past = std::max(1, params.n_keep);
 
                 // insert n_left/2 tokens at the start of embd from last_n_tokens
                 embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
@@ -327,7 +328,6 @@ int main(int argc, char ** argv) {
             }
 
             // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
-            // REVIEW
             if (n_session_consumed < (int) session_tokens.size()) {
                 size_t i = 0;
                 for ( ; i < embd.size(); i++) {
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 299a19999..90dbfb6e0 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -32,36 +32,46 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
     int n_vocab = llama_n_vocab(ctx);
     double nll = 0.0;
 
-    fprintf(stderr, "%s : calculating perplexity over %d chunks, batch_size=%d\n", __func__, seq_count, params.n_batch);
+    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, seq_count, params.n_batch);
 
     for (int i = 0; i < seq_count; ++i) {
-        int start = i * params.n_ctx;
-        int end = start + params.n_ctx;
+        const int start = i * params.n_ctx;
+        const int end   = start + params.n_ctx;
 
         std::vector<float> logits;
-        int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch;
-        auto start_t = std::chrono::high_resolution_clock::now();
+
+        const int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch;
+
+        const auto start_t = std::chrono::high_resolution_clock::now();
+
         for (int j = 0; j < num_batches; ++j) {
-            int batch_start = start + j * params.n_batch;
-            int batch_size = std::min(end - batch_start, params.n_batch);
+            const int batch_start = start + j * params.n_batch;
+            const int batch_size  = std::min(end - batch_start, params.n_batch);
+
+            // TODO: not perfect since this can be in the middle of a word, but it is better than nothing
+            tokens[batch_start] = llama_token_bos();
+
             if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads)) {
                 fprintf(stderr, "%s : failed to eval\n", __func__);
                 return;
             }
-            auto batch_logits = llama_get_logits(ctx);
+
+            const auto batch_logits = llama_get_logits(ctx);
             logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
         }
-        auto end_t = std::chrono::high_resolution_clock::now();
+
+        const auto end_t = std::chrono::high_resolution_clock::now();
+
         if (i == 0) {
             const float seconds = std::chrono::duration<float>(end_t - start_t).count();
-            printf("%.2f seconds per pass - ETA ", seconds);
+            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, seconds);
             int total_seconds = (int)(seconds * seq_count);
             if (total_seconds >= 60*60) {
-                printf("%d hours ", total_seconds / (60*60));
+                fprintf(stderr, "%d hours ", total_seconds / (60*60));
                 total_seconds = total_seconds % (60*60);
             }
-            printf("%d minutes\n", total_seconds / 60);
+            fprintf(stderr, "%d minutes\n", total_seconds / 60);
         }
+
         // We get the logits for all the tokens in the context window (params.n_ctx)
         // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
         // calculate the perplexity over the last half the window (so the model always has
diff --git a/llama.cpp b/llama.cpp
index 85af4dc49..cbb493647 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1052,6 +1052,13 @@ static bool llama_eval_internal(
             const int   n_tokens,
             const int   n_past,
             const int   n_threads) {
+
+    // enforce that the first token is BOS
+    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+        fprintf(stderr, "%s: first token must be BOS\n", __func__);
+        return false;
+    }
+
     const int64_t t_start_us = ggml_time_us();
 
     const int N = n_tokens;
@@ -1482,7 +1489,7 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
     }
 
     if (bos) {
-        output.push_back(1);
+        output.push_back(llama_token_bos());
     }
 
     tokenizer.tokenize(text, output);
@@ -2727,11 +2734,14 @@ int llama_eval(
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
+
     // get a more accurate load time, upon first eval
+    // TODO: fix this
     if (!ctx->has_evaluated_once) {
         ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
         ctx->has_evaluated_once = true;
    }
+
    return 0;
 }
 
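
Note (not part of the patch): after this change, llama_eval() fails whenever n_past == 0 and the first token is not BOS. The sketch below shows one way a caller can satisfy the new requirement, using only APIs that already appear in this patch (::llama_tokenize() from examples/common.cpp with add_bos = true, llama_token_bos(), llama_eval()); the eval_prompt() helper name is illustrative and not part of the codebase.

    // caller-side sketch (not part of the patch); eval_prompt() is a hypothetical helper
    // ::llama_tokenize() with add_bos = true prepends llama_token_bos(), so the new
    // check in llama_eval_internal() (n_past == 0 => tokens[0] must be BOS) passes
    #include "common.h"
    #include "llama.h"

    #include <cstdio>
    #include <string>
    #include <vector>

    bool eval_prompt(struct llama_context * ctx, const std::string & prompt, int n_threads) {
        std::vector<llama_token> tokens = ::llama_tokenize(ctx, prompt, /*add_bos=*/true);

        // evaluate from the start of the context (n_past == 0), one batch for simplicity
        if (llama_eval(ctx, tokens.data(), (int) tokens.size(), 0, n_threads)) {
            fprintf(stderr, "%s: failed to eval\n", __func__);
            return false;
        }

        return true;
    }

Tokenizing with add_bos = true covers the common case; the perplexity example cannot rely on that alone because each batch starts in the middle of the text, which is why the patch overwrites tokens[batch_start] with llama_token_bos() before every llama_eval() call.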