From 0794b777148e3ab2c76b87c97d784df340f7bda2 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 25 May 2024 04:32:22 +0200
Subject: [PATCH] Move 'add_special_bos/eos' logic to llm_tokenizer_bpe

---
 llama.cpp | 58 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 14 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 1ffe4d9af..046449130 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12274,8 +12274,11 @@ struct llm_bigram_bpe {
 struct llm_tokenizer_bpe {
     llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
         GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
+        special_add_bos = vocab.special_add_bos == 1;
+        special_add_eos = vocab.special_add_eos == 1;
         switch (vocab.type_pre) {
             case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                special_add_bos = true;
                 ignore_merges = true;
                 regex_exprs = {
                     // original regex from tokenizer.json
@@ -12362,6 +12365,39 @@ struct llm_tokenizer_bpe {
         }
     }
 
+    bool add_special_bos(std::vector<llama_vocab::id> & output) const {
+        if (special_add_bos) {
+            GGML_ASSERT(vocab.special_bos_id != -1);
+            output.push_back(vocab.special_bos_id);
+            return true;
+        }
+        return false;
+    }
+
+    bool add_special_eos(std::vector<llama_vocab::id> & output) const {
+        if (special_add_eos) {
+            GGML_ASSERT(vocab.special_eos_id != -1);
+            output.push_back(vocab.special_eos_id);
+            return true;
+        }
+        return false;
+    }
+
+    void check_add_special(const std::vector<llama_vocab::id> & output) const {
+        if (special_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+            LLAMA_LOG_WARN(
+                "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                "Are you sure this is what you want?\n", __FUNCTION__);
+        }
+        if (special_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) {
+            LLAMA_LOG_WARN(
+                "%s: Added a EOS token to the prompt as specified by the model but the prompt "
+                "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
+                "Are you sure this is what you want?\n", __FUNCTION__);
+        }
+    }
+
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
 
@@ -12499,8 +12535,10 @@ private:
 
     const llama_vocab & vocab;
 
-    bool ignore_merges = false;
     std::vector<std::string> regex_exprs;
+    bool special_add_bos = false;
+    bool special_add_eos = false;
+    bool ignore_merges = false;
 
     std::vector<llm_symbol> symbols;
     std::vector<llm_symbol> symbols_final;
@@ -12851,9 +12889,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
             {
                 llm_tokenizer_bpe tokenizer(vocab);
 
-                if (add_special && vocab.special_add_bos != 0) {
-                    GGML_ASSERT(vocab.special_bos_id != -1);
-                    output.push_back(vocab.special_bos_id);
+                if (add_special) {
+                    tokenizer.add_special_bos(output);
                 }
 
                 for (const auto & fragment : fragment_buffer) {
@@ -12869,16 +12906,9 @@
                     }
                 }
 
-                if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
-                    LLAMA_LOG_WARN(
-                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
-                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
-                        "Are you sure this is what you want?\n", __FUNCTION__);
-                }
-
-                if (add_special && vocab.special_add_eos == 1) {
-                    GGML_ASSERT(vocab.special_add_eos != -1);
-                    output.push_back(vocab.special_eos_id);
+                if (add_special) {
+                    tokenizer.add_special_eos(output);
+                    tokenizer.check_add_special(output);
                 }
             } break;
         case LLAMA_VOCAB_TYPE_WPM:
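
Review note: for context, below is a minimal standalone sketch of the pattern this patch establishes: the BPE tokenizer object resolves the BOS/EOS policy once in its constructor, and llama_tokenize_internal only asks it to apply and sanity-check that policy. MiniVocab and MiniTokenizerBPE are simplified stand-ins invented for illustration, not llama.cpp's real types; the real patch additionally forces special_add_bos = true for LLAMA_VOCAB_PRE_TYPE_LLAMA3.

#include <cstdio>
#include <vector>

// Illustrative stand-in for the relevant llama_vocab fields (not the real struct).
struct MiniVocab {
    int special_bos_id  = 1;
    int special_eos_id  = 2;
    int special_add_bos = 1; // model metadata: 1 = prepend BOS when tokenizing
    int special_add_eos = 0;
};

// Illustrative stand-in for llm_tokenizer_bpe; the helpers mirror the patch.
struct MiniTokenizerBPE {
    explicit MiniTokenizerBPE(const MiniVocab & vocab) : vocab(vocab) {
        // Policy is resolved once here instead of at every call site.
        special_add_bos = vocab.special_add_bos == 1;
        special_add_eos = vocab.special_add_eos == 1;
    }

    bool add_special_bos(std::vector<int> & output) const {
        if (special_add_bos) {
            output.push_back(vocab.special_bos_id);
            return true;
        }
        return false;
    }

    bool add_special_eos(std::vector<int> & output) const {
        if (special_add_eos) {
            output.push_back(vocab.special_eos_id);
            return true;
        }
        return false;
    }

    // Warn (here: print) when the prompt itself already began/ended with
    // BOS/EOS, so the caller has ended up with a doubled special token.
    void check_add_special(const std::vector<int> & output) const {
        if (special_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
            std::puts("warning: prompt now starts with 2 BOS tokens");
        }
        if (special_add_eos && output.size() >= 2 && *(output.end() - 2) == vocab.special_eos_id) {
            std::puts("warning: prompt now ends with 2 EOS tokens");
        }
    }

    const MiniVocab & vocab;
    bool special_add_bos = false;
    bool special_add_eos = false;
};

int main() {
    MiniVocab vocab;
    MiniTokenizerBPE tokenizer(vocab);

    std::vector<int> output;
    tokenizer.add_special_bos(output);      // caller no longer reads vocab.special_add_bos
    output.push_back(vocab.special_bos_id); // prompt text itself began with BOS
    output.push_back(42);                   // an ordinary token
    tokenizer.add_special_eos(output);      // no-op here: special_add_eos == 0
    tokenizer.check_add_special(output);    // prints the double-BOS warning
}

One consequence of this layout worth noting in review: callers of the tokenizer can no longer get the BOS/EOS bookkeeping wrong independently per call site, since the three helpers and the constructor are the only places that consult the policy flags.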