From 0794b777148e3ab2c76b87c97d784df340f7bda2 Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 25 May 2024 04:32:22 +0200
Subject: [PATCH] Move 'add_special_bos/eos' logic to llm_tokenizer_bpe

---
 llama.cpp | 58 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 14 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 1ffe4d9af..046449130 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12274,8 +12274,11 @@ struct llm_bigram_bpe {
 struct llm_tokenizer_bpe {
     llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
         GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
+        special_add_bos = vocab.special_add_bos == 1;
+        special_add_eos = vocab.special_add_eos == 1;
         switch (vocab.type_pre) {
             case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                special_add_bos = true;
                 ignore_merges = true;
                 regex_exprs = {
                     // original regex from tokenizer.json
@@ -12362,6 +12365,39 @@ struct llm_tokenizer_bpe {
         }
     }
 
+    bool add_special_bos(std::vector<llama_vocab::id> & output) const {
+        if (special_add_bos) {
+            GGML_ASSERT(vocab.special_bos_id != -1);
+            output.push_back(vocab.special_bos_id);
+            return true;
+        }
+        return false;
+    }
+
+    bool add_special_eos(std::vector<llama_vocab::id> & output) const {
+        if (special_add_eos) {
+            GGML_ASSERT(vocab.special_eos_id != -1);
+            output.push_back(vocab.special_eos_id);
+            return true;
+        }
+        return false;
+    }
+
+    void check_add_special(const std::vector<llama_vocab::id> & output) const {
+        if (special_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+            LLAMA_LOG_WARN(
+                "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                "Are you sure this is what you want?\n", __FUNCTION__);
+        }
+        if (special_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) {
+            LLAMA_LOG_WARN(
+                "%s: Added a EOS token to the prompt as specified by the model but the prompt "
+                "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
+                "Are you sure this is what you want?\n", __FUNCTION__);
+        }
+    }
+
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
 
@@ -12499,8 +12535,10 @@ private:
 
     const llama_vocab & vocab;
 
-    bool ignore_merges = false;
     std::vector<std::string> regex_exprs;
+    bool special_add_bos = false;
+    bool special_add_eos = false;
+    bool ignore_merges = false;
 
     std::vector<llm_symbol> symbols;
     std::vector<llm_symbol> symbols_final;
@@ -12851,9 +12889,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
             {
                 llm_tokenizer_bpe tokenizer(vocab);
 
-                if (add_special && vocab.special_add_bos != 0) {
-                    GGML_ASSERT(vocab.special_bos_id != -1);
-                    output.push_back(vocab.special_bos_id);
+                if (add_special) {
+                    tokenizer.add_special_bos(output);
                 }
 
                 for (const auto & fragment : fragment_buffer) {
@@ -12869,16 +12906,9 @@
                     }
                 }
 
-                if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
-                    LLAMA_LOG_WARN(
-                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
-                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
-                        "Are you sure this is what you want?\n", __FUNCTION__);
-                }
-
-                if (add_special && vocab.special_add_eos == 1) {
-                    GGML_ASSERT(vocab.special_add_eos != -1);
-                    output.push_back(vocab.special_eos_id);
+                if (add_special) {
+                    tokenizer.add_special_eos(output);
+                    tokenizer.check_add_special(output);
                 }
             } break;
         case LLAMA_VOCAB_TYPE_WPM:
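
Review note: for context, below is a minimal standalone sketch of the pattern this patch establishes: the BPE tokenizer object resolves the BOS/EOS policy once in its constructor, and llama_tokenize_internal only asks it to apply and sanity-check that policy. MiniVocab and MiniTokenizerBPE are simplified stand-ins invented for illustration, not llama.cpp's real types; the real patch additionally forces special_add_bos = true for LLAMA_VOCAB_PRE_TYPE_LLAMA3.

#include <cstdio>
#include <vector>

// Illustrative stand-in for the relevant llama_vocab fields (not the real struct).
struct MiniVocab {
    int special_bos_id  = 1;
    int special_eos_id  = 2;
    int special_add_bos = 1; // model metadata: 1 = prepend BOS when tokenizing
    int special_add_eos = 0;
};

// Illustrative stand-in for llm_tokenizer_bpe; the helpers mirror the patch.
struct MiniTokenizerBPE {
    explicit MiniTokenizerBPE(const MiniVocab & vocab) : vocab(vocab) {
        // Policy is resolved once here instead of at every call site.
        special_add_bos = vocab.special_add_bos == 1;
        special_add_eos = vocab.special_add_eos == 1;
    }

    bool add_special_bos(std::vector<int> & output) const {
        if (special_add_bos) {
            output.push_back(vocab.special_bos_id);
            return true;
        }
        return false;
    }

    bool add_special_eos(std::vector<int> & output) const {
        if (special_add_eos) {
            output.push_back(vocab.special_eos_id);
            return true;
        }
        return false;
    }

    // Warn (here: print) when the prompt itself already began/ended with
    // BOS/EOS, so the caller has ended up with a doubled special token.
    void check_add_special(const std::vector<int> & output) const {
        if (special_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
            std::puts("warning: prompt now starts with 2 BOS tokens");
        }
        if (special_add_eos && output.size() >= 2 && *(output.end() - 2) == vocab.special_eos_id) {
            std::puts("warning: prompt now ends with 2 EOS tokens");
        }
    }

    const MiniVocab & vocab;
    bool special_add_bos = false;
    bool special_add_eos = false;
};

int main() {
    MiniVocab vocab;
    MiniTokenizerBPE tokenizer(vocab);

    std::vector<int> output;
    tokenizer.add_special_bos(output);      // caller no longer reads vocab.special_add_bos
    output.push_back(vocab.special_bos_id); // prompt text itself began with BOS
    output.push_back(42);                   // an ordinary token
    tokenizer.add_special_eos(output);      // no-op here: special_add_eos == 0
    tokenizer.check_add_special(output);    // prints the double-BOS warning
}

One consequence of this layout worth noting in review: callers of the tokenizer can no longer get the BOS/EOS bookkeeping wrong independently per call site, since the three helpers and the constructor are the only places that consult the policy flags.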