From a1f146dba1126c6557d9c7c8696753aba87ec5e4 Mon Sep 17 00:00:00 2001
From: Billel Mokeddem <billel.mokeddem.ml@gmail.com>
Date: Sun, 22 Dec 2024 20:12:46 +0000
Subject: [PATCH] Fix handling of pre-normalized tokens

---
 convert_hf_to_gguf.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 06e3016cc..a55bedc72 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -525,10 +525,9 @@ class Model:
             else:
                 token: str = reverse_vocab[i]
                 if token in added_vocab:
-                    # We need to manually encode and decode the added tokens in case special characters
-                    # used for `\n` / `\t` have been manually added in the added tokens
-                    # To avoid unexpected issues - we make sure to encode single-char tokens
-                    if len(token) == 1:
+                    # The tokenizer in llama.cpp assumes that CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues, round-trip (encode, then decode) any added token not flagged as normalized.
+                    if not tokenizer.added_tokens_decoder[i].normalized:
                         previous_token = token
                         token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
                         if previous_token != token:
@@ -537,6 +536,8 @@ class Model:
                     if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)
                     else:
+                        # NOTE: this was added for Gemma.
+                        # Encoding and decoding the tokens above isn't sufficient for this case.
                         token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
                         toktypes.append(gguf.TokenType.USER_DEFINED)
                 else: