From 92e41ec4b993c75cc6cb4fa92f7d233084741bb8 Mon Sep 17 00:00:00 2001
From: Billel Mokeddem
Date: Wed, 18 Dec 2024 08:20:28 +0000
Subject: [PATCH] Update log to only print when input and output characters are different

---
 convert_hf_to_gguf.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index cd5dd9435..06e3016cc 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -529,8 +529,10 @@ class Model:
                     # used for `\n` / `\t` have been manually added in the added tokens
                     # To avoid unexpected issues - we make sure to encode single-char tokens
                     if len(token) == 1:
-                        logger.info("Ecode-Decode special characters using AutoTokenizer")
+                        previous_token = token
                         token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
 
                     if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)