Update log to only print when input and output characters are different

This commit is contained in:
Billel Mokeddem 2024-12-18 08:20:28 +00:00
parent d8d2f370dc
commit 92e41ec4b9

View file

@@ -529,8 +529,10 @@ class Model:
# used for `\n` / `\t` have been manually added in the added tokens # used for `\n` / `\t` have been manually added in the added tokens
# To avoid unexpected issues - we make sure to encode single-char tokens # To avoid unexpected issues - we make sure to encode single-char tokens
if len(token) == 1: if len(token) == 1:
logger.info("Ecode-Decode special characters using AutoTokenizer") previous_token = token
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
if previous_token != token:
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token): if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
toktypes.append(gguf.TokenType.CONTROL) toktypes.append(gguf.TokenType.CONTROL)