Fix: avoid adding BOS when re-encoding added special tokens

This commit is contained in:
Billel Mokeddem 2024-12-18 04:58:00 +00:00
parent d146334c11
commit fc055407b7

View file

@ -527,7 +527,9 @@ class Model:
if token in added_vocab:
# We need to manually encode and decode the added tokens in case special characters
# used for `\n` / `\t` have been manually added in the added tokens
token = tokenizer.decode(tokenizer.encode(token))
if len(token) == 1:
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
toktypes.append(gguf.TokenType.CONTROL)
else: