From fc055407b7c557f8e935aa2191485b0de967e2a1 Mon Sep 17 00:00:00 2001
From: Billel Mokeddem
Date: Wed, 18 Dec 2024 04:58:00 +0000
Subject: [PATCH] Add fix for adding bos to added special tokens

---
 convert_hf_to_gguf.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 66e268af6..77ab5ef4a 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -527,7 +527,9 @@ class Model:
                 if token in added_vocab:
                     # We need to manually encode and decode the added tokens in case special characters
                     # used for `\n` / `\t` have been manually added in the added tokens
-                    token = tokenizer.decode(tokenizer.encode(token))
+                    if len(token) == 1:
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+
                     if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)
                     else: