Add special token handling to convert script

This commit is contained in:
Galunid 2023-10-24 12:47:00 +02:00
parent d9c0332323
commit fa2cd7e7b9

View file

@@ -120,7 +120,10 @@ for i in range(vocab_size):
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.USER_DEFINED)
if tokenizer.added_tokens_decoder[i].special:
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.USER_DEFINED)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)