This commit is contained in:
Galunid 2023-10-23 16:57:50 +02:00
parent 217f82e734
commit c04ddb6990

View file

@ -136,9 +136,11 @@ for i in range(vocab_size):
tokens.append(f"[PAD{i}]") tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.USER_DEFINED) toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab: elif reverse_vocab[i] in added_vocab:
# NOTE: wouldn't we like to distinguish CONTROL tokens here?
tokens.append(reverse_vocab[i]) tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.USER_DEFINED) if tokenizer.added_tokens_decoder[i].special:
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.USER_DEFINED)
else: else:
tokens.append(reverse_vocab[i]) tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL) toktypes.append(gguf.TokenType.NORMAL)