Cebtenzzre 2023-10-01 17:04:01 -04:00
parent f18cfeab62
commit 02fbbf9099
3 changed files with 3 additions and 3 deletions


@@ -144,7 +144,7 @@ for i in range(vocab_size):
         print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
         pad_token = f"[PAD{i}]".encode("utf8")
         text = bytearray(pad_token)
-    elif i in added_tokens:
+    elif i in added_token_ids:
         # these tokens are not encoded, see https://github.com/huggingface/transformers/issues/1133
         text = bytearray(reverse_vocab[i].encode('utf-8'))
     else:


@@ -140,7 +140,7 @@ for i in range(vocab_size):
         print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
         pad_token = f"[PAD{i}]".encode("utf8")
         text = bytearray(pad_token)
-    elif i in added_tokens:
+    elif i in added_token_ids:
         # these tokens are not encoded, see https://github.com/huggingface/transformers/issues/1133
         text = bytearray(reverse_vocab[i].encode('utf-8'))
     else:


@@ -128,7 +128,7 @@ for i in range(vocab_size):
         print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
         pad_token = f"[PAD{i}]".encode("utf8")
         text = bytearray(pad_token)
-    elif i in added_tokens:
+    elif i in added_token_ids:
         # these tokens are not encoded, see https://github.com/huggingface/transformers/issues/1133
         text = bytearray(reverse_vocab[i].encode('utf-8'))
     else:
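
The rename makes it explicit that the membership test is against a set of token ids, not the {token string: id} mapping that HuggingFace's get_added_vocab() returns. Below is a minimal standalone sketch of how such a loop fits together; it is an illustration only, not the convert scripts touched by this commit, and the model path plus the exact construction of reverse_vocab and added_token_ids are assumptions.

    # Illustrative sketch only -- not the convert script changed in this commit.
    from transformers import AutoTokenizer

    dir_model = "path/to/hf-model"  # placeholder path for this sketch

    tokenizer = AutoTokenizer.from_pretrained(dir_model)
    vocab_size = len(tokenizer.get_vocab())

    # id -> token string for every token the tokenizer knows about
    reverse_vocab = {token_id: token for token, token_id in tokenizer.get_vocab().items()}

    # get_added_vocab() returns {token string: id}; keep only the ids,
    # since the loop below indexes by id -- hence the name "added_token_ids"
    added_token_ids = set(tokenizer.get_added_vocab().values())

    tokens = []
    for i in range(vocab_size):
        if i not in reverse_vocab:
            print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
            text = bytearray(f"[PAD{i}]".encode("utf8"))
        elif i in added_token_ids:
            # added tokens are not encoded, see https://github.com/huggingface/transformers/issues/1133
            text = bytearray(reverse_vocab[i].encode("utf-8"))
        else:
            text = bytearray(tokenizer.decode([i]).encode("utf-8"))
        tokens.append(text)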