convert : fix incorrect added token dedup in BpeVocab

This commit is contained in:
Jared Van Bortel 2024-03-27 13:35:33 -04:00
parent b2b63d1350
commit d12a63ca3e

View file

@ -387,7 +387,7 @@ class BpeVocab(Vocab):
            (item['content'], item['id'])
            for item in tokenizer_json.get('added_tokens', [])
            # Added tokens here can be duplicates of the main vocabulary.
-           if item['content'] not in bpe_tokenizer)
+           if item['content'] not in self.vocab)
        vocab_size = len(self.vocab)
        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))