convert : fix incorrect added token dedup in BpeVocab
This commit is contained in:
parent
b2b63d1350
commit
d12a63ca3e
1 changed file with 1 addition and 1 deletion
|
@ -387,7 +387,7 @@ class BpeVocab(Vocab):
|
|||
(item['content'], item['id'])
|
||||
for item in tokenizer_json.get('added_tokens', [])
|
||||
# Added tokens here can be duplicates of the main vocabulary.
|
||||
if item['content'] not in bpe_tokenizer)
|
||||
if item['content'] not in self.vocab)
|
||||
|
||||
vocab_size = len(self.vocab)
|
||||
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue