convert : fix incorrect added token dedup in BpeVocab
This commit is contained in:
parent
b2b63d1350
commit
d12a63ca3e
1 changed file with 1 addition and 1 deletion
|
@ -387,7 +387,7 @@ class BpeVocab(Vocab):
|
||||||
(item['content'], item['id'])
|
(item['content'], item['id'])
|
||||||
for item in tokenizer_json.get('added_tokens', [])
|
for item in tokenizer_json.get('added_tokens', [])
|
||||||
# Added tokens here can be duplicates of the main vocabulary.
|
# Added tokens here can be duplicates of the main vocabulary.
|
||||||
if item['content'] not in bpe_tokenizer)
|
if item['content'] not in self.vocab)
|
||||||
|
|
||||||
vocab_size = len(self.vocab)
|
vocab_size = len(self.vocab)
|
||||||
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue