update vocab class
parent 026eb7cd01
commit 2e263ca200
1 changed file with 10 additions and 6 deletions
convert.py | 16 ++++++++++------
@@ -307,15 +307,19 @@ class VocabLoader:
                 "You can install it with `pip install transformers`."
             ) from e
 
-        self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer))
-        vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()}
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), trust_remote_code=True)
+            vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()}
+        except:
+            self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True)
+            vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()}
 
         self.added_tokens_list = []
         self.added_tokens_dict = dict()
         self.added_tokens_ids = set()
 
-        for tok, tokidx in self.tokenizer.get_added_vocab().items():
-            if tokidx >= params.n_vocab or toksize < self.tokenizer.vocab_size:
+        for tok, tokidx in sorted(self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]):
+            if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size:
                 continue
 
             self.added_tokens_list.append(tok)
@@ -324,7 +328,7 @@ class VocabLoader:
 
         self.unk_token_id = self.tokenizer.unk_token_id
         self.specials = {
-            tok: self.tokenizer.vocab[tok]
+            tok: self.tokenizer.get_vocab()[tok]
             for tok in self.tokenizer.all_special_tokens
         }
         print(self.specials)
@@ -343,7 +347,7 @@ class VocabLoader:
 
     def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.tokenizer
-        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.get_vocab().items()}
         special_ids = set(tokenizer.all_special_ids)
 
         for i in range(self.vocab_size_base):
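For context, the pattern the new code follows can be sketched roughly as below (a minimal illustration, not part of convert.py; `load_tokenizer` and the path argument are placeholders): try the fast tokenizer first with trust_remote_code=True, fall back to the slow one with use_fast=False, and read the vocabulary through get_vocab(), the documented accessor that works for both fast and slow tokenizers, instead of the .vocab attribute.

# Rough sketch of the loading pattern the diff introduces (assumes `transformers` is installed;
# `fname_tokenizer` is a placeholder path to a model/tokenizer directory).
from transformers import AutoTokenizer

def load_tokenizer(fname_tokenizer: str):
    try:
        # Prefer the fast tokenizer; trust_remote_code allows custom tokenizer classes.
        tokenizer = AutoTokenizer.from_pretrained(fname_tokenizer, trust_remote_code=True)
    except Exception:
        # Fall back to the slow (e.g. SentencePiece-based) tokenizer.
        tokenizer = AutoTokenizer.from_pretrained(fname_tokenizer, use_fast=False, trust_remote_code=True)

    vocab = tokenizer.get_vocab()                                   # token -> id
    reverse_vocab = {tok_id: tok for tok, tok_id in vocab.items()}  # id -> token
    return tokenizer, vocab, reverse_vocab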