update vocab class

wonjun Jang 2023-11-19 10:20:06 +09:00 committed by GitHub
parent 026eb7cd01
commit 2e263ca200


@@ -307,15 +307,19 @@ class VocabLoader:
                 "You can install it with `pip install transformers`."
             ) from e
 
-        self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer))
-        vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()}
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), trust_remote_code=True)
+            vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()}
+        except:
+            self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True)
+            vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()}
 
         self.added_tokens_list = []
         self.added_tokens_dict = dict()
         self.added_tokens_ids = set()
 
-        for tok, tokidx in self.tokenizer.get_added_vocab().items():
-            if tokidx >= params.n_vocab or toksize < self.tokenizer.vocab_size:
+        for tok, tokidx in sorted(self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]):
+            if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size:
                 continue
 
             self.added_tokens_list.append(tok)
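For context, the new loading path tries the fast tokenizer first and falls back to the slow one when construction fails. A minimal standalone sketch of that pattern, assuming a Hugging Face model directory; the helper name load_hf_tokenizer is illustrative and not part of convert.py:

from transformers import AutoTokenizer

def load_hf_tokenizer(fname_tokenizer: str):
    try:
        # Prefer the fast (Rust-backed) tokenizer; trust_remote_code allows
        # repositories that ship a custom tokenizer class.
        return AutoTokenizer.from_pretrained(fname_tokenizer, trust_remote_code=True)
    except Exception:
        # Fall back to the slow (pure Python) tokenizer when the fast one
        # cannot be built for this model.
        return AutoTokenizer.from_pretrained(fname_tokenizer, use_fast=False, trust_remote_code=True)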
@@ -324,7 +328,7 @@ class VocabLoader:
 
         self.unk_token_id = self.tokenizer.unk_token_id
         self.specials = {
-            tok: self.tokenizer.vocab[tok]
+            tok: self.tokenizer.get_vocab()[tok]
             for tok in self.tokenizer.all_special_tokens
         }
         print(self.specials)
@@ -343,7 +347,7 @@ class VocabLoader:
 
     def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.tokenizer
-        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.get_vocab().items()}
         special_ids = set(tokenizer.all_special_ids)
 
         for i in range(self.vocab_size_base):
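The remaining hunks switch from the .vocab attribute to the public get_vocab() method, which works for both slow and fast tokenizers and includes added tokens. A rough illustration of the id-to-token mapping built in hf_tokens, assuming any installed Hugging Face tokenizer (the model name below is only an example):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
vocab = tokenizer.get_vocab()                                   # token string -> id
reverse_vocab = {tok_id: tok for tok, tok_id in vocab.items()}  # id -> token string
print(len(reverse_vocab), reverse_vocab[0])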