Fix when params.n_vocab < tokenizer vocab size
parent cc1f3fcfad
commit 026eb7cd01

1 changed file with 19 additions and 12 deletions
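Previously the loader copied every entry of tokenizer.get_added_vocab() unconditionally, so a checkpoint whose params.n_vocab is smaller than the tokenizer's full vocabulary could end up with token ids pointing past the embedding matrix. After this change, VocabLoader receives params and keeps only added tokens whose ids lie in [tokenizer.vocab_size, params.n_vocab). A minimal, self-contained sketch of that rule (FakeTokenizer is a hypothetical stand-in for the transformers tokenizer; only vocab_size and get_added_vocab() are assumed):

# Sketch of the added-token filter this commit introduces.
# FakeTokenizer is hypothetical; it mimics the two pieces of the
# tokenizer API that convert.py relies on here.
class FakeTokenizer:
    vocab_size = 32000  # size of the base (non-added) vocabulary

    def get_added_vocab(self) -> dict[str, int]:
        # one id inside the base range, one in range, one past n_vocab
        return {"<dup>": 31999, "<pad>": 32000, "<overflow>": 32002}

def filter_added_tokens(tokenizer: FakeTokenizer, n_vocab: int) -> dict[str, int]:
    kept: dict[str, int] = {}
    for tok, tokidx in tokenizer.get_added_vocab().items():
        # the commit's rule: skip ids at or past the model's n_vocab and
        # ids that already fall inside the base vocabulary
        if tokidx >= n_vocab or tokidx < tokenizer.vocab_size:
            continue
        kept[tok] = tokidx
    return kept

print(filter_added_tokens(FakeTokenizer(), n_vocab=32001))  # {'<pad>': 32000}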
convert.py | 31 +++++++++++++++++++------------
@@ -298,7 +298,7 @@ class Params:
 
 
 class VocabLoader:
-    def __init__(self, fname_tokenizer: Path) -> None:
+    def __init__(self, params: Params, fname_tokenizer: Path) -> None:
         try:
             from transformers import AutoTokenizer
         except ImportError as e:
@@ -309,10 +309,18 @@ class VocabLoader:
 
         self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer))
         vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()}
 
-        self.added_tokens_list = [tok for tok in self.tokenizer.get_added_vocab()]
-        self.added_tokens_dict = dict(self.tokenizer.get_added_vocab())
-        self.added_tokens_ids = set(self.tokenizer.get_added_vocab().values())
+        self.added_tokens_list = []
+        self.added_tokens_dict = dict()
+        self.added_tokens_ids = set()
+
+        for tok, tokidx in self.tokenizer.get_added_vocab().items():
+            if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size:
+                continue
+
+            self.added_tokens_list.append(tok)
+            self.added_tokens_dict[tok] = tokidx
+            self.added_tokens_ids.add(tokidx)
 
         self.unk_token_id = self.tokenizer.unk_token_id
         self.specials = {
@@ -321,8 +329,8 @@ class VocabLoader:
         }
         print(self.specials)
         self.special_ids = set(self.tokenizer.all_special_ids)
-        self.vocab_size_base: int = len(vocab_set)
-        self.vocab_size: int = len(vocab_set)
+        self.vocab_size_base: int = self.tokenizer.vocab_size
+        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
         self.fname_tokenizer = fname_tokenizer
 
         vocab_file = "tokenizer.model"
@@ -374,7 +382,6 @@ class VocabLoader:
     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-
         for text in self.added_tokens_list:
 
             if text in self.specials:
 
                 toktype = self.get_token_type(self.specials[text])
@@ -1095,14 +1102,14 @@ def vocab_check_and_append_path(path: Path, vocab_file: str) -> bool:
     return path
 
 
-def load_vocab(path: Path) -> Vocab:
+def load_vocab(params: Params, path: Path) -> Vocab:
     # Be extra-friendly and accept either a file or a directory. Also, if it's
     # a directory, it might be the model directory, and tokenizer.model might
     # be in the parent of that.
 
     print(f"Loading vocab file '{path}'")
 
-    return VocabLoader(path)
+    return VocabLoader(params, path)
 
 
 def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
@@ -1183,7 +1190,7 @@ def main(args_in: list[str] | None = None) -> None:
         if not args.outfile:
             raise ValueError("need --outfile if using --vocab-only")
         # FIXME: Try to respect vocab_dir somehow?
-        vocab = load_vocab(args.vocab_dir or args.model)
+        vocab = load_vocab(params, args.vocab_dir or args.model)
         special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
                                           load_merges = True,
                                           n_vocab = vocab.vocab_size)
@@ -1197,7 +1204,7 @@ def main(args_in: list[str] | None = None) -> None:
         vocab = model_plus.vocab
     else:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
-        vocab = load_vocab(vocab_dir)
+        vocab = load_vocab(params, vocab_dir)
 
     # FIXME: Try to respect vocab_dir somehow?
     print(f"Vocab info: {vocab}")
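Net effect on the sizes written to GGUF: vocab_size_base now comes straight from tokenizer.vocab_size rather than len(vocab_set), and vocab_size adds only the added tokens that survive the filter, so the n_vocab handed to gguf.SpecialVocab tracks the model's actual embedding rows. Both call paths in main() now pass params into load_vocab, which also means params must be computed before the vocab is loaded. A worked example of the new accounting, with illustrative numbers only:

# Illustrative numbers only: the vocab_size accounting after this commit.
tokenizer_vocab_size = 32000              # tokenizer.vocab_size (base vocab)
added_ids = [32000, 32001, 32002, 32003]  # ids from get_added_vocab()
n_vocab = 32002                           # embedding rows in the checkpoint

kept = [i for i in added_ids if tokenizer_vocab_size <= i < n_vocab]
vocab_size = tokenizer_vocab_size + len(kept)
print(vocab_size)  # 32002 -- now matches n_vocab instead of overshooting it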