Remove unused variables/functions, add types to class variables and methods, delete blank lines
This commit is contained in:
parent 74d80a8862
commit 61edd1bc59
1 changed file with 33 additions and 47 deletions
50 convert.py
@@ -18,6 +18,7 @@ import sys
 import time
 import zipfile
 from abc import ABCMeta, abstractmethod
+from collections import OrderedDict
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
@@ -314,28 +315,23 @@ class VocabLoader:
         self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True)
         vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()}
 
-        self.added_tokens_list = []
-        self.added_tokens_dict = dict()
-        self.added_tokens_ids = set()
+        self.added_tokens_dict: OrderedDict[str, int] = OrderedDict()
 
         for tok, tokidx in sorted(self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]):
             if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size:
                 continue
 
-            self.added_tokens_list.append(tok)
             self.added_tokens_dict[tok] = tokidx
-            self.added_tokens_ids.add(tokidx)
 
-        self.unk_token_id = self.tokenizer.unk_token_id
-        self.specials = {
+        self.unk_token_id: int = self.tokenizer.unk_token_id
+        self.specials: dict[str, int] = {
             tok: self.tokenizer.get_vocab()[tok]
             for tok in self.tokenizer.all_special_tokens
         }
-        print(self.specials)
-        self.special_ids = set(self.tokenizer.all_special_ids)
+        self.special_ids: set[int] = set(self.tokenizer.all_special_ids)
         self.vocab_size_base: int = self.tokenizer.vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
+        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict)
+        self.fname_tokenizer: str = fname_tokenizer
 
         vocab_file = "tokenizer.model"
         path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
@@ -348,16 +344,16 @@ class VocabLoader:
     def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.tokenizer
         reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.get_vocab().items()}
-        special_ids = set(tokenizer.all_special_ids)
+        added_tokens_ids = set(self.added_tokens_dict.values())
 
         for i in range(self.vocab_size_base):
-            if i in self.added_tokens_ids:
+            if i in added_tokens_ids:
                 continue
 
             text = reverse_vocab[i].encode("utf-8")
             yield text, self.get_token_score(i), self.get_token_type(i)
 
-    def get_token_type(self, token_id):
+    def get_token_type(self, token_id: int) -> gguf.TokenType:
         toktype = gguf.TokenType.NORMAL
 
         if self.spm is not None and token_id < self.spm.vocab_size():
@@ -377,7 +373,7 @@ class VocabLoader:
 
         return toktype
 
-    def get_token_score(self, token_id):
+    def get_token_score(self, token_id: int) -> float:
         if self.spm is not None and token_id < self.spm.vocab_size():
             return self.spm.get_score(token_id)
         else:
@@ -385,7 +381,7 @@ class VocabLoader:
 
     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
 
-        for text in self.added_tokens_list:
+        for text in self.added_tokens_dict:
             if text in self.specials:
 
                 toktype = self.get_token_type(self.specials[text])
@@ -397,7 +393,7 @@ class VocabLoader:
 
             yield text.encode("utf-8"), score, toktype
 
-    def has_newline_token(self):
+    def has_newline_token(self) -> bool:
         return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab
 
     def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
@@ -432,7 +428,7 @@ class VocabLoader:
             "if it's in another directory, pass the directory as --vocab-dir")
 
     def __repr__(self) -> str:
-        return f"<VocabLoader with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+        return f"<VocabLoader with {self.vocab_size_base} base tokens and {len(self.added_tokens_dict)} added tokens>"
 
 Vocab: TypeAlias = 'VocabLoader'
 
@@ -814,7 +810,7 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
     if params.n_vocab != vocab.vocab_size:
         if params.n_vocab == vocab.vocab_size:
             print("Ignoring added_tokens.json since model matches vocab size without it.")
-            vocab.added_tokens_list = []
+            vocab.added_tokens_dict = OrderedDict()
             vocab.vocab_size = vocab.vocab_size
             return
 
@@ -822,7 +818,7 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
             pad_count = params.n_vocab - vocab.vocab_size
             print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
             for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
-                vocab.added_tokens_list.append(f'<dummy{i:05}>')
+                vocab.added_tokens_dict[f'<dummy{i:05}>'] = -1
             vocab.vocab_size = params.n_vocab
             return
         msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
@@ -1123,16 +1119,6 @@ def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]:
         return path
 
 
-def load_vocab(params: Params, path: Path) -> Vocab:
-    # Be extra-friendly and accept either a file or a directory. Also, if it's
-    # a directory, it might be the model directory, and tokenizer.model might
-    # be in the parent of that.
-
-    print(f"Loading vocab file '{path}'")
-
-    return VocabLoader(params, path)
-
-
 def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
     namestr = {
         GGMLFileType.AllF32: "f32",
@@ -1215,7 +1201,7 @@ def main(args_in: list[str] | None = None) -> None:
         if not args.outfile:
             raise ValueError("need --outfile if using --vocab-only")
         # FIXME: Try to respect vocab_dir somehow?
-        vocab = load_vocab(params, args.vocab_dir or args.model)
+        vocab = VocabLoader(params, args.vocab_dir or args.model)
         special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
                                           load_merges = True,
                                           n_vocab = vocab.vocab_size)
@@ -1229,7 +1215,7 @@ def main(args_in: list[str] | None = None) -> None:
         vocab = model_plus.vocab
     else:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
-        vocab = load_vocab(params, vocab_dir)
+        vocab = VocabLoader(params, vocab_dir)
 
     # FIXME: Try to respect vocab_dir somehow?
     print(f"Vocab info: {vocab}")
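Below is a minimal usage sketch, not part of the commit, illustrating the call-site change the diff makes: load_vocab() is removed, so callers construct VocabLoader directly, and added_tokens_dict (an OrderedDict) replaces added_tokens_list. The params object and the vocab directory path are placeholders assumed to come from the surrounding convert.py flow.

# Sketch only: assumes convert.py's Params/VocabLoader are in scope and that
# vocab_dir points at a directory containing the HF tokenizer files.
from pathlib import Path

vocab_dir = Path("models/my-model")      # hypothetical model/tokenizer directory
vocab = VocabLoader(params, vocab_dir)   # replaces the removed load_vocab(params, vocab_dir)

# Typed attributes introduced by this commit:
assert isinstance(vocab.vocab_size_base, int)
print(len(vocab.added_tokens_dict))      # OrderedDict[str, int] replaces added_tokens_list
print(vocab)                             # __repr__ now reports len(added_tokens_dict)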