Remove unused variables/functions, add types to class variables and methods, delete blank lines

wonjun Jang 2023-11-28 16:23:27 +09:00 committed by GitHub
parent 74d80a8862
commit 61edd1bc59


@@ -18,6 +18,7 @@ import sys
 import time
 import zipfile
 from abc import ABCMeta, abstractmethod
+from collections import OrderedDict
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
@@ -314,28 +315,23 @@ class VocabLoader:
         self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True)
         vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()}
 
-        self.added_tokens_list = []
-        self.added_tokens_dict = dict()
-        self.added_tokens_ids = set()
+        self.added_tokens_dict: OrderedDict[str, int] = OrderedDict()
 
         for tok, tokidx in sorted(self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]):
             if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size:
                 continue
 
-            self.added_tokens_list.append(tok)
             self.added_tokens_dict[tok] = tokidx
-            self.added_tokens_ids.add(tokidx)
 
-        self.unk_token_id = self.tokenizer.unk_token_id
-        self.specials = {
+        self.unk_token_id: int = self.tokenizer.unk_token_id
+        self.specials: dict[str, int] = {
             tok: self.tokenizer.get_vocab()[tok]
             for tok in self.tokenizer.all_special_tokens
         }
-        print(self.specials)
-        self.special_ids = set(self.tokenizer.all_special_ids)
+        self.special_ids: set[int] = set(self.tokenizer.all_special_ids)
         self.vocab_size_base: int = self.tokenizer.vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
+        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict)
+        self.fname_tokenizer: str = fname_tokenizer
 
         vocab_file = "tokenizer.model"
         path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
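Note on the hunk above: the three parallel containers (added_tokens_list, added_tokens_dict, added_tokens_ids) collapse into a single insertion-ordered dict; the list of names and the set of ids are now derived from it on demand. A minimal, self-contained sketch of that filtering step, with made-up sample values (the real code reads them from AutoTokenizer, and the helper name collect_added_tokens is illustrative only):

from __future__ import annotations

from collections import OrderedDict

def collect_added_tokens(added_vocab: dict[str, int], vocab_size_base: int, n_vocab: int) -> OrderedDict[str, int]:
    # Keep only added tokens whose ids fall in [vocab_size_base, n_vocab),
    # visiting them in id order so insertion order matches token id order.
    added: OrderedDict[str, int] = OrderedDict()
    for tok, tokidx in sorted(added_vocab.items(), key=lambda x: x[1]):
        if tokidx >= n_vocab or tokidx < vocab_size_base:
            continue
        added[tok] = tokidx
    return added

# Hypothetical example: base vocab of 32000 tokens, model expects 32002 slots.
print(collect_added_tokens({"<pad>": 32000, "<extra>": 32001, "<s>": 1}, 32000, 32002))
# OrderedDict([('<pad>', 32000), ('<extra>', 32001)])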
@@ -348,16 +344,16 @@ class VocabLoader:
     def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.tokenizer
         reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.get_vocab().items()}
-        special_ids = set(tokenizer.all_special_ids)
+        added_tokens_ids = set(self.added_tokens_dict.values())
 
         for i in range(self.vocab_size_base):
-            if i in self.added_tokens_ids:
+            if i in added_tokens_ids:
                 continue
 
             text = reverse_vocab[i].encode("utf-8")
             yield text, self.get_token_score(i), self.get_token_type(i)
 
-    def get_token_type(self, token_id):
+    def get_token_type(self, token_id: int) -> gguf.TokenType:
         toktype = gguf.TokenType.NORMAL
         if self.spm is not None and token_id < self.spm.vocab_size():
@@ -377,7 +373,7 @@ class VocabLoader:
         return toktype
 
-    def get_token_score(self, token_id):
+    def get_token_score(self, token_id: int) -> float:
         if self.spm is not None and token_id < self.spm.vocab_size():
             return self.spm.get_score(token_id)
         else:
@@ -385,7 +381,7 @@ class VocabLoader:
     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
+        for text in self.added_tokens_dict:
             if text in self.specials:
                 toktype = self.get_token_type(self.specials[text])
@@ -397,7 +393,7 @@ class VocabLoader:
             yield text.encode("utf-8"), score, toktype
 
-    def has_newline_token(self):
+    def has_newline_token(self) -> bool:
         return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab
 
     def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
@@ -432,7 +428,7 @@ class VocabLoader:
                 "if it's in another directory, pass the directory as --vocab-dir")
 
     def __repr__(self) -> str:
-        return f"<VocabLoader with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+        return f"<VocabLoader with {self.vocab_size_base} base tokens and {len(self.added_tokens_dict)} added tokens>"
 
 Vocab: TypeAlias = 'VocabLoader'
@@ -814,7 +810,7 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
     if params.n_vocab != vocab.vocab_size:
         if params.n_vocab == vocab.vocab_size:
             print("Ignoring added_tokens.json since model matches vocab size without it.")
-            vocab.added_tokens_list = []
+            vocab.added_tokens_dict = OrderedDict()
             vocab.vocab_size = vocab.vocab_size
             return
@@ -822,7 +818,7 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
             pad_count = params.n_vocab - vocab.vocab_size
             print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
             for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
-                vocab.added_tokens_list.append(f'<dummy{i:05}>')
+                vocab.added_tokens_dict[f'<dummy{i:05}>'] = -1
             vocab.vocab_size = params.n_vocab
             return
         msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
@@ -1123,16 +1119,6 @@ def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]:
         return path
 
-def load_vocab(params: Params, path: Path) -> Vocab:
-    # Be extra-friendly and accept either a file or a directory. Also, if it's
-    # a directory, it might be the model directory, and tokenizer.model might
-    # be in the parent of that.
-    print(f"Loading vocab file '{path}'")
-
-    return VocabLoader(params, path)
-
 
 def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
     namestr = {
         GGMLFileType.AllF32: "f32",
@@ -1215,7 +1201,7 @@ def main(args_in: list[str] | None = None) -> None:
         if not args.outfile:
             raise ValueError("need --outfile if using --vocab-only")
         # FIXME: Try to respect vocab_dir somehow?
-        vocab = load_vocab(params, args.vocab_dir or args.model)
+        vocab = VocabLoader(params, args.vocab_dir or args.model)
         special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
                                           load_merges = True,
                                           n_vocab = vocab.vocab_size)
@@ -1229,7 +1215,7 @@ def main(args_in: list[str] | None = None) -> None:
         vocab = model_plus.vocab
     else:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
-        vocab = load_vocab(params, vocab_dir)
+        vocab = VocabLoader(params, vocab_dir)
 
     # FIXME: Try to respect vocab_dir somehow?
     print(f"Vocab info: {vocab}")