convert-hf : HfVocab -> LlamaHfVocab

Jared Van Bortel 2024-03-27 16:13:09 -04:00
parent 79852ab884
commit ebad773e9d
2 changed files with 9 additions and 14 deletions

convert-hf-to-gguf.py

@@ -23,7 +23,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 
-from convert import HfVocab
+from convert import LlamaHfVocab
 
 
 ###### MODEL DEFINITIONS ######
@@ -370,12 +370,8 @@ class Model(ABC):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-    def _set_vocab_hf(self):
-        path = self.dir_model
-        added_tokens_path = self.dir_model
-        vocab = HfVocab(
-            path, added_tokens_path if added_tokens_path.exists() else None
-        )
+    def _set_vocab_llama_hf(self):
+        vocab = LlamaHfVocab(self.dir_model)
         tokens = []
         scores = []
         toktypes = []
@@ -1097,7 +1093,7 @@ class MiniCPMModel(Model):
         self.gguf_writer.add_file_type(self.ftype)
 
     def set_vocab(self):
-        self._set_vocab_hf()
+        self._set_vocab_llama_hf()
 
     def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
         if n_kv_head is not None and n_head != n_kv_head:
@@ -1698,11 +1694,8 @@ class BertModel(Model):
         self.gguf_writer.add_pooling_type(pooling_type)
 
     def set_vocab(self):
-        path = self.dir_model
-        added_tokens_path = self.dir_model if self.dir_model.exists() else None
-
         # use huggingface vocab to get all tokens
-        vocab = HfVocab(path, added_tokens_path)
+        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
         tokens, scores, toktypes = zip(*vocab.all_tokens())
         assert len(tokens) == vocab.vocab_size
         self.vocab_size = vocab.vocab_size
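
Note: for context, a rough usage sketch of the renamed class after this commit. The model paths are hypothetical; LlamaHfVocab, ignore_nonllama, all_tokens(), and vocab_size are taken from the diff above.

    from pathlib import Path
    from convert import LlamaHfVocab

    # llama-style checkpoint: the strict BPE/byte_fallback pre-check applies
    vocab = LlamaHfVocab(Path("models/my-llama-model"))  # hypothetical path

    # WordPiece (BERT-style) checkpoint: the new flag skips the pre-check
    bert_vocab = LlamaHfVocab(Path("models/my-bert-model"), ignore_nonllama=True)

    # all_tokens() yields (text, score, toktype) triples, as consumed by
    # _set_vocab_llama_hf and BertModel.set_vocab above
    for text, score, toktype in vocab.all_tokens():
        pass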

convert.py

@@ -516,7 +516,7 @@ class LlamaHfVocab(Vocab):
     tokenizer_model = "llama"
     name = "hfft"
 
-    def __init__(self, base_path: Path):
+    def __init__(self, base_path: Path, ignore_nonllama: bool = False):
         fname_tokenizer = base_path / FAST_TOKENIZER_FILE
         # if this fails, FileNotFoundError propagates to caller
         with open(fname_tokenizer, encoding='utf-8') as f:
@@ -524,7 +524,9 @@
 
         # pre-check so we know if we need transformers
         tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        if (
+        if ignore_nonllama:
+            pass  # workaround incorrect use of this class for WordPiece
+        elif (
             tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
             or tokenizer_json['decoder']['type'] != 'Sequence'
         ):
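
Note: taken together, the changed __init__ pre-check behaves roughly like this sketch (a simplification, not the full class; the value of FAST_TOKENIZER_FILE and the ValueError are assumptions, since the diff shows neither the constant's value nor the actual failure path).

    import json
    from pathlib import Path
    from typing import Any

    FAST_TOKENIZER_FILE = "tokenizer.json"  # assumed value of the convert.py constant

    def precheck_fast_tokenizer(base_path: Path, ignore_nonllama: bool = False) -> dict[str, Any]:
        # if this fails, FileNotFoundError propagates to caller (as in __init__)
        with open(base_path / FAST_TOKENIZER_FILE, encoding='utf-8') as f:
            tokenizer_json = json.load(f)

        # pre-check so we know if we need transformers
        tokenizer_model: dict[str, Any] = tokenizer_json['model']
        if ignore_nonllama:
            pass  # workaround incorrect use of this class for WordPiece (BertModel)
        elif (
            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
            or tokenizer_json['decoder']['type'] != 'Sequence'
        ):
            # assumed error type; the real __init__'s failure mode is outside this diff
            raise ValueError('not a llama-style byte-fallback BPE fast tokenizer')
        return tokenizer_json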