convert-hf : HfVocab -> LlamaHfVocab

Jared Van Bortel 2024-03-27 16:13:09 -04:00
parent 79852ab884
commit ebad773e9d
2 changed files with 9 additions and 14 deletions

convert-hf-to-gguf.py

@@ -23,7 +23,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 
-from convert import HfVocab
+from convert import LlamaHfVocab
 
 
 ###### MODEL DEFINITIONS ######
@@ -370,12 +370,8 @@ class Model(ABC):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-    def _set_vocab_hf(self):
-        path = self.dir_model
-        added_tokens_path = self.dir_model
-        vocab = HfVocab(
-            path, added_tokens_path if added_tokens_path.exists() else None
-        )
+    def _set_vocab_llama_hf(self):
+        vocab = LlamaHfVocab(self.dir_model)
         tokens = []
         scores = []
         toktypes = []
@@ -1097,7 +1093,7 @@ class MiniCPMModel(Model):
         self.gguf_writer.add_file_type(self.ftype)
 
     def set_vocab(self):
-        self._set_vocab_hf()
+        self._set_vocab_llama_hf()
 
     def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
         if n_kv_head is not None and n_head != n_kv_head:
@@ -1698,11 +1694,8 @@ class BertModel(Model):
         self.gguf_writer.add_pooling_type(pooling_type)
 
     def set_vocab(self):
-        path = self.dir_model
-        added_tokens_path = self.dir_model if self.dir_model.exists() else None
-
         # use huggingface vocab to get all tokens
-        vocab = HfVocab(path, added_tokens_path)
+        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
         tokens, scores, toktypes = zip(*vocab.all_tokens())
         assert len(tokens) == vocab.vocab_size
         self.vocab_size = vocab.vocab_size
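
Note: for context, a rough usage sketch of the renamed class after this commit. The model paths are hypothetical; LlamaHfVocab, ignore_nonllama, all_tokens(), and vocab_size are taken from the diff above.

    from pathlib import Path
    from convert import LlamaHfVocab

    # llama-style checkpoint: the strict BPE/byte_fallback pre-check applies
    vocab = LlamaHfVocab(Path("models/my-llama-model"))  # hypothetical path

    # WordPiece (BERT-style) checkpoint: the new flag skips the pre-check
    bert_vocab = LlamaHfVocab(Path("models/my-bert-model"), ignore_nonllama=True)

    # all_tokens() yields (text, score, toktype) triples, as consumed by
    # _set_vocab_llama_hf and BertModel.set_vocab above
    for text, score, toktype in vocab.all_tokens():
        pass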

convert.py

@@ -516,7 +516,7 @@ class LlamaHfVocab(Vocab):
     tokenizer_model = "llama"
     name = "hfft"
 
-    def __init__(self, base_path: Path):
+    def __init__(self, base_path: Path, ignore_nonllama: bool = False):
         fname_tokenizer = base_path / FAST_TOKENIZER_FILE
         # if this fails, FileNotFoundError propagates to caller
         with open(fname_tokenizer, encoding='utf-8') as f:
@@ -524,7 +524,9 @@
 
         # pre-check so we know if we need transformers
         tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        if (
+        if ignore_nonllama:
+            pass  # workaround incorrect use of this class for WordPiece
+        elif (
             tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
             or tokenizer_json['decoder']['type'] != 'Sequence'
         ):
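
Note: taken together, the changed __init__ pre-check behaves roughly like this sketch (a simplification, not the full class; the value of FAST_TOKENIZER_FILE and the ValueError are assumptions, since the diff shows neither the constant's value nor the actual failure path).

    import json
    from pathlib import Path
    from typing import Any

    FAST_TOKENIZER_FILE = "tokenizer.json"  # assumed value of the convert.py constant

    def precheck_fast_tokenizer(base_path: Path, ignore_nonllama: bool = False) -> dict[str, Any]:
        # if this fails, FileNotFoundError propagates to caller (as in __init__)
        with open(base_path / FAST_TOKENIZER_FILE, encoding='utf-8') as f:
            tokenizer_json = json.load(f)

        # pre-check so we know if we need transformers
        tokenizer_model: dict[str, Any] = tokenizer_json['model']
        if ignore_nonllama:
            pass  # workaround incorrect use of this class for WordPiece (BertModel)
        elif (
            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
            or tokenizer_json['decoder']['type'] != 'Sequence'
        ):
            # assumed error type; the real __init__'s failure mode is outside this diff
            raise ValueError('not a llama-style byte-fallback BPE fast tokenizer')
        return tokenizer_json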