convert : refactor vocab selection logic

Fixes #5973
Fixes #6216
commit 79852ab884
parent 2e6fd63b29
Author: Jared Van Bortel
Date:   2024-03-27 15:43:16 -04:00


@@ -44,6 +44,9 @@ ARCH = gguf.MODEL_ARCH.LLAMA
 
 DEFAULT_CONCURRENCY = 8
 
+ADDED_TOKENS_FILE = 'added_tokens.json'
+FAST_TOKENIZER_FILE = 'tokenizer.json'
+
 #
 # data types
 #
@@ -367,32 +370,42 @@ class BpeVocab(Vocab):
     tokenizer_model = "gpt2"
     name = "bpe"
 
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None):
-        with open(fname_tokenizer, encoding="utf-8") as f:
-            bpe_tokenizer = json.load(f)
-
-        if isinstance(bpe_tokenizer.get('model'), dict):
-            self.vocab = bpe_tokenizer["model"]["vocab"]
-        else:
-            self.vocab = bpe_tokenizer
-
-        added_tokens: dict[str, int]
-        if fname_added_tokens is not None:
-            # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
-            with open(fname_added_tokens, encoding="utf-8") as f:
-                added_tokens = json.load(f)
-        else:
-            # Fall back to trying to find the added tokens in tokenizer.json
-            tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
-            if not tokenizer_json_file.is_file():
-                added_tokens = {}
-            else:
-                with open(tokenizer_json_file, encoding="utf-8") as f:
-                    tokenizer_json = json.load(f)
-                added_tokens = dict(
-                    (item['content'], item['id'])
-                    for item in tokenizer_json.get('added_tokens', [])
-                    # Added tokens here can be duplicates of the main vocabulary.
-                    if item['content'] not in self.vocab)
+    def __init__(self, base_path: Path):
+        added_tokens: dict[str, int] = {}
+
+        if (fname_tokenizer := base_path / 'vocab.json').exists():
+            # "slow" tokenizer
+            with open(fname_tokenizer, encoding="utf-8") as f:
+                self.vocab = json.load(f)
+
+            try:
+                # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
+                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
+                    added_tokens = json.load(f)
+            except FileNotFoundError:
+                pass
+        else:
+            # "fast" tokenizer
+            fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+
+            # if this fails, FileNotFoundError propagates to caller
+            with open(fname_tokenizer, encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+
+            tokenizer_model: dict[str, Any] = tokenizer_json['model']
+            if (
+                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
+                or tokenizer_json['decoder']['type'] != 'ByteLevel'
+            ):
+                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
+
+            self.vocab = tokenizer_model["vocab"]
+
+            if (added := tokenizer_json.get('added_tokens')) is not None:
+                # Added tokens here can be duplicates of the main vocabulary.
+                added_tokens = {item['content']: item['id']
+                                for item in added
+                                if item['content'] not in self.vocab}
 
         vocab_size = len(self.vocab)
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
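
The refactored BpeVocab no longer receives explicit tokenizer and added-token paths; it is handed the model directory and decides for itself whether a GPT-2-style BPE tokenizer is present, raising FileNotFoundError when it is not. A minimal standalone sketch of that pre-check (the helper name looks_like_gpt2_bpe is hypothetical and not part of the commit):

import json
from pathlib import Path
from typing import Any

def looks_like_gpt2_bpe(base_path: Path) -> bool:
    # Hypothetical helper mirroring BpeVocab's "fast" tokenizer pre-check:
    # accept only a BPE model without byte_fallback and with a ByteLevel decoder.
    fname_tokenizer = base_path / 'tokenizer.json'
    if not fname_tokenizer.exists():
        return False
    with open(fname_tokenizer, encoding="utf-8") as f:
        tokenizer_json = json.load(f)
    tokenizer_model: dict[str, Any] = tokenizer_json['model']
    return (
        tokenizer_model['type'] == 'BPE'
        and not tokenizer_model.get('byte_fallback', False)
        and tokenizer_json['decoder']['type'] == 'ByteLevel'
    )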
@@ -432,15 +445,20 @@ class SentencePieceVocab(Vocab):
     tokenizer_model = "llama"
     name = "spm"
 
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None):
-        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
-        added_tokens: dict[str, int]
-        if fname_added_tokens is not None:
-            with open(fname_added_tokens, encoding="utf-8") as f:
-                added_tokens = json.load(f)
-        else:
-            added_tokens = {}
+    def __init__(self, base_path: Path):
+        added_tokens: dict[str, int] = {}
+        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
+            # normal location
+            try:
+                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
+                    added_tokens = json.load(f)
+            except FileNotFoundError:
+                pass
+        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
+            # not found in alternate location either
+            raise FileNotFoundError('Cannot find tokenizer.model')
+
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
 
         vocab_size = self.sentencepiece_tokenizer.vocab_size()
 
         new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
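
SentencePieceVocab now performs its own two-location lookup for tokenizer.model: first next to the model, then one directory up, otherwise it raises FileNotFoundError so the factory can move on to the next vocab type. A minimal sketch of that lookup in isolation (find_tokenizer_model is a hypothetical name used only for illustration):

from pathlib import Path

def find_tokenizer_model(base_path: Path) -> Path:
    # Hypothetical helper restating the lookup order used by SentencePieceVocab.
    if (fname := base_path / 'tokenizer.model').exists():
        return fname  # normal location, alongside the model files
    if (fname := base_path.parent / 'tokenizer.model').exists():
        return fname  # alternate location, one directory up
    raise FileNotFoundError('Cannot find tokenizer.model')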
@@ -494,27 +512,40 @@ class SentencePieceVocab(Vocab):
         return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
-class HfVocab(Vocab):
+class LlamaHfVocab(Vocab):
     tokenizer_model = "llama"
     name = "hfft"
 
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None):
+    def __init__(self, base_path: Path):
+        fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+        # if this fails, FileNotFoundError propagates to caller
+        with open(fname_tokenizer, encoding='utf-8') as f:
+            tokenizer_json = json.load(f)
+
+        # pre-check so we know if we need transformers
+        tokenizer_model: dict[str, Any] = tokenizer_json['model']
+        if (
+            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
+            or tokenizer_json['decoder']['type'] != 'Sequence'
+        ):
+            raise FileNotFoundError('Cannot find Llama BPE tokenizer')
+
         try:
             from transformers import AutoTokenizer
         except ImportError as e:
             raise ImportError(
-                "To use HfVocab, please install the `transformers` package. "
+                "To use LlamaHfVocab, please install the `transformers` package. "
                 "You can install it with `pip install transformers`."
             ) from e
 
-        print("fname_tokenizer:", fname_tokenizer)
         # Allow the tokenizer to default to slow or fast versions.
         # Explicitly set tokenizer to use local paths.
         self.tokenizer = AutoTokenizer.from_pretrained(
-            fname_tokenizer,
-            cache_dir=fname_tokenizer,
+            base_path,
+            cache_dir=base_path,
            local_files_only=True,
         )
+        assert self.tokenizer.is_fast  # assume tokenizer.json is used
 
         # Initialize lists and dictionaries for added tokens
         self.added_tokens_list = []
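
The renamed LlamaHfVocab applies the mirror-image pre-check before importing transformers: it only claims tokenizer.json files that use BPE with byte_fallback enabled and a Sequence decoder, which is what distinguishes a Llama-style HF tokenizer from the GPT-2-style one handled by BpeVocab. A standalone sketch of that condition (looks_like_llama_bpe is a hypothetical name, not part of the commit):

import json
from pathlib import Path
from typing import Any

def looks_like_llama_bpe(base_path: Path) -> bool:
    # Hypothetical helper restating LlamaHfVocab's pre-check: BPE model,
    # byte_fallback enabled, Sequence decoder. The BpeVocab check is the
    # inverse on byte_fallback and expects a ByteLevel decoder instead.
    fname_tokenizer = base_path / 'tokenizer.json'
    if not fname_tokenizer.exists():
        return False
    with open(fname_tokenizer, encoding='utf-8') as f:
        tokenizer_json = json.load(f)
    tokenizer_model: dict[str, Any] = tokenizer_json['model']
    return (
        tokenizer_model['type'] == 'BPE'
        and tokenizer_model.get('byte_fallback', False)
        and tokenizer_json['decoder']['type'] == 'Sequence'
    )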
@@ -594,7 +625,7 @@ class HfVocab(Vocab):
         yield from self.added_tokens()
 
     def __repr__(self) -> str:
-        return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
 #
@@ -1315,32 +1346,10 @@ def load_some_model(path: Path) -> ModelPlus:
 
 
 class VocabFactory:
-    _FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}
+    _VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab]
 
     def __init__(self, path: Path):
         self.path = path
-        self.file_paths = self._detect_files()
-        print(f"Found vocab files: {self.file_paths}")
-
-    def _detect_files(self) -> dict[str, Path | None]:
-        def locate(file: str) -> Path | None:
-            if (path := self.path / file).exists():
-                return path
-            if (path := self.path.parent / file).exists():
-                return path
-            return None
-
-        return {vt: locate(f) for vt, f in self._FILES.items()}
-
-    def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
-        for vtype in vocab_types:
-            try:
-                path = self.file_paths[vtype]
-            except KeyError:
-                raise ValueError(f"Unsupported vocabulary type {vtype}") from None
-            if path is not None:
-                return vtype, path
-        raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
 
     def _create_special_vocab(self, vocab: BaseVocab, model_parent_path: Path) -> gguf.SpecialVocab:
         load_merges = vocab.name == "bpe"
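
With _FILES, _detect_files and _select_file gone, the factory no longer maintains a parallel table of tokenizer file names; it derives the name-to-class mapping from each vocab class's own name attribute, so registering a new vocab type only means appending a class to _VOCAB_CLASSES. A toy sketch of that registry pattern (DummyVocab is invented purely for illustration):

# Toy illustration of the registry pattern, not code from the commit.
class DummyVocab:
    name = "dummy"

    def __init__(self, path):
        raise FileNotFoundError("no dummy tokenizer here")

_VOCAB_CLASSES = [DummyVocab]
vocab_classes = {cls.name: cls for cls in _VOCAB_CLASSES}
print(vocab_classes)  # {'dummy': <class '__main__.DummyVocab'>}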
@@ -1353,23 +1362,25 @@ class VocabFactory:
         )
 
     def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
-        vocab_type, path = self._select_file(vocab_types)
-        print(f"Loading vocab file {path!r}, type {vocab_type!r}")
-
-        added_tokens_path = path.parent / "added_tokens.json"
-        if vocab_type == "bpe":
-            return BpeVocab(
-                path, added_tokens_path if added_tokens_path.exists() else None
-            )
-        if vocab_type == "spm":
-            return SentencePieceVocab(
-                path, added_tokens_path if added_tokens_path.exists() else None
-            )
-        if vocab_type == "hfft":
-            return HfVocab(
-                path.parent, added_tokens_path if added_tokens_path.exists() else None
-            )
-        raise ValueError(vocab_type)
+        vocab_classes: dict[str, type[Vocab]] = {cls.name: cls for cls in self._VOCAB_CLASSES}
+        selected_vocabs: dict[str, type[Vocab]] = {}
+        for vtype in vocab_types:
+            try:
+                selected_vocabs[vtype] = vocab_classes[vtype]
+            except KeyError:
+                raise ValueError(f"Unsupported vocabulary type {vtype}") from None
+
+        for vtype, cls in selected_vocabs.items():
+            try:
+                vocab = cls(self.path)
+                break
+            except FileNotFoundError:
+                pass  # ignore unavailable tokenizers
+        else:
+            raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
+
+        print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
+        return vocab
 
     def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
         vocab: BaseVocab
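
_create_vocab_by_path now drives selection by construction: each requested class is tried against the model path in order, a constructor that cannot find its files raises FileNotFoundError, and the first one that succeeds wins. A standalone restatement of that loop under assumed names (pick_vocab and its parameters are hypothetical; the real method uses self.path and the classes above):

from pathlib import Path

def pick_vocab(path: Path, vocab_types: list[str], vocab_classes: dict[str, type]):
    # vocab_classes maps a name such as "spm", "bpe" or "hfft" to a class whose
    # constructor raises FileNotFoundError when its tokenizer files are absent.
    selected: dict[str, type] = {}
    for vtype in vocab_types:
        try:
            selected[vtype] = vocab_classes[vtype]
        except KeyError:
            raise ValueError(f"Unsupported vocabulary type {vtype}") from None

    for cls in selected.values():
        try:
            return cls(path)  # the first class that finds its files wins
        except FileNotFoundError:
            pass  # this tokenizer is not present; try the next type
    raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")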