refactor: Add tokenizer path, add methods for extracting vocab metadata, fix checksum method name

2024-05-23 21:40:05 -04:00 · 2024-05-23 21:40:05 -04:00 · 77bc7394c8
commit 77bc7394c8
parent b4b553fe6c
1 changed file with 13 additions and 5 deletions
--- a/gguf-py/gguf/huggingface_hub.py
+++ b/gguf-py/gguf/huggingface_hub.py
@@ -116,6 +116,10 @@ class HFVocabRequest(HFHubBase):
    def tokenizer_type(self) -> VocabType:
        return VocabType

+    @property
+    def tokenizer_path(self) -> pathlib.Path:
+        return self.model_path / "tokenizer.json"
+
    def get_vocab_name(self, vocab_type: VocabType) -> str:
        return VOCAB_TYPE_NAMES.get(vocab_type)

@@ -147,13 +151,17 @@ class HFVocabRequest(HFHubBase):
        for vocab_file in vocab_list:
            self.get_vocab_file(model_repo, vocab_file, self.model_path)

-    def extract_normalizer(self) -> dict[str, object]:
-        pass
+    def get_normalizer(self) -> None | dict[str, object]:
+        with open(self.tokenizer_path, mode="r") as file:
+            tokenizer_json = json.load(file)
+        return tokenizer_json.get("normalizer")

-    def extract_pre_tokenizers(self) -> dict[str, object]:
-        pass
+    def get_pre_tokenizer(self) -> None | dict[str, object]:
+        with open(self.tokenizer_path, mode="r") as file:
+            tokenizer_json = json.load(file)
+        return tokenizer_json.get("pre_tokenizer")

-    def generate_checksums(self) -> None:
+    def generate_checksum(self) -> None:
        checksums = []
        for model in self.models:
            mapping = {}