From 77bc7394c8879396a39f5fd36cac8b55be21b29f Mon Sep 17 00:00:00 2001
From: teleprint-me <77757836+teleprint-me@users.noreply.github.com>
Date: Thu, 23 May 2024 21:40:05 -0400
Subject: [PATCH] refactor: Add tokenizer path, add methods for extracting
 vocab metadata, fix checksum method name

Add a HFVocabRequest.tokenizer_path property that resolves to
"tokenizer.json" under model_path.

Replace the extract_normalizer / extract_pre_tokenizers stubs with
get_normalizer / get_pre_tokenizer. Each loads tokenizer.json and returns
its "normalizer" / "pre_tokenizer" entry, or None when the key is absent.
The file is opened with an explicit UTF-8 encoding so that parsing does not
depend on the platform's locale default.

Rename generate_checksums to generate_checksum.
---
NOTE(review): the line structure of this patch was restored from a
whitespace-collapsed copy. Indentation of the context lines was
reconstructed from Python syntax; confirm it against the target file, or
apply with `git apply --ignore-whitespace`.

 gguf-py/gguf/huggingface_hub.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/gguf-py/gguf/huggingface_hub.py b/gguf-py/gguf/huggingface_hub.py
index 8e0327e67..c47e26344 100644
--- a/gguf-py/gguf/huggingface_hub.py
+++ b/gguf-py/gguf/huggingface_hub.py
@@ -116,6 +116,10 @@ class HFVocabRequest(HFHubBase):
     def tokenizer_type(self) -> VocabType:
         return VocabType
 
+    @property
+    def tokenizer_path(self) -> pathlib.Path:
+        return self.model_path / "tokenizer.json"
+
     def get_vocab_name(self, vocab_type: VocabType) -> str:
         return VOCAB_TYPE_NAMES.get(vocab_type)
 
@@ -147,13 +151,17 @@ class HFVocabRequest(HFHubBase):
         for vocab_file in vocab_list:
             self.get_vocab_file(model_repo, vocab_file, self.model_path)
 
-    def extract_normalizer(self) -> dict[str, object]:
-        pass
+    def get_normalizer(self) -> None | dict[str, object]:
+        with open(self.tokenizer_path, mode="r", encoding="utf-8") as file:
+            tokenizer_json = json.load(file)
+        return tokenizer_json.get("normalizer")
 
-    def extract_pre_tokenizers(self) -> dict[str, object]:
-        pass
+    def get_pre_tokenizer(self) -> None | dict[str, object]:
+        with open(self.tokenizer_path, mode="r", encoding="utf-8") as file:
+            tokenizer_json = json.load(file)
+        return tokenizer_json.get("pre_tokenizer")
 
-    def generate_checksums(self) -> None:
+    def generate_checksum(self) -> None:
         checksums = []
         for model in self.models:
             mapping = {}