From 302258721bc5bfbf27bc47d6f539fc913be5424d Mon Sep 17 00:00:00 2001 From: teleprint-me <77757836+teleprint-me@users.noreply.github.com> Date: Sat, 18 May 2024 01:26:39 -0400 Subject: [PATCH] refactor: Apply model schema to tokenizer downloads - Add imports for json and hashlib - Add missing models: phi, stablelm, mistral, and mixtral - Fix constructor logic - Fix how models are accessed - Apply model schema to download_model method --- gguf-py/gguf/huggingface_hub.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/gguf-py/gguf/huggingface_hub.py b/gguf-py/gguf/huggingface_hub.py index fb7368b56..86b8312c0 100644 --- a/gguf-py/gguf/huggingface_hub.py +++ b/gguf-py/gguf/huggingface_hub.py @@ -1,7 +1,9 @@ +import json import logging import os import pathlib from enum import IntEnum, auto +from hashlib import sha256 import requests from transformers import AutoTokenizer @@ -42,6 +44,10 @@ MODELS = ( {"tokt": TokenizerType.WPM, "repo": "jinaai/jina-embeddings-v2-base-en", }, # WPM! {"tokt": TokenizerType.BPE, "repo": "jinaai/jina-embeddings-v2-base-es", }, {"tokt": TokenizerType.BPE, "repo": "jinaai/jina-embeddings-v2-base-de", }, + {"tokt": TokenizerType.BPE, "repo": "microsoft/phi-1", }, + {"tokt": TokenizerType.BPE, "repo": "stabilityai/stablelm-2-zephyr-1_6b", }, + {"tokt": TokenizerType.SPM, "repo": "mistralai/Mistral-7B-Instruct-v0.2", }, + {"tokt": TokenizerType.SPM, "repo": "mistralai/Mixtral-8x7B-Instruct-v0.1", }, ) @@ -103,10 +109,10 @@ class HFTokenizerRequest: auth_token: str, logger: None | logging.Logger ): - self._hub = HuggingFaceHub(auth_token, logger) - if dl_path is None: self._local_path = pathlib.Path("models/tokenizers") + elif isinstance(dl_path, str): + self._local_path = pathlib.Path(dl_path) else: self._local_path = dl_path @@ -116,13 +122,16 @@ class HFTokenizerRequest: logger = logging.getLogger("hf-tok-req") self.logger = logger + self._hub = HuggingFaceHub(auth_token, logger) + self._models = list(MODELS) + @property def hub(self) -> HuggingFaceHub: return self._hub @property def models(self) -> list[dict[str, object]]: - return MODEL_REPOS + return self._models @property def tokenizer_type(self) -> TokenizerType: @@ -157,12 +166,12 @@ class HFTokenizerRequest: def download_model(self) -> None: for model in self.models: - os.makedirs(f"{self.local_path}/{model['name']}", exist_ok=True) + os.makedirs(f"{self.local_path}/{model['repo']}", exist_ok=True) filenames = self.resolve_filenames(model['tokt']) for filename in filenames: - filepath = pathlib.Path(f"{self.local_path}/{model['name']}/{filename}") + filepath = pathlib.Path(f"{self.local_path}/{model['repo']}/{filename}") if filepath.is_file(): - self.logger.info(f"skipped pre-existing tokenizer {model['name']} at {filepath}") + self.logger.info(f"skipped pre-existing tokenizer {model['repo']} in {filepath}") continue self.resolve_tokenizer_model(filename, filepath, model)