diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 7e13e289a..29358eb12 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -29,6 +29,7 @@ class GGUFMetadataKeys:
         SOURCE_URL  = "general.source.url"
         SOURCE_REPO = "general.source.repository"
         FILE_TYPE   = "general.file_type"
+        ENDIANESS   = "general.endianess"

     class LLM:
         VOCAB_SIZE = "{arch}.vocab_size"
@@ -77,20 +78,20 @@ class GGUFMetadataKeys:
         TIME_STEP_RANK = "{arch}.ssm.time_step_rank"

     class Tokenizer:
-        MODEL = "tokenizer.model"  # STRING: e.g. llama, gpt2, etc...
-        TYPE = "tokenizer.type"  # STRING: BPE, SPM, WPM, etc.
-        NORM = "tokenizer.norm"  # OBJECT {"type": "ByteLevel", ...}
-        PRE = "tokenizer.pre"  # OBJECT {"type": "ByteLevel", ...}
-        ADDED = "tokenizer.added"  # ARRAY of OBJECTs: [{"id": 1, ...}, ...]
-        VOCAB = "tokenizer.vocab"  # ARRAY of STRINGs: ["[BOS]", ...]
-        MERGES = "tokenizer.merges"  # ARRAY of STRINGs: ["▁ t", ...]
-        TOKEN_TYPE = "tokenizer.token_type"  # ARRAY of INT [2, ...]
-        TOKEN_TYPE_COUNT = "tokenizer.token_type_count"  # BERT token types
-        SCORES = "tokenizer.scores"  # WPM only
+        MODEL            = "tokenizer.model"             # STRING: e.g. llama, gpt2, etc...
+        TYPE             = "tokenizer.type"              # STRING: BPE, SPM, WPM, etc.
+        NORM             = "tokenizer.norm"              # OBJECT {"type": "ByteLevel", ...}
+        PRE              = "tokenizer.pre"               # OBJECT {"type": "ByteLevel", ...}
+        ADDED            = "tokenizer.added"             # ARRAY of OBJECTs: [{"id": 1, ...}, ...]
+        VOCAB            = "tokenizer.vocab"             # ARRAY of STRINGs: ["[BOS]", ...]
+        MERGES           = "tokenizer.merges"            # ARRAY of STRINGs: ["▁ t", ...]
+        TOKEN_TYPE       = "tokenizer.token_type"        # ARRAY of INT [2, ...]
+        TOKEN_TYPE_COUNT = "tokenizer.token_type_count"  # BERT token types
+        SCORES           = "tokenizer.scores"            # WPM only
         BOS_ID = "tokenizer.bos_token_id"
         EOS_ID = "tokenizer.eos_token_id"
         UNK_ID = "tokenizer.unknown_token_id"
-        SEP_ID = "tokenizer.seperator_token_id"
+        SEP_ID = "tokenizer.separator_token_id"  # Fixed typo
         PAD_ID = "tokenizer.padding_token_id"
         CLS_ID = "tokenizer.cls_token_id"
         MASK_ID = "tokenizer.mask_token_id"
@@ -1038,6 +1039,19 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
 }


+#
+# Model File Types
+#
+class ModelFileExtension(Enum):
+    PT          = ".pt"            # torch
+    PTH         = ".pth"           # torch
+    BIN         = ".bin"           # torch
+    SAFETENSORS = ".safetensors"   # safetensors
+    JSON        = ".json"          # transformers/tokenizers
+    MODEL       = ".model"         # sentencepiece
+    GGUF        = ".gguf"          # ggml/llama.cpp
+
+
 #
 # Tokenizer Types
 #
@@ -1050,51 +1064,43 @@ class GGUFTokenType(IntEnum):
     BYTE = 6


-class GGUFTokenizerType(Enum):
+class HFTokenizerType(Enum):
     SPM = "SPM"  # SentencePiece LLaMa tokenizer
     BPE = "BPE"  # BytePair GPT-2 tokenizer
     WPM = "WPM"  # WordPiece BERT tokenizer


-#
-# Model File Types
-#
-class GGUFFileExtension(Enum):
-    PT          = ".pt"            # torch
-    PTH         = ".pth"           # torch
-    BIN         = ".bin"           # torch
-    SAFETENSORS = ".safetensors"   # safetensors
-    JSON        = ".json"          # transformers/tokenizers
-    MODEL       = ".model"         # sentencepiece
-    GGUF        = ".gguf"          # ggml/llama.cpp
-
-
 #
 # Normalizer Types
 #
-class GGUFNormalizerType(Enum):
+class HFNormalizerType(Enum):
     SEQUENCE = "Sequence"
-    NFC = "NFC"
-    NFD = "NFD"
-    NFKC = "NFKC"
-    NFKD = "NFKD"
+    NFC      = "NFC"
+    NFD      = "NFD"
+    NFKC     = "NFKC"
+    NFKD     = "NFKD"


 #
 # Pre-tokenizer Types
 #
-class GGUFPreTokenizerType(Enum):
-    WHITESPACE = "Whitespace"
-    METASPACE = "Metaspace"
-    BYTE_LEVEL = "ByteLevel"
+class HFPreTokenizerType(Enum):
+    WHITESPACE         = "Whitespace"
+    METASPACE          = "Metaspace"
+    BYTE_LEVEL         = "ByteLevel"
     BERT_PRE_TOKENIZER = "BertPreTokenizer"
-    SEQUENCE = "Sequence"
+    SEQUENCE           = "Sequence"


 #
 # HF Vocab Files
 #
-HF_TOKENIZER_BPE_FILES: tuple[str, ...] = ("config.json", "tokenizer_config.json", "tokenizer.json",)
+HF_TOKENIZER_BPE_FILES = (
+    "config.json",
+    "tokenizer_config.json",
+    "tokenizer.json",
+)
+

 HF_TOKENIZER_SPM_FILES: tuple[str, ...] = HF_TOKENIZER_BPE_FILES + ("tokenizer.model",)

 #
@@ -1123,6 +1129,7 @@ KEY_GENERAL_LICENSE     = GGUFMetadataKeys.General.LICENSE
 KEY_GENERAL_SOURCE_URL  = GGUFMetadataKeys.General.SOURCE_URL
 KEY_GENERAL_SOURCE_REPO = GGUFMetadataKeys.General.SOURCE_REPO
 KEY_GENERAL_FILE_TYPE   = GGUFMetadataKeys.General.FILE_TYPE
+KEY_GENERAL_ENDIANESS   = GGUFMetadataKeys.General.ENDIANESS

 # LLM
 KEY_VOCAB_SIZE = GGUFMetadataKeys.LLM.VOCAB_SIZE
diff --git a/gguf-py/gguf/huggingface_hub.py b/gguf-py/gguf/huggingface_hub.py
index 64ddd0346..5f72e183a 100644
--- a/gguf-py/gguf/huggingface_hub.py
+++ b/gguf-py/gguf/huggingface_hub.py
@@ -3,24 +3,20 @@ import logging
 import os
 import pathlib
 from hashlib import sha256
+from typing import Protocol

 import requests
-from huggingface_hub import login, model_info
 from sentencepiece import SentencePieceProcessor
+from tqdm import tqdm

-from .constants import (
-    MODEL_TOKENIZER_BPE_FILES,
-    MODEL_TOKENIZER_SPM_FILES,
-    ModelFileExtension,
-    ModelNormalizerType,
-    ModelPreTokenizerType,
-    ModelTokenizerType,
-)
+from .constants import HF_TOKENIZER_SPM_FILES


-class HFHubBase:
+class HFHubBase(Protocol):
     def __init__(
-        self, model_path: None | str | pathlib.Path, logger: None | logging.Logger
+        self,
+        model_path: None | str | pathlib.Path,
+        logger: None | logging.Logger,
     ):
         # Set the model path
         if model_path is None:
@@ -43,7 +39,7 @@ class HFHubBase:
     def write_file(self, content: bytes, file_path: pathlib.Path) -> None:
         with open(file_path, "wb") as file:
             file.write(content)
-        self.logger.info(f"Wrote {len(content)} bytes to {file_path} successfully")
+        self.logger.debug(f"Wrote {len(content)} bytes to {file_path} successfully")


 class HFHubRequest(HFHubBase):
@@ -59,6 +55,11 @@ class HFHubRequest(HFHubBase):
         if auth_token is None:
             self._headers = None
         else:
+            # headers = {
+            #     "Authorization": f"Bearer {auth_token}",
+            #     "securityStatus": True,
+            #     "blobs": True,
+            # }
             self._headers = {"Authorization": f"Bearer {auth_token}"}

         # Persist across requests
@@ -67,11 +68,12 @@ class HFHubRequest(HFHubBase):
         # This is read-only
         self._base_url = "https://huggingface.co"

-        # NOTE: Required for getting model_info
-        login(auth_token, add_to_git_credential=True)
+        # NOTE: Cache repeat calls
+        self._model_repo = None
+        self._model_files = None

     @property
-    def headers(self) -> str:
+    def headers(self) -> None | dict[str, str]:
         return self._headers

     @property
@@ -82,34 +84,79 @@ class HFHubRequest(HFHubBase):
     def base_url(self) -> str:
         return self._base_url

-    @staticmethod
-    def list_remote_files(model_repo: str) -> list[str]:
-        # NOTE: Request repository metadata to extract remote filenames
-        return [x.rfilename for x in model_info(model_repo).siblings]
-
-    def list_filtered_remote_files(
-        self, model_repo: str, file_extension: ModelFileExtension
-    ) -> list[str]:
-        model_files = []
-        self.logger.info(f"Repo:{model_repo}")
-        self.logger.debug(f"FileExtension:{file_extension.value}")
-        for filename in HFHubRequest.list_remote_files(model_repo):
-            suffix = pathlib.Path(filename).suffix
-            self.logger.debug(f"Suffix: {suffix}")
-            if suffix == file_extension.value:
-                self.logger.info(f"File: {filename}")
-                model_files.append(filename)
-        return model_files
-
     def resolve_url(self, repo: str, filename: str) -> str:
         return f"{self._base_url}/{repo}/resolve/main/{filename}"
f"{self._base_url}/{repo}/resolve/main/{filename}" def get_response(self, url: str) -> requests.Response: + # TODO: Stream requests and use tqdm to output the progress live response = self._session.get(url, headers=self.headers) - self.logger.info(f"Response status was {response.status_code}") + self.logger.debug(f"Response status was {response.status_code}") response.raise_for_status() return response + def model_info(self, model_repo: str) -> dict[str, object]: + url = f"{self._base_url}/api/models/{model_repo}" + return self.get_response(url).json() + + def list_remote_files(self, model_repo: str) -> list[str]: + # NOTE: Reset the cache if the repo changed + if self._model_repo != model_repo: + self._model_repo = model_repo + self._model_files = [] + for f in self.model_info(self._model_repo)["siblings"]: + self._model_files.append(f["rfilename"]) + dump = json.dumps(self._model_files, indent=4) + self.logger.debug(f"Cached remote files: {dump}") + # Return the cached file listing + return self._model_files + + def list_filtered_remote_files( + self, model_repo: str, file_suffix: str + ) -> list[str]: + model_files = [] + self.logger.debug(f"Model Repo:{model_repo}") + self.logger.debug(f"File Suffix:{file_suffix}") + # NOTE: Valuable files are typically in the root path + for filename in self.list_remote_files(model_repo): + path = pathlib.Path(filename) + if len(path.parents) > 1: + continue # skip nested paths + self.logger.debug(f"Path Suffix: {path.suffix}") + if path.suffix == file_suffix: + self.logger.debug(f"File Name: {filename}") + model_files.append(filename) + return model_files + + def list_remote_safetensors(self, model_repo: str) -> list[str]: + # NOTE: HuggingFace recommends using safetensors to mitigate pickled injections + return [ + part + for part in self.list_filtered_remote_files(model_repo, ".safetensors") + if part.startswith("model") + ] + + def list_remote_bin(self, model_repo: str) -> list[str]: + # NOTE: HuggingFace is streamlining PyTorch models with the ".bin" extension + return [ + part + for part in self.list_filtered_remote_files(model_repo, ".bin") + if part.startswith("pytorch_model") + ] + + def list_remote_weights(self, model_repo: str) -> list[str]: + model_parts = self.list_remote_safetensors(model_repo) + if not model_parts: + model_parts = self.list_remote_bin(model_repo) + self.logger.debug(f"Remote model parts: {model_parts}") + return model_parts + + def list_remote_tokenizers(self, model_repo: str) -> list[str]: + return [ + tok + for tok in self.list_remote_files(model_repo) + if tok in HF_TOKENIZER_SPM_FILES + ] + class HFHubTokenizer(HFHubBase): def __init__( @@ -118,11 +165,8 @@ class HFHubTokenizer(HFHubBase): super().__init__(model_path, logger) @staticmethod - def list_vocab_files(vocab_type: ModelTokenizerType) -> tuple[str, ...]: - if vocab_type == ModelTokenizerType.SPM.value: - return MODEL_TOKENIZER_SPM_FILES - # NOTE: WPM and BPE are equivalent - return MODEL_TOKENIZER_BPE_FILES + def list_vocab_files() -> tuple[str, ...]: + return HF_TOKENIZER_SPM_FILES def model(self, model_repo: str) -> SentencePieceProcessor: path = self.model_path / model_repo / "tokenizer.model" @@ -216,58 +260,62 @@ class HFHubModel(HFHubBase): def _request_single_file( self, model_repo: str, file_name: str, file_path: pathlib.Path - ) -> bool: - # NOTE: Consider optional `force` parameter if files need to be updated. - # e.g. The model creator updated the vocabulary to resolve an issue or add a feature. 
-        if file_path.exists():
-            self.logger.info(f"skipped - downloaded {file_path} exists already.")
-            return False
-
+    ) -> None:
         # NOTE: Do not use bare exceptions! They mask issues!
         # Allow the exception to occur or explicitly handle it.
         try:
-            self.logger.info(f"Downloading '{file_name}' from {model_repo}")
             resolved_url = self.request.resolve_url(model_repo, file_name)
             response = self.request.get_response(resolved_url)
             self.write_file(response.content, file_path)
-            self.logger.info(f"Model file successfully saved to {file_path}")
-            return True
         except requests.exceptions.HTTPError as e:
-            self.logger.error(f"Error while downloading '{file_name}': {str(e)}")
-            return False
+            self.logger.debug(f"Error while downloading '{file_name}': {str(e)}")

-    def _request_listed_files(self, model_repo: str, remote_files: list[str]) -> None:
-        for file_name in remote_files:
+    def _request_listed_files(
+        self, model_repo: str, remote_files: list[str]
+    ) -> None:
+        for file_name in tqdm(remote_files, total=len(remote_files)):
             dir_path = self.model_path / model_repo
             os.makedirs(dir_path, exist_ok=True)
-            self._request_single_file(model_repo, file_name, dir_path / file_name)
+
+            # NOTE: Consider optional `force` parameter if files need to be updated.
+            # e.g. The model creator updated the vocabulary to resolve an issue or add a feature.
+            file_path = dir_path / file_name
+            if file_path.exists():
+                self.logger.debug(f"skipped - downloaded {file_path} exists already.")
+                continue  # skip existing files
+
+            self.logger.debug(f"Downloading '{file_name}' from {model_repo}")
+            self._request_single_file(model_repo, file_name, file_path)
+            self.logger.debug(f"Model file successfully saved to {file_path}")

     def config(self, model_repo: str) -> dict[str, object]:
         path = self.model_path / model_repo / "config.json"
         return json.loads(path.read_text(encoding="utf-8"))

     def architecture(self, model_repo: str) -> str:
-        config = self.config(model_repo)
         # NOTE: Allow IndexError to be raised because something unexpected happened.
         # The general assumption is there is only a single architecture, but
         # merged models may have multiple architecture types. This means this method
         # call is not guaranteed.
-        return config.get("architectures", [])[0]
+        try:
+            return self.config(model_repo).get("architectures", [])[0]
+        except IndexError:
+            self.logger.debug(f"Failed to get {model_repo} architecture")
+            return str()

-    def download_model_files(
-        self, model_repo: str, file_extension: ModelFileExtension
-    ) -> None:
-        filtered_files = self.request.list_filtered_remote_files(
-            model_repo, file_extension
-        )
-        self._request_listed_files(model_repo, filtered_files)
+    def download_model_weights(self, model_repo: str) -> None:
+        remote_files = self.request.list_remote_weights(model_repo)
+        self._request_listed_files(model_repo, remote_files)

-    def download_all_vocab_files(
-        self, model_repo: str, vocab_type: ModelTokenizerType
-    ) -> None:
-        vocab_files = self.tokenizer.list_vocab_files(vocab_type)
-        self._request_listed_files(model_repo, vocab_files)
+    def download_model_tokenizers(self, model_repo: str) -> None:
+        remote_files = self.request.list_remote_tokenizers(model_repo)
+        self._request_listed_files(model_repo, remote_files)

-    def download_all_model_files(self, model_repo: str) -> None:
+    def download_model_weights_and_tokenizers(self, model_repo: str) -> None:
+        # attempt by priority
+        self.download_model_weights(model_repo)
+        self.download_model_tokenizers(model_repo)
+
+    def download_all_repository_files(self, model_repo: str) -> None:
         all_files = self.request.list_remote_files(model_repo)
         self._request_listed_files(model_repo, all_files)
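
For reviewers, here is a minimal, self-contained sketch of the listing flow the new HFHubRequest helpers implement (prefer model*.safetensors shards, fall back to pytorch_model*.bin). It talks to the same https://huggingface.co/api/models/{repo} endpoint directly rather than constructing the classes, since their full constructors are not part of this diff; the repository name at the bottom is only an illustrative example.

import pathlib

import requests


def list_remote_files(model_repo: str) -> list[str]:
    # Same repository-metadata endpoint used by HFHubRequest.model_info()
    url = f"https://huggingface.co/api/models/{model_repo}"
    response = requests.get(url)
    response.raise_for_status()
    return [sibling["rfilename"] for sibling in response.json()["siblings"]]


def list_remote_weights(model_repo: str) -> list[str]:
    # Mirrors HFHubRequest.list_remote_weights(): safetensors first, then .bin
    files = list_remote_files(model_repo)
    safetensors = [
        f for f in files
        if pathlib.Path(f).suffix == ".safetensors" and f.startswith("model")
    ]
    if safetensors:
        return safetensors
    return [
        f for f in files
        if pathlib.Path(f).suffix == ".bin" and f.startswith("pytorch_model")
    ]


if __name__ == "__main__":
    # Example repository; substitute any Hugging Face model repo
    print(list_remote_weights("TinyLlama/TinyLlama-1.1B-Chat-v1.0"))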