diff --git a/gguf-py/scripts/gguf-registry.py b/gguf-py/scripts/gguf-registry.py index df602af30..44bc3b79e 100644 --- a/gguf-py/scripts/gguf-registry.py +++ b/gguf-py/scripts/gguf-registry.py @@ -13,17 +13,17 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / sys.path.insert(0, str(Path(__file__).parent.parent)) from gguf.constants import ( - GPT_PRE_TOKENIZER_DEFAULT, HF_TOKENIZER_BPE_FILES, HF_TOKENIZER_SPM_FILES, MODEL_ARCH, MODEL_ARCH_NAMES, - ModelFileType, + ModelFileExtension, + PreTokenizerType, VocabType, ) from gguf.huggingface_hub import HFHubModel, HFHubTokenizer -logger = logging.getLogger("gguf-gen-pre") +logger = logging.getLogger(__file__) # # HuggingFace Model Map @@ -50,8 +50,8 @@ HF_MODEL_MAP = ( "model_repo": "meta-llama/Llama-2-7b-hf", "model_arch": MODEL_ARCH.LLAMA, "model_parts": 2, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.SPM, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.SPM.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_SPM_FILES, }, @@ -59,8 +59,8 @@ HF_MODEL_MAP = ( "model_repo": "mistralai/Mistral-7B-Instruct-v0.1", "model_arch": MODEL_ARCH.LLAMA, "model_parts": 2, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.SPM, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.SPM.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_SPM_FILES, }, @@ -68,8 +68,8 @@ HF_MODEL_MAP = ( "model_repo": "mistralai/Mistral-7B-Instruct-v0.2", "model_arch": MODEL_ARCH.LLAMA, "model_parts": 3, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.SPM, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.SPM.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_SPM_FILES, }, @@ -77,8 +77,8 @@ HF_MODEL_MAP = ( "model_repo": "mistralai/Mistral-7B-Instruct-v0.3", "model_arch": MODEL_ARCH.LLAMA, "model_parts": 3, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": 
VocabType.SPM, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.SPM.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_SPM_FILES, }, @@ -86,8 +86,8 @@ HF_MODEL_MAP = ( "model_repo": "mistralai/Mixtral-8x7B-Instruct-v0.1", "model_arch": MODEL_ARCH.LLAMA, "model_parts": 8, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.SPM, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.SPM.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_SPM_FILES, }, @@ -95,8 +95,8 @@ HF_MODEL_MAP = ( "model_repo": "microsoft/Phi-3-mini-4k-instruct", "model_arch": MODEL_ARCH.PHI3, "model_parts": 2, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.SPM, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.SPM.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_SPM_FILES, }, @@ -106,8 +106,8 @@ HF_MODEL_MAP = ( "model_repo": "BAAI/bge-small-en-v1.5", "model_arch": MODEL_ARCH.BERT, "model_parts": 1, - "model_type": ModelFileType.BIN, - "vocab_type": VocabType.WPM, + "model_type": ModelFileExtension.BIN.value, + "vocab_type": VocabType.WPM.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -115,8 +115,8 @@ HF_MODEL_MAP = ( "model_repo": "jinaai/jina-embeddings-v2-base-en", "model_arch": MODEL_ARCH.JINA_BERT_V2, "model_parts": 1, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.WPM, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.WPM.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -125,8 +125,8 @@ HF_MODEL_MAP = ( "model_repo": "meta-llama/Meta-Llama-3-8B", "model_arch": MODEL_ARCH.LLAMA, "model_parts": 4, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.BPE, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.BPE.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -134,8 +134,8 @@ HF_MODEL_MAP = ( 
"model_repo": "tiiuae/falcon-7b", "model_arch": MODEL_ARCH.FALCON, "model_parts": 2, - "model_type": ModelFileType.BIN, - "vocab_type": VocabType.BPE, + "model_type": ModelFileExtension.BIN.value, + "vocab_type": VocabType.BPE.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -143,8 +143,8 @@ HF_MODEL_MAP = ( "model_repo": "deepseek-ai/deepseek-llm-7b-base", "model_arch": MODEL_ARCH.LLAMA, "model_parts": 2, - "model_type": ModelFileType.BIN, - "vocab_type": VocabType.BPE, + "model_type": ModelFileExtension.BIN.value, + "vocab_type": VocabType.BPE.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -152,8 +152,8 @@ HF_MODEL_MAP = ( "model_repo": "deepseek-ai/deepseek-coder-6.7b-base", "model_arch": MODEL_ARCH.LLAMA, "model_parts": 2, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.BPE, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.BPE.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -161,8 +161,8 @@ HF_MODEL_MAP = ( "model_repo": "mosaicml/mpt-7b", "model_arch": MODEL_ARCH.MPT, "model_parts": 2, - "model_type": ModelFileType.BIN, - "vocab_type": VocabType.BPE, + "model_type": ModelFileExtension.BIN.value, + "vocab_type": VocabType.BPE.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -173,8 +173,8 @@ HF_MODEL_MAP = ( "model_repo": "bigcode/starcoder2-3b", "model_arch": MODEL_ARCH.STARCODER2, "model_parts": 1, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.BPE, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.BPE.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -182,8 +182,8 @@ HF_MODEL_MAP = ( "model_repo": "smallcloudai/Refact-1_6-base", "model_arch": MODEL_ARCH.REFACT, "model_parts": 1, - "model_type": ModelFileType.BIN, - "vocab_type": VocabType.BPE, + "model_type": ModelFileExtension.BIN.value, + "vocab_type": VocabType.BPE.value, "vocab_pre": None, 
"vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -191,8 +191,8 @@ HF_MODEL_MAP = ( "model_repo": "CohereForAI/c4ai-command-r-v01", "model_arch": MODEL_ARCH.COMMAND_R, "model_parts": 15, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.BPE, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.BPE.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -203,8 +203,8 @@ HF_MODEL_MAP = ( "model_repo": "Qwen/Qwen1.5-7B", "model_arch": MODEL_ARCH.QWEN2, "model_parts": 4, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.BPE, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.BPE.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -212,8 +212,8 @@ HF_MODEL_MAP = ( "model_repo": "stabilityai/stablelm-2-zephyr-1_6b", "model_arch": MODEL_ARCH.STABLELM, "model_parts": 1, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.BPE, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.BPE.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -224,8 +224,8 @@ HF_MODEL_MAP = ( "model_repo": "openai-community/gpt2", "model_arch": MODEL_ARCH.GPT2, "model_parts": 1, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.BPE, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.BPE.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -233,8 +233,8 @@ HF_MODEL_MAP = ( "model_repo": "allenai/OLMo-1.7-7B-hf", "model_arch": MODEL_ARCH.OLMO, "model_parts": 6, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.BPE, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.BPE.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -242,8 +242,8 @@ HF_MODEL_MAP = ( # "model_repo": "databricks/dbrx-base", # "model_arch": MODEL_ARCH.DBRX, # "model_parts": 0, - # "model_type": ModelFileType.SAFETENSORS, 
- # "vocab_type": VocabType.BPE, + # "model_type": ModelFileExtension.SAFETENSORS.value, + # "vocab_type": VocabType.BPE.value, # "vocab_pre": None, # "vocab_files": HF_TOKENIZER_BPE_FILES, # }, @@ -251,8 +251,8 @@ HF_MODEL_MAP = ( "model_repo": "jinaai/jina-embeddings-v2-base-es", "model_arch": MODEL_ARCH.JINA_BERT_V2, "model_parts": 1, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.BPE, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.BPE.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -260,8 +260,8 @@ HF_MODEL_MAP = ( "model_repo": "jinaai/jina-embeddings-v2-base-de", "model_arch": MODEL_ARCH.JINA_BERT_V2, "model_parts": 1, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.BPE, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.BPE.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -269,8 +269,8 @@ HF_MODEL_MAP = ( "model_repo": "microsoft/phi-1", "model_arch": MODEL_ARCH.PHI2, "model_parts": 1, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.BPE, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.BPE.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -278,8 +278,8 @@ HF_MODEL_MAP = ( "model_repo": "microsoft/phi-1_5", "model_arch": MODEL_ARCH.PHI2, "model_parts": 1, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.BPE, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.BPE.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -287,8 +287,8 @@ HF_MODEL_MAP = ( "model_repo": "microsoft/phi-2", "model_arch": MODEL_ARCH.PHI2, "model_parts": 2, - "model_type": ModelFileType.SAFETENSORS, - "vocab_type": VocabType.BPE, + "model_type": ModelFileExtension.SAFETENSORS.value, + "vocab_type": VocabType.BPE.value, "vocab_pre": None, "vocab_files": HF_TOKENIZER_BPE_FILES, }, @@ -334,10 +334,7 @@ for 
model in HF_MODEL_MAP: model_arch = model["model_arch"] vocab_type = model["vocab_type"] - print( - "HUB_REPO:", model_repo, - "LLAMA_ARCH:", MODEL_ARCH_NAMES[model_arch] - ) + print("HUB_REPO:", model_repo, "LLAMA_ARCH:", MODEL_ARCH_NAMES[model_arch]) hub_model.download_all_vocab_files( model_repo=model_repo, @@ -346,24 +343,28 @@ for model in HF_MODEL_MAP: # log the downloaded results hub_tokenizer.log_tokenizer_json_info(model_repo) - model['model_arch'] = MODEL_ARCH_NAMES[model_arch] - model['vocab_type'] = hub_tokenizer.get_vocab_name(vocab_type) + model["model_arch"] = MODEL_ARCH_NAMES[model_arch] normalizer = hub_tokenizer.get_normalizer(model_repo) - # extract the normalizer metadata - model['normalizer'] = normalizer + # NOTE: Normalizer may be one of null, Sequence, NFC, NFD, NFKC, NFKD... + # Seems to be null, Sequence, or NFC in most cases + # Default to NFD + # TODO: Extract the normalizer metadata + model["normalizer"] = normalizer + # Seems safe to assume most basic types are of type "Sequence" + # I expect this to cause issues in the future. Needs more research. pre_tokenizer = hub_tokenizer.get_pre_tokenizer(model_repo) - # extract the pre-tokenizer metadata - model['pre_tokenizer'] = pre_tokenizer + # extract the pre-tokenizer metadata + model["pre_tokenizer"] = pre_tokenizer added_tokens = hub_tokenizer.get_added_tokens(model_repo) # extract the added tokens metadata - model['added_tokens'] = added_tokens + model["added_tokens"] = added_tokens sha256sum = hub_tokenizer.get_tokenizer_json_hash(model_repo) # use the hash to validate the models vocabulary - model['vocab_hash'] = sha256sum + model["vocab_hash"] = sha256sum metadata.append(model)