refactor: Apply updates to example script for generating the registry
commit 5c92809397
parent f1d067e7a6
1 changed file with 68 additions and 67 deletions
@@ -13,17 +13,17 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent /
     sys.path.insert(0, str(Path(__file__).parent.parent))
 
 from gguf.constants import (
-    GPT_PRE_TOKENIZER_DEFAULT,
     HF_TOKENIZER_BPE_FILES,
     HF_TOKENIZER_SPM_FILES,
     MODEL_ARCH,
     MODEL_ARCH_NAMES,
-    ModelFileType,
+    ModelFileExtension,
+    PreTokenizerType,
     VocabType,
 )
 from gguf.huggingface_hub import HFHubModel, HFHubTokenizer
 
-logger = logging.getLogger("gguf-gen-pre")
+logger = logging.getLogger(__file__)
 
 #
 # HuggingFace Model Map
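The recurring change in the hunks below swaps raw enum members (ModelFileType.SAFETENSORS, VocabType.SPM) for their .value payloads. A plausible motivation is that the registry entries must be JSON-serializable: plain enum.Enum members raise TypeError under json.dumps, while their plain-string values do not. A minimal sketch of the pattern, using hypothetical stand-ins for the real gguf.constants definitions (which may name their members and values differently):

    import json
    from enum import Enum

    # Hypothetical stand-ins; the real ModelFileExtension and VocabType
    # live in gguf.constants and may differ from these definitions.
    class ModelFileExtension(Enum):
        SAFETENSORS = ".safetensors"
        BIN = ".bin"

    class VocabType(Enum):
        SPM = "SPM"
        BPE = "BPE"
        WPM = "WPM"

    entry = {
        "model_repo": "meta-llama/Llama-2-7b-hf",
        "model_type": ModelFileExtension.SAFETENSORS.value,  # plain str, JSON-safe
        "vocab_type": VocabType.SPM.value,                   # plain str, JSON-safe
    }

    # Serializes cleanly; passing the raw Enum members instead of .value
    # would raise TypeError here.
    print(json.dumps(entry))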
@@ -50,8 +50,8 @@ HF_MODEL_MAP = (
         "model_repo": "meta-llama/Llama-2-7b-hf",
         "model_arch": MODEL_ARCH.LLAMA,
         "model_parts": 2,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.SPM,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.SPM.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_SPM_FILES,
     },
@@ -59,8 +59,8 @@ HF_MODEL_MAP = (
         "model_repo": "mistralai/Mistral-7B-Instruct-v0.1",
         "model_arch": MODEL_ARCH.LLAMA,
         "model_parts": 2,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.SPM,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.SPM.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_SPM_FILES,
     },
@@ -68,8 +68,8 @@ HF_MODEL_MAP = (
         "model_repo": "mistralai/Mistral-7B-Instruct-v0.2",
         "model_arch": MODEL_ARCH.LLAMA,
         "model_parts": 3,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.SPM,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.SPM.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_SPM_FILES,
     },
@@ -77,8 +77,8 @@ HF_MODEL_MAP = (
         "model_repo": "mistralai/Mistral-7B-Instruct-v0.3",
         "model_arch": MODEL_ARCH.LLAMA,
         "model_parts": 3,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.SPM,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.SPM.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_SPM_FILES,
     },
@@ -86,8 +86,8 @@ HF_MODEL_MAP = (
         "model_repo": "mistralai/Mixtral-8x7B-Instruct-v0.1",
         "model_arch": MODEL_ARCH.LLAMA,
         "model_parts": 8,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.SPM,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.SPM.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_SPM_FILES,
     },
@@ -95,8 +95,8 @@ HF_MODEL_MAP = (
         "model_repo": "microsoft/Phi-3-mini-4k-instruct",
         "model_arch": MODEL_ARCH.PHI3,
         "model_parts": 2,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.SPM,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.SPM.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_SPM_FILES,
     },
@@ -106,8 +106,8 @@ HF_MODEL_MAP = (
         "model_repo": "BAAI/bge-small-en-v1.5",
         "model_arch": MODEL_ARCH.BERT,
         "model_parts": 1,
-        "model_type": ModelFileType.BIN,
-        "vocab_type": VocabType.WPM,
+        "model_type": ModelFileExtension.BIN.value,
+        "vocab_type": VocabType.WPM.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -115,8 +115,8 @@ HF_MODEL_MAP = (
         "model_repo": "jinaai/jina-embeddings-v2-base-en",
         "model_arch": MODEL_ARCH.JINA_BERT_V2,
         "model_parts": 1,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.WPM,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.WPM.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -125,8 +125,8 @@ HF_MODEL_MAP = (
         "model_repo": "meta-llama/Meta-Llama-3-8B",
         "model_arch": MODEL_ARCH.LLAMA,
         "model_parts": 4,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.BPE,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.BPE.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -134,8 +134,8 @@ HF_MODEL_MAP = (
         "model_repo": "tiiuae/falcon-7b",
         "model_arch": MODEL_ARCH.FALCON,
         "model_parts": 2,
-        "model_type": ModelFileType.BIN,
-        "vocab_type": VocabType.BPE,
+        "model_type": ModelFileExtension.BIN.value,
+        "vocab_type": VocabType.BPE.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -143,8 +143,8 @@ HF_MODEL_MAP = (
         "model_repo": "deepseek-ai/deepseek-llm-7b-base",
         "model_arch": MODEL_ARCH.LLAMA,
         "model_parts": 2,
-        "model_type": ModelFileType.BIN,
-        "vocab_type": VocabType.BPE,
+        "model_type": ModelFileExtension.BIN.value,
+        "vocab_type": VocabType.BPE.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -152,8 +152,8 @@ HF_MODEL_MAP = (
         "model_repo": "deepseek-ai/deepseek-coder-6.7b-base",
         "model_arch": MODEL_ARCH.LLAMA,
         "model_parts": 2,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.BPE,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.BPE.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -161,8 +161,8 @@ HF_MODEL_MAP = (
         "model_repo": "mosaicml/mpt-7b",
         "model_arch": MODEL_ARCH.MPT,
         "model_parts": 2,
-        "model_type": ModelFileType.BIN,
-        "vocab_type": VocabType.BPE,
+        "model_type": ModelFileExtension.BIN.value,
+        "vocab_type": VocabType.BPE.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -173,8 +173,8 @@ HF_MODEL_MAP = (
         "model_repo": "bigcode/starcoder2-3b",
         "model_arch": MODEL_ARCH.STARCODER2,
         "model_parts": 1,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.BPE,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.BPE.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -182,8 +182,8 @@ HF_MODEL_MAP = (
         "model_repo": "smallcloudai/Refact-1_6-base",
         "model_arch": MODEL_ARCH.REFACT,
         "model_parts": 1,
-        "model_type": ModelFileType.BIN,
-        "vocab_type": VocabType.BPE,
+        "model_type": ModelFileExtension.BIN.value,
+        "vocab_type": VocabType.BPE.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -191,8 +191,8 @@ HF_MODEL_MAP = (
         "model_repo": "CohereForAI/c4ai-command-r-v01",
         "model_arch": MODEL_ARCH.COMMAND_R,
         "model_parts": 15,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.BPE,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.BPE.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -203,8 +203,8 @@ HF_MODEL_MAP = (
         "model_repo": "Qwen/Qwen1.5-7B",
         "model_arch": MODEL_ARCH.QWEN2,
         "model_parts": 4,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.BPE,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.BPE.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -212,8 +212,8 @@ HF_MODEL_MAP = (
         "model_repo": "stabilityai/stablelm-2-zephyr-1_6b",
         "model_arch": MODEL_ARCH.STABLELM,
         "model_parts": 1,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.BPE,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.BPE.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -224,8 +224,8 @@ HF_MODEL_MAP = (
         "model_repo": "openai-community/gpt2",
         "model_arch": MODEL_ARCH.GPT2,
         "model_parts": 1,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.BPE,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.BPE.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -233,8 +233,8 @@ HF_MODEL_MAP = (
         "model_repo": "allenai/OLMo-1.7-7B-hf",
         "model_arch": MODEL_ARCH.OLMO,
         "model_parts": 6,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.BPE,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.BPE.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -242,8 +242,8 @@ HF_MODEL_MAP = (
     # "model_repo": "databricks/dbrx-base",
     # "model_arch": MODEL_ARCH.DBRX,
     # "model_parts": 0,
-    # "model_type": ModelFileType.SAFETENSORS,
-    # "vocab_type": VocabType.BPE,
+    # "model_type": ModelFileExtension.SAFETENSORS.value,
+    # "vocab_type": VocabType.BPE.value,
     # "vocab_pre": None,
     # "vocab_files": HF_TOKENIZER_BPE_FILES,
     # },
@@ -251,8 +251,8 @@ HF_MODEL_MAP = (
         "model_repo": "jinaai/jina-embeddings-v2-base-es",
         "model_arch": MODEL_ARCH.JINA_BERT_V2,
         "model_parts": 1,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.BPE,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.BPE.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -260,8 +260,8 @@ HF_MODEL_MAP = (
         "model_repo": "jinaai/jina-embeddings-v2-base-de",
         "model_arch": MODEL_ARCH.JINA_BERT_V2,
         "model_parts": 1,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.BPE,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.BPE.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -269,8 +269,8 @@ HF_MODEL_MAP = (
         "model_repo": "microsoft/phi-1",
         "model_arch": MODEL_ARCH.PHI2,
         "model_parts": 1,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.BPE,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.BPE.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -278,8 +278,8 @@ HF_MODEL_MAP = (
         "model_repo": "microsoft/phi-1_5",
         "model_arch": MODEL_ARCH.PHI2,
         "model_parts": 1,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.BPE,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.BPE.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -287,8 +287,8 @@ HF_MODEL_MAP = (
         "model_repo": "microsoft/phi-2",
         "model_arch": MODEL_ARCH.PHI2,
         "model_parts": 2,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.BPE,
+        "model_type": ModelFileExtension.SAFETENSORS.value,
+        "vocab_type": VocabType.BPE.value,
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
@@ -334,10 +334,7 @@ for model in HF_MODEL_MAP:
     model_arch = model["model_arch"]
     vocab_type = model["vocab_type"]
 
-    print(
-        "HUB_REPO:", model_repo,
-        "LLAMA_ARCH:", MODEL_ARCH_NAMES[model_arch]
-    )
+    print("HUB_REPO:", model_repo, "LLAMA_ARCH:", MODEL_ARCH_NAMES[model_arch])
 
     hub_model.download_all_vocab_files(
         model_repo=model_repo,
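The final hunk below reworks how the normalizer metadata is captured. For context: in a Hugging Face tokenizer.json, the normalizer config sits under the "normalizer" key and is either null or an object with a "type" field (for example "Sequence", "NFC", "NFD"), which matches the NOTE added in the diff. A hedged sketch of reading that field directly, independent of the HFHubTokenizer helper the script actually uses:

    import json
    from pathlib import Path

    def read_normalizer(tokenizer_json: Path) -> dict | None:
        # tokenizer.json stores the normalizer config under the "normalizer"
        # key; it is null when absent, or an object such as
        # {"type": "Sequence", ...} or {"type": "NFC"}.
        data = json.loads(tokenizer_json.read_text(encoding="utf-8"))
        return data.get("normalizer")

    # Hypothetical local path for illustration; the script downloads these
    # files via hub_model.download_all_vocab_files().
    # read_normalizer(Path("models/meta-llama/Llama-2-7b-hf/tokenizer.json"))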
@@ -346,24 +343,28 @@ for model in HF_MODEL_MAP:
     # log the downloaded results
     hub_tokenizer.log_tokenizer_json_info(model_repo)
 
-    model['model_arch'] = MODEL_ARCH_NAMES[model_arch]
-    model['vocab_type'] = hub_tokenizer.get_vocab_name(vocab_type)
+    model["model_arch"] = MODEL_ARCH_NAMES[model_arch]
 
     normalizer = hub_tokenizer.get_normalizer(model_repo)
-    # extract the normalizer metadata
-    model['normalizer'] = normalizer
+    # NOTE: Normalizer may be one of null, Sequence, NFC, NFD, NFKC, NFKD...
+    # Seems to be null, Sequence, or NFC in most cases
+    # Default to NFD
+    # TODO: Extract the normalizer metadata
+    model["normalizer"] = normalizer
 
+    # Seems safe to assume most basic types are of type "Sequence"
+    # I expect this to cause issues in the future. Needs more research.
     pre_tokenizer = hub_tokenizer.get_pre_tokenizer(model_repo)
-    # extract the pre-tokenizer metadata
-    model['pre_tokenizer'] = pre_tokenizer
+    # extract the added tokens metadata
+    model["pre_tokenizer"] = pre_tokenizer
 
     added_tokens = hub_tokenizer.get_added_tokens(model_repo)
     # extract the added tokens metadata
-    model['added_tokens'] = added_tokens
+    model["added_tokens"] = added_tokens
 
     sha256sum = hub_tokenizer.get_tokenizer_json_hash(model_repo)
     # use the hash to validate the models vocabulary
-    model['vocab_hash'] = sha256sum
+    model["vocab_hash"] = sha256sum
 
     metadata.append(model)
 
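Each enriched entry ends up appended to metadata, and the commit title says this script generates the registry, so presumably the script finishes by serializing that list to disk. A minimal sketch of such a final step, assuming a hypothetical output path (the actual file name is not shown in this diff); the .value refactor above is what keeps every field JSON-safe:

    import json
    from pathlib import Path

    metadata: list[dict] = []  # built up by the loop above in the real script

    # Hypothetical output location; the actual script may write elsewhere.
    registry_path = Path("registry.json")
    with registry_path.open("w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)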