refactor: Add experimental model mappings
parent b2aac685d5
commit 34e14ae96d
1 changed file with 72 additions and 22 deletions
@@ -955,23 +955,34 @@ class HFModelFileType(IntEnum):
    SFT = auto()  # SafeTensor file type

+# NOTE: It's easier to map out which files we need in advance.
+HF_TOKENIZER_BPE_FILES = ("config.json", "tokenizer_config.json", "tokenizer.json",)
+HF_TOKENIZER_SPM_FILES = (HF_TOKENIZER_BPE_FILES + ("tokenizer.model",))
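These tuples enumerate the tokenizer artifacts to fetch per vocabulary type. A minimal sketch of how they might drive a download step, assuming huggingface_hub is installed (the helper fetch_tokenizer_files is hypothetical, not part of this commit):

from huggingface_hub import hf_hub_download

def fetch_tokenizer_files(repo_id: str, filenames: tuple) -> list:
    # Resolve each required tokenizer file to a local cached path.
    # hf_hub_download is the real huggingface_hub API; this wrapper is illustrative.
    return [hf_hub_download(repo_id=repo_id, filename=name) for name in filenames]

# Usage: fetch_tokenizer_files("tiiuae/falcon-7b", HF_TOKENIZER_BPE_FILES)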
# NOTE: Tokenizers defaults to the OpenAI GPT-2 byte-level pre-tokenizer regular expression.
# The pattern is Perl-style; the splits it produces are technically arbitrary.
# https://github.com/openai/gpt-2/blob/master/src/encoder.py#L53
# https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/pre_tokenizers/byte_level.rs#L40-L42
-HF_TOKENIZER_DEFAULT_PRE = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+"

-# NOTE: It's easier to map out which files we need in advance.
-HF_TOKENIZER_DEFAULT_BPE = ("config.json", "tokenizer_config.json", "tokenizer.json",)
-HF_TOKENIZER_DEFAULT_SPM = (HF_TOKENIZER_DEFAULT_BPE + ("tokenizer.model",))
+BPE_PRE_TOKENIZER_DEFAULT = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)"
+GPT_PRE_TOKENIZER_DEFAULT = f"{BPE_PRE_TOKENIZER_DEFAULT}|\\s+"
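As a sanity check, the default pattern can be exercised with the third-party regex package (the standard-library re module does not support \p{...} classes); a minimal sketch, not part of this commit:

import regex  # third-party; supports \p{L} and \p{N} character classes

pattern = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+"

# Each match is one pre-token; note how leading spaces stick to the following word.
print(regex.findall(pattern, "Hello world, it's 2024!"))
# ['Hello', ' world', ',', ' it', "'s", ' 2024', '!']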
#
# HuggingFace Model Map
#
# NOTE:
# - Repository paths are required
# - Allow the user to specify the tokenizer model type themselves
# - Use architecture types because they are explicitly defined
# NOTE: All prerequisite model metadata must be defined here.
#
# Defines metadata for each Hugging Face model required during conversion to GGUF
#
# Field Descriptions
# - `model_repo` (str): The HuggingFace endpoint or local path to the model's repository
# - `model_arch` (MODEL_ARCH): Model architecture type
# - `model_parts` (int): Number of parts required to join the model during conversion
# - `model_type` (HFModelFileType): File format for the Hugging Face model files
# - `vocab_type` (LLaMaVocabType): Vocabulary type used by the tokenizer
# - `vocab_pre` (Optional[Tuple[str, ...]]): Tuple of pre-tokenizer pattern strings for this model
# - `vocab_files` (Tuple[str, ...]): Tuple of file names required to extract vocabulary and other metadata
#
# NOTES
# - Possible algorithms are WordLevel, BPE, WordPiece, or Unigram
# - Possible LLaMa tokenizer model types are: None, SPM, BPE, or WPM
HF_MODEL_MAP = (
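A hypothetical consumer of this table (illustrative only, not part of this commit) selects an entry by repository and reads the fields described above:

def get_model_meta(model_repo: str) -> dict:
    # Linear scan is fine here: the map is a small, static tuple of dicts.
    for meta in HF_MODEL_MAP:
        if meta["model_repo"] == model_repo:
            return meta
    raise KeyError(f"no metadata registered for {model_repo!r}")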
@@ -982,7 +993,16 @@ HF_MODEL_MAP = (
        "model_type": HFModelFileType.SFT,
        "vocab_type": LLaMaVocabType.SPM,
        "vocab_pre": (),
-        "vocab_files": HF_TOKENIZER_DEFAULT_SPM,
+        "vocab_files": HF_TOKENIZER_SPM_FILES,
    },
+    {
+        "model_repo": "tiiuae/falcon-7b",
+        "model_arch": MODEL_ARCH.FALCON,
+        "model_parts": 2,
+        "model_type": HFModelFileType.BIN,
+        "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS[MODEL_ARCH_NAMES[MODEL_ARCH.FALCON]],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
+    },
    {
        "model_repo": "meta-llama/Meta-Llama-3-8B",
@@ -990,10 +1010,9 @@ HF_MODEL_MAP = (
        "model_parts": 4,
        "model_type": HFModelFileType.SFT,
        "vocab_type": LLaMaVocabType.BPE,
-        "vocab_pre": (
-            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-        ),
-        "vocab_files": HF_TOKENIZER_DEFAULT_BPE,
+        # PR#6920: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
+        "vocab_pre": BPE_PRE_TOKENIZERS[MODEL_ARCH_NAMES[MODEL_ARCH.LLAMA]],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
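BPE_PRE_TOKENIZERS and MODEL_ARCH_NAMES are referenced above but not shown in this diff. A plausible shape for the registry, assuming it maps an architecture name to its pattern tuple (the Llama-3 pattern is the one removed above, per PR#6920):

BPE_PRE_TOKENIZERS = {
    # Assumed layout; the real definition lives elsewhere in this file.
    MODEL_ARCH_NAMES[MODEL_ARCH.LLAMA]: (
        "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
    ),
    # ... one entry per supported architecture
}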
    {
        "model_repo": "microsoft/Phi-3-mini-4k-instruct",

@@ -1002,7 +1021,7 @@ HF_MODEL_MAP = (
        "model_type": HFModelFileType.SFT,
        "vocab_type": LLaMaVocabType.SPM,
        "vocab_pre": (),
-        "vocab_files": HF_TOKENIZER_DEFAULT_SPM,
+        "vocab_files": HF_TOKENIZER_SPM_FILES,
    },
    {
        "model_repo": "deepseek-ai/deepseek-llm-7b-base",
@@ -1010,33 +1029,64 @@ HF_MODEL_MAP = (
        "model_parts": 2,
        "model_type": HFModelFileType.BIN,
        "vocab_type": LLaMaVocabType.BPE,
-        "vocab_pre": (),
-        "vocab_file": HF_TOKENIZER_DEFAULT_BPE,
+        "vocab_pre": (
+            "[\r\n]",
+            "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿﬀ-ﬆﬓ-ﬗＡ-Ｚａ-ｚ𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+            "\\s?[!-/:-~！-／：-～‘-‟　-。]+",
+            "\\s+$",
+            "[一-龥ࠀ-一가-퟿]+",
+            "\\p{N}+",
+        ),
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
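When vocab_pre holds several patterns, one plausible consumption (an assumption; this diff does not show the consumer) is a tokenizers Sequence of Split pre-tokenizers applied in order:

from tokenizers import Regex, pre_tokenizers

def build_pre_tokenizer(patterns: tuple) -> pre_tokenizers.Sequence:
    # Regex, Split, and Sequence are real `tokenizers` APIs; "isolated"
    # keeps each matched span as its own pre-token.
    return pre_tokenizers.Sequence(
        [pre_tokenizers.Split(Regex(p), behavior="isolated") for p in patterns]
    )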
    {
        "model_repo": "deepseek-ai/deepseek-coder-6.7b-base",
        "model_arch": MODEL_ARCH.LLAMA,
        "model_parts": 2,
        "model_type": HFModelFileType.SFT,
        "vocab_type": LLaMaVocabType.BPE,
    },
    {
        "model_repo": "tiiuae/falcon-7b",
        "model_arch": MODEL_ARCH.FALCON,
        "vocab_type": LLaMaVocabType.BPE,
        "vocab_pre": (
            "[\r\n]",
            "\\s?\\p{L}+",
            "\\s?\\p{P}+",
            "[一-龥ࠀ-一가-퟿]+",
            "\\p{N}",
        ),
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    {
        "model_repo": "BAAI/bge-small-en-v1.5",
        "model_arch": MODEL_ARCH.BERT,
        "model_parts": 1,
        "model_type": HFModelFileType.BIN,
        "vocab_type": LLaMaVocabType.WPM,
        "vocab_pre": (),
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    {
        "model_repo": "mosaicml/mpt-7b",
        "model_arch": MODEL_ARCH.MPT,
        "model_parts": 2,
        "model_type": HFModelFileType.BIN,
        "vocab_type": LLaMaVocabType.BPE,
        "vocab_pre": (
            "\\s?\\p{L}+",
            "\\s?\\p{P}+",
            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
        ),
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    {
        "model_repo": "bigcode/starcoder2-3b",
        "model_arch": MODEL_ARCH.STARCODER2,
        "model_parts": 1,
        "model_type": HFModelFileType.SFT,
        "vocab_type": LLaMaVocabType.BPE,
        "vocab_pre": (
            "\\p{N}",
            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
        ),
        "vocab_files": HF_TOKENIZER_BPE_FILES,
    },
    {
        "model_repo": "openai-community/gpt2",