chore: Map model file and vocab types
parent 0b43e14030
commit 12285b5325
1 changed file with 19 additions and 4 deletions
@@ -945,16 +945,32 @@ class LLaMaVocabType(IntEnum):
     WPM = auto() # WordPiece BERT tokenizer


+LLaMaVocabTypeNames: dict[LLaMaVocabType, str] = {
+    LLaMaVocabType.SPM: "SPM",
+    LLaMaVocabType.BPE: "BPE",
+    LLaMaVocabType.WPM: "WPM",
+}
+
+
 #
 # HuggingFace Model File Types
 #
 class HFModelFileType(IntEnum):
     UNK = auto() # Unsupported file type
-    BIN = auto() # PyTorch file type
+    PT = auto() # PyTorch file type
     PTH = auto() # PyTorch file type
+    BIN = auto() # Pickled file type
     SFT = auto() # SafeTensor file type


+HFModelFileTypeNames: dict[HFModelFileType, str] = {
+    HFModelFileType.PT: "pt",
+    HFModelFileType.PTH: "pth",
+    HFModelFileType.BIN: "bin",
+    HFModelFileType.SFT: "safetensors",
+}
+
+
 # NOTE: It's easier to map out which files we need in advance.
 HF_TOKENIZER_BPE_FILES = ("config.json", "tokenizer_config.json", "tokenizer.json",)
 HF_TOKENIZER_SPM_FILES = (HF_TOKENIZER_BPE_FILES + ("tokenizer.model",))
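For reference, a minimal sketch (not part of this commit) of how the new mappings might be consumed. The import path and both helper names are assumptions; only the enums, name tables, and file tuples come from the diff above.

from pathlib import Path

# Assumed import path; the diff does not name the module.
from gguf.constants import (
    LLaMaVocabType, HFModelFileType, HFModelFileTypeNames,
    HF_TOKENIZER_BPE_FILES, HF_TOKENIZER_SPM_FILES,
)

def required_tokenizer_files(vocab_type: LLaMaVocabType) -> tuple[str, ...]:
    # SPM vocabularies also need tokenizer.model; everything else uses the BPE file set.
    if vocab_type == LLaMaVocabType.SPM:
        return HF_TOKENIZER_SPM_FILES
    return HF_TOKENIZER_BPE_FILES

def classify_model_file(path: Path) -> HFModelFileType:
    # Map a file suffix back onto the enum via the name table.
    suffix = path.suffix.lstrip(".")
    for file_type, name in HFModelFileTypeNames.items():
        if name == suffix:
            return file_type
    return HFModelFileType.UNK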
@@ -967,7 +983,6 @@ HF_TOKENIZER_SPM_FILES = (HF_TOKENIZER_BPE_FILES + ("tokenizer.model",))
 # The pattern uses perl, is grammatical, and splits are technically arbitrary.
 # https://github.com/openai/gpt-2/blob/master/src/encoder.py#L53
 # https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/pre_tokenizers/byte_level.rs#L40-L42
-UNI_PRE_TOKENIZER_DEFAULT = "\\p{N}"
 BPE_PRE_TOKENIZER_DEFAULT = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)"
 GPT_PRE_TOKENIZER_DEFAULT = f"{BPE_PRE_TOKENIZER_DEFAULT}|\\s+"

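As a usage note (not part of the commit): the \p{L} and \p{N} classes in these patterns are not supported by Python's standard re module, so applying them requires the third-party regex package. A minimal sketch, reusing the two constants above:

import regex  # third-party package; re does not support \p{L} / \p{N}

BPE_PRE_TOKENIZER_DEFAULT = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)"
GPT_PRE_TOKENIZER_DEFAULT = f"{BPE_PRE_TOKENIZER_DEFAULT}|\\s+"

def pre_tokenize(text: str) -> list[str]:
    # Split raw text into pre-token chunks before byte-pair merging.
    return regex.findall(GPT_PRE_TOKENIZER_DEFAULT, text)

print(pre_tokenize("Hello world, it's 2024!"))
# ['Hello', ' world', ',', ' it', "'s", ' 2024', '!']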
@@ -1014,8 +1029,8 @@ BPE_PRE_TOKENIZERS = {
         "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
     ),
     MODEL_ARCH_NAMES[MODEL_ARCH.QWEN2MOE]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.PHI2]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.PHI3]: (),
+    MODEL_ARCH_NAMES[MODEL_ARCH.PHI2]: (GPT_PRE_TOKENIZER_DEFAULT,),
+    MODEL_ARCH_NAMES[MODEL_ARCH.PHI3]: (GPT_PRE_TOKENIZER_DEFAULT,),
     MODEL_ARCH_NAMES[MODEL_ARCH.PLAMO]: (),
     MODEL_ARCH_NAMES[MODEL_ARCH.CODESHELL]: (),
     MODEL_ARCH_NAMES[MODEL_ARCH.ORION]: (),
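A possible lookup helper, sketched under the assumption that an empty tuple marks an architecture that has not been mapped yet; the function name and import path are illustrative, not part of this commit:

from gguf.constants import BPE_PRE_TOKENIZERS, GPT_PRE_TOKENIZER_DEFAULT  # assumed module path

def resolve_pre_tokenizer(arch_name: str) -> tuple[str, ...]:
    # Fall back to the GPT default when an architecture has no explicit pattern,
    # mirroring what this commit does explicitly for PHI2 and PHI3.
    patterns = BPE_PRE_TOKENIZERS.get(arch_name, ())
    return patterns if patterns else (GPT_PRE_TOKENIZER_DEFAULT,)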