chore: Add pre tokenizers and include enum mappings
This commit is contained in:
parent
215394947e
commit
0a478c048a
1 changed files with 39 additions and 0 deletions
|
@ -988,6 +988,12 @@ VOCAB_TYPE_NAMES: dict[VocabType, str] = {
|
|||
VocabType.WPM: "WPM",
|
||||
}
|
||||
|
||||
# Lookup from a vocab type's string name to its VocabType member
# (the opposite direction of VOCAB_TYPE_NAMES).
VOCAB_TYPE_MAP: dict[str, VocabType] = {
    "SPM": VocabType.SPM,
    "BPE": VocabType.BPE,
    "WPM": VocabType.WPM,
}
|
||||
|
||||
|
||||
#
|
||||
# Model File Types
|
||||
|
@ -1013,6 +1019,39 @@ MODEL_FILE_TYPE_NAMES: dict[ModelFileType, str] = {
|
|||
ModelFileType.GGUF: ".gguf",
|
||||
}
|
||||
|
||||
# Lookup from a file extension (including the leading dot) to its
# ModelFileType member. Keys are strings and values are enum members, so the
# annotation is dict[str, ModelFileType] (the opposite direction of
# MODEL_FILE_TYPE_NAMES).
MODEL_FILE_TYPE_MAP: dict[str, ModelFileType] = {
    ".pt": ModelFileType.PT,
    ".pth": ModelFileType.PTH,
    ".bin": ModelFileType.BIN,
    ".safetensors": ModelFileType.SAFETENSORS,
    ".json": ModelFileType.JSON,
    ".model": ModelFileType.MODEL,
    ".gguf": ModelFileType.GGUF,
}
|
||||
|
||||
|
||||
class PreTokenizerType(IntEnum):
    """Enumerates the pre-tokenizer kinds known to this module.

    Values are assigned by ``auto()`` in declaration order, starting at 1,
    so reordering the members changes their integer values.
    """

    # NOTE(review): presumably "no pre-tokenizer"; it has no string name in
    # PRE_TOKENIZER_TYPE_NAMES -- confirm the intended meaning.
    NON = auto()
    BYTE_LEVEL = auto()
    BERT_PRE_TOKENIZER = auto()
    METASPACE = auto()
    SEQUENCE = auto()
|
||||
|
||||
|
||||
# Lookup from a PreTokenizerType member to its string name.
# NOTE(review): PreTokenizerType.NON has no entry here, so indexing this dict
# with it raises KeyError -- confirm whether that omission is intentional.
PRE_TOKENIZER_TYPE_NAMES: dict[PreTokenizerType, str] = {
    PreTokenizerType.BYTE_LEVEL: "ByteLevel",
    PreTokenizerType.BERT_PRE_TOKENIZER: "BertPreTokenizer",
    PreTokenizerType.METASPACE: "Metaspace",
    PreTokenizerType.SEQUENCE: "Sequence",
}
|
||||
|
||||
# Lookup from a pre-tokenizer's string name to its PreTokenizerType member.
# Keys are strings and values are enum members, so the annotation is
# dict[str, PreTokenizerType] (the opposite direction of
# PRE_TOKENIZER_TYPE_NAMES).
PRE_TOKENIZER_TYPE_MAP: dict[str, PreTokenizerType] = {
    "ByteLevel": PreTokenizerType.BYTE_LEVEL,
    "BertPreTokenizer": PreTokenizerType.BERT_PRE_TOKENIZER,
    "Metaspace": PreTokenizerType.METASPACE,
    "Sequence": PreTokenizerType.SEQUENCE,
}
|
||||
|
||||
#
|
||||
# HF Vocab Files
|
||||
#
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue