chore: Add pre tokenizers and include enum mappings

teleprint-me 2024-05-27 03:11:40 -04:00
parent 215394947e
commit 0a478c048a

@@ -988,6 +988,12 @@ VOCAB_TYPE_NAMES: dict[VocabType, str] = {
    VocabType.WPM: "WPM",
}
VOCAB_TYPE_MAP: dict[str, VocabType] = {
    "SPM": VocabType.SPM,
    "BPE": VocabType.BPE,
    "WPM": VocabType.WPM,
}

#
# Model File Types
@@ -1013,6 +1019,39 @@ MODEL_FILE_TYPE_NAMES: dict[ModelFileType, str] = {
    ModelFileType.GGUF: ".gguf",
}
MODEL_FILE_TYPE_MAP: dict[str, ModelFileType] = {
    ".pt": ModelFileType.PT,
    ".pth": ModelFileType.PTH,
    ".bin": ModelFileType.BIN,
    ".safetensors": ModelFileType.SAFETENSORS,
    ".json": ModelFileType.JSON,
    ".model": ModelFileType.MODEL,
    ".gguf": ModelFileType.GGUF,
}

class PreTokenizerType(IntEnum):
    NON = auto()
    BYTE_LEVEL = auto()
    BERT_PRE_TOKENIZER = auto()
    METASPACE = auto()
    SEQUENCE = auto()

PRE_TOKENIZER_TYPE_NAMES: dict[PreTokenizerType, str] = {
    PreTokenizerType.BYTE_LEVEL: "ByteLevel",
    PreTokenizerType.BERT_PRE_TOKENIZER: "BertPreTokenizer",
    PreTokenizerType.METASPACE: "Metaspace",
    PreTokenizerType.SEQUENCE: "Sequence",
}

PRE_TOKENIZER_TYPE_MAP: dict[str, PreTokenizerType] = {
    "ByteLevel": PreTokenizerType.BYTE_LEVEL,
    "BertPreTokenizer": PreTokenizerType.BERT_PRE_TOKENIZER,
    "Metaspace": PreTokenizerType.METASPACE,
    "Sequence": PreTokenizerType.SEQUENCE,
}

#
# HF Vocab Files
#
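
For illustration only, a minimal sketch of how the new lookup tables might be consumed, assuming the enums and maps added above are in scope (the helper names below are hypothetical and not part of this commit):

from pathlib import Path
from typing import Optional

# Assumes MODEL_FILE_TYPE_MAP, ModelFileType, PRE_TOKENIZER_TYPE_MAP and
# PreTokenizerType are imported from the module patched by this commit.

def resolve_model_file_type(path: str) -> Optional[ModelFileType]:
    # Look up the enum member by file extension,
    # e.g. ".safetensors" -> ModelFileType.SAFETENSORS; None if unknown.
    return MODEL_FILE_TYPE_MAP.get(Path(path).suffix.lower())

def resolve_pre_tokenizer_type(pre_tokenizer_config: dict) -> PreTokenizerType:
    # A tokenizers-style config stores the pre-tokenizer class name under "type",
    # e.g. {"type": "ByteLevel", ...}; fall back to NON when unrecognized.
    return PRE_TOKENIZER_TYPE_MAP.get(pre_tokenizer_config.get("type", ""), PreTokenizerType.NON)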