diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 83a0bde60..2eecacb69 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -988,6 +988,13 @@ VOCAB_TYPE_NAMES: dict[VocabType, str] = {
     VocabType.WPM: "WPM",
 }
 
+# Maps a vocab type name string to its VocabType member.
+VOCAB_TYPE_MAP: dict[str, VocabType] = {
+    "SPM": VocabType.SPM,
+    "BPE": VocabType.BPE,
+    "WPM": VocabType.WPM,
+}
+
 
 #
 # Model File Types
@@ -1013,6 +1020,44 @@ MODEL_FILE_TYPE_NAMES: dict[ModelFileType, str] = {
     ModelFileType.GGUF: ".gguf",
 }
 
+# Maps a file extension (including the leading dot) to its ModelFileType.
+MODEL_FILE_TYPE_MAP: dict[str, ModelFileType] = {
+    ".pt": ModelFileType.PT,
+    ".pth": ModelFileType.PTH,
+    ".bin": ModelFileType.BIN,
+    ".safetensors": ModelFileType.SAFETENSORS,
+    ".json": ModelFileType.JSON,
+    ".model": ModelFileType.MODEL,
+    ".gguf": ModelFileType.GGUF,
+}
+
+
+# Kinds of pre-tokenizer; integer values are assigned by auto().
+class PreTokenizerType(IntEnum):
+    NON = auto()
+    BYTE_LEVEL = auto()
+    BERT_PRE_TOKENIZER = auto()
+    METASPACE = auto()
+    SEQUENCE = auto()
+
+
+# Maps a PreTokenizerType member to its name string.
+# NOTE(review): PreTokenizerType.NON has no entry here -- confirm this is intended.
+PRE_TOKENIZER_TYPE_NAMES: dict[PreTokenizerType, str] = {
+    PreTokenizerType.BYTE_LEVEL: "ByteLevel",
+    PreTokenizerType.BERT_PRE_TOKENIZER: "BertPreTokenizer",
+    PreTokenizerType.METASPACE: "Metaspace",
+    PreTokenizerType.SEQUENCE: "Sequence",
+}
+
+# Maps a pre-tokenizer name string to its PreTokenizerType member.
+PRE_TOKENIZER_TYPE_MAP: dict[str, PreTokenizerType] = {
+    "ByteLevel": PreTokenizerType.BYTE_LEVEL,
+    "BertPreTokenizer": PreTokenizerType.BERT_PRE_TOKENIZER,
+    "Metaspace": PreTokenizerType.METASPACE,
+    "Sequence": PreTokenizerType.SEQUENCE,
+}
+
 #
 # HF Vocab Files
 #