From 9ba6b92c2d09ece1856fcdd84666cfe8cbf50c16 Mon Sep 17 00:00:00 2001
From: teleprint-me <77757836+teleprint-me@users.noreply.github.com>
Date: Thu, 23 May 2024 16:57:14 -0400
Subject: [PATCH] chore: Add required vocabulary constants

---
 gguf-py/gguf/constants.py | 54 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 67e23dcc1..34bfce4f3 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -940,6 +940,60 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
 }
 
 
+#
+# Tokenizer Types
+#
+class VOCAB_TYPE(IntEnum):
+    NON = auto()  # For models without a vocab
+    SPM = auto()  # SentencePiece LLaMA tokenizer
+    BPE = auto()  # BytePair GPT-2 tokenizer
+    WPM = auto()  # WordPiece BERT tokenizer
+
+
+VOCAB_TYPE_NAMES: dict[VOCAB_TYPE, str] = {
+    VOCAB_TYPE.SPM: "SPM",
+    VOCAB_TYPE.BPE: "BPE",
+    VOCAB_TYPE.WPM: "WPM",
+}
+
+
+#
+# Model File Types
+#
+class MODEL_FILE_TYPE(IntEnum):
+    UNK = auto()  # Unsupported file type
+    SFT = auto()  # SafeTensors file type
+    PTH = auto()  # PyTorch file type
+    BIN = auto()  # Pickled file type
+    PT = auto()   # PyTorch file type
+
+
+MODEL_FILE_TYPE_NAMES: dict[MODEL_FILE_TYPE, str] = {
+    MODEL_FILE_TYPE.PT: "pt",
+    MODEL_FILE_TYPE.PTH: "pth",
+    MODEL_FILE_TYPE.BIN: "bin",
+    MODEL_FILE_TYPE.SFT: "safetensors",
+}
+
+#
+# HF Vocab Files
+#
+HF_TOKENIZER_BPE_FILES = ("config.json", "tokenizer_config.json", "tokenizer.json",)
+HF_TOKENIZER_SPM_FILES = HF_TOKENIZER_BPE_FILES + ("tokenizer.model",)
+
+#
+# Pre-tokenization Regular Expressions
+#
+
+# NOTE: `tokenizers` defaults to the OpenAI GPT-2 `ByteLevel` regex.
+# The pattern is a Perl-compatible regex; its formatting is arbitrary.
+# https://github.com/openai/gpt-2/blob/master/src/encoder.py#L53
+# https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/pre_tokenizers/byte_level.rs#L40-L42
+
+# Fallback values used when the pre-tokenizer cannot be discovered dynamically at runtime.
+BPE_PRE_TOKENIZER_DEFAULT = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)"
+GPT_PRE_TOKENIZER_DEFAULT = f"{BPE_PRE_TOKENIZER_DEFAULT}|\\s+"
+
 # Aliases for backward compatibility.
 
 # general
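--

Usage sketch (not part of the patch): the snippet below shows one way the new
constants might be consumed by a converter, e.g. mapping a checkpoint's file
extension to a MODEL_FILE_TYPE and selecting the HF tokenizer files a given
vocab type needs. The helpers `detect_file_type` and `required_tokenizer_files`
are hypothetical names introduced here only for illustration; it assumes this
patch has been applied to gguf-py/gguf/constants.py.

    from pathlib import Path

    from gguf.constants import (
        HF_TOKENIZER_BPE_FILES,
        HF_TOKENIZER_SPM_FILES,
        MODEL_FILE_TYPE,
        MODEL_FILE_TYPE_NAMES,
        VOCAB_TYPE,
    )

    # Invert the name map once so file extensions can be looked up directly.
    _EXT_TO_FILE_TYPE = {ext: ftype for ftype, ext in MODEL_FILE_TYPE_NAMES.items()}

    def detect_file_type(path: Path) -> MODEL_FILE_TYPE:
        # Hypothetical helper: map an extension to a MODEL_FILE_TYPE, defaulting to UNK.
        return _EXT_TO_FILE_TYPE.get(path.suffix.lstrip("."), MODEL_FILE_TYPE.UNK)

    def required_tokenizer_files(vocab_type: VOCAB_TYPE) -> tuple[str, ...]:
        # Hypothetical helper: list the HF files expected for a given vocab type.
        if vocab_type == VOCAB_TYPE.SPM:
            return HF_TOKENIZER_SPM_FILES
        if vocab_type in (VOCAB_TYPE.BPE, VOCAB_TYPE.WPM):
            return HF_TOKENIZER_BPE_FILES
        return ()  # VOCAB_TYPE.NON: no tokenizer files required

    print(detect_file_type(Path("model.safetensors")).name)  # SFT
    print(required_tokenizer_files(VOCAB_TYPE.SPM))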
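A second illustrative sketch, under the same assumption: compiling the fallback
pre-tokenizer pattern. Note that the \p{L}/\p{N} Unicode property classes are
not supported by Python's stdlib `re`; the third-party `regex` package is
needed to match the behavior of the GPT-2 `ByteLevel` pattern.

    import regex  # pip install regex; stdlib `re` lacks \p{...} support

    from gguf.constants import GPT_PRE_TOKENIZER_DEFAULT

    # Split text the way the GPT-2 byte-level pre-tokenizer would.
    pattern = regex.compile(GPT_PRE_TOKENIZER_DEFAULT)
    print(pattern.findall("Hello world, it's 2024!"))
    # ['Hello', ' world', ',', ' it', "'s", ' 2024', '!']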