From 9ba6b92c2d09ece1856fcdd84666cfe8cbf50c16 Mon Sep 17 00:00:00 2001
From: teleprint-me <77757836+teleprint-me@users.noreply.github.com>
Date: Thu, 23 May 2024 16:57:14 -0400
Subject: [PATCH] chore: Add required vocabulary constants

---
 gguf-py/gguf/constants.py | 54 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 67e23dcc1..34bfce4f3 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -940,6 +940,60 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
 }
 
 
+#
+# Tokenizer Types
+#
+class VOCAB_TYPE(IntEnum):
+    NON = auto()  # For models without a vocab
+    SPM = auto()  # SentencePiece LLaMA tokenizer
+    BPE = auto()  # BytePair GPT-2 tokenizer
+    WPM = auto()  # WordPiece BERT tokenizer
+
+
+VOCAB_TYPE_NAMES: dict[VOCAB_TYPE, str] = {
+    VOCAB_TYPE.SPM: "SPM",
+    VOCAB_TYPE.BPE: "BPE",
+    VOCAB_TYPE.WPM: "WPM",
+}
+
+
+#
+# Model File Types
+#
+class MODEL_FILE_TYPE(IntEnum):
+    UNK = auto()  # Unsupported file type
+    SFT = auto()  # SafeTensors file type
+    PTH = auto()  # PyTorch file type
+    BIN = auto()  # Pickled file type
+    PT = auto()   # PyTorch file type
+
+
+MODEL_FILE_TYPE_NAMES: dict[MODEL_FILE_TYPE, str] = {
+    MODEL_FILE_TYPE.PT: "pt",
+    MODEL_FILE_TYPE.PTH: "pth",
+    MODEL_FILE_TYPE.BIN: "bin",
+    MODEL_FILE_TYPE.SFT: "safetensors",
+}
+
+#
+# HF Vocab Files
+#
+HF_TOKENIZER_BPE_FILES = ("config.json", "tokenizer_config.json", "tokenizer.json",)
+HF_TOKENIZER_SPM_FILES = HF_TOKENIZER_BPE_FILES + ("tokenizer.model",)
+
+#
+# Pre-tokenization Regular Expressions
+#
+
+# NOTE: `tokenizers` defaults to the OpenAI GPT-2 `ByteLevel` regex.
+# The pattern is a Perl-compatible regex; its formatting is arbitrary.
+# https://github.com/openai/gpt-2/blob/master/src/encoder.py#L53
+# https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/pre_tokenizers/byte_level.rs#L40-L42
+
+# Fallback values used when the pre-tokenizer cannot be discovered dynamically at runtime.
+BPE_PRE_TOKENIZER_DEFAULT = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)"
+GPT_PRE_TOKENIZER_DEFAULT = f"{BPE_PRE_TOKENIZER_DEFAULT}|\\s+"
+
 # Aliases for backward compatibility.
 
 # general
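--

Usage sketch (not part of the patch): the snippet below shows one way the new
constants might be consumed by a converter, e.g. mapping a checkpoint's file
extension to a MODEL_FILE_TYPE and selecting the HF tokenizer files a given
vocab type needs. The helpers `detect_file_type` and `required_tokenizer_files`
are hypothetical names introduced here only for illustration; it assumes this
patch has been applied to gguf-py/gguf/constants.py.

    from pathlib import Path

    from gguf.constants import (
        HF_TOKENIZER_BPE_FILES,
        HF_TOKENIZER_SPM_FILES,
        MODEL_FILE_TYPE,
        MODEL_FILE_TYPE_NAMES,
        VOCAB_TYPE,
    )

    # Invert the name map once so file extensions can be looked up directly.
    _EXT_TO_FILE_TYPE = {ext: ftype for ftype, ext in MODEL_FILE_TYPE_NAMES.items()}

    def detect_file_type(path: Path) -> MODEL_FILE_TYPE:
        # Hypothetical helper: map an extension to a MODEL_FILE_TYPE, defaulting to UNK.
        return _EXT_TO_FILE_TYPE.get(path.suffix.lstrip("."), MODEL_FILE_TYPE.UNK)

    def required_tokenizer_files(vocab_type: VOCAB_TYPE) -> tuple[str, ...]:
        # Hypothetical helper: list the HF files expected for a given vocab type.
        if vocab_type == VOCAB_TYPE.SPM:
            return HF_TOKENIZER_SPM_FILES
        if vocab_type in (VOCAB_TYPE.BPE, VOCAB_TYPE.WPM):
            return HF_TOKENIZER_BPE_FILES
        return ()  # VOCAB_TYPE.NON: no tokenizer files required

    print(detect_file_type(Path("model.safetensors")).name)  # SFT
    print(required_tokenizer_files(VOCAB_TYPE.SPM))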
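A second illustrative sketch, under the same assumption: compiling the fallback
pre-tokenizer pattern. Note that the \p{L}/\p{N} Unicode property classes are
not supported by Python's stdlib `re`; the third-party `regex` package is
needed to match the behavior of the GPT-2 `ByteLevel` pattern.

    import regex  # pip install regex; stdlib `re` lacks \p{...} support

    from gguf.constants import GPT_PRE_TOKENIZER_DEFAULT

    # Split text the way the GPT-2 byte-level pre-tokenizer would.
    pattern = regex.compile(GPT_PRE_TOKENIZER_DEFAULT)
    print(pattern.findall("Hello world, it's 2024!"))
    # ['Hello', ' world', ',', ' it', "'s", ' 2024', '!']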