From 34e14ae96d3072a2ad9e1fe137f3c2ed919c7276 Mon Sep 17 00:00:00 2001
From: teleprint-me <77757836+teleprint-me@users.noreply.github.com>
Date: Tue, 21 May 2024 19:11:51 -0400
Subject: [PATCH] refactor: Add experimental model mappings

---
 gguf-py/gguf/constants.py | 94 ++++++++++++++++++++++++++++++---------
 1 file changed, 72 insertions(+), 22 deletions(-)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 48a3f31ee..067344988 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -955,23 +955,34 @@ class HFModelFileType(IntEnum):
     SFT = auto()  # SafeTensor file type
 
+# NOTE: It's easier to map out which files we need in advance.
+HF_TOKENIZER_BPE_FILES = ("config.json", "tokenizer_config.json", "tokenizer.json",)
+HF_TOKENIZER_SPM_FILES = HF_TOKENIZER_BPE_FILES + ("tokenizer.model",)
+
 
 # NOTE: Tokenizers defaults to OpenAI GPT-2 Byte Level Reg-Exp
 # The pattern uses perl, is grammatical, and splits are technically arbitrary.
 # https://github.com/openai/gpt-2/blob/master/src/encoder.py#L53
 # https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/pre_tokenizers/byte_level.rs#L40-L42
-HF_TOKENIZER_DEFAULT_PRE = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+"
-
-# NOTE: It's easier to map out which files we need in advance.
-HF_TOKENIZER_DEFAULT_BPE = ("config.json", "tokenizer_config.json", "tokenizer.json",)
-HF_TOKENIZER_DEFAULT_SPM = (HF_TOKENIZER_DEFAULT_BPE + ("tokenizer.model",))
+BPE_PRE_TOKENIZER_DEFAULT = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)"
+GPT_PRE_TOKENIZER_DEFAULT = f"{BPE_PRE_TOKENIZER_DEFAULT}|\\s+"
 
 #
 # HuggingFace Model Map
 #
-# NOTE:
-# - Repository paths are required
-# - Allow the user to specify the tokenizer model type themselves
-# - Use architecture types because they are explicitly defined
+# NOTE: All prerequisite model metadata must be defined here.
+#
+# Defines the metadata required for each Hugging Face model during conversion to GGUF.
+#
+# Field Descriptions
+# - `model_repo` (str): The HuggingFace endpoint or local path to the model's repository
+# - `model_arch` (MODEL_ARCH): The model architecture type
+# - `model_parts` (int): The number of file parts to join during conversion
+# - `model_type` (HFModelFileType): The file format of the Hugging Face model files
+# - `vocab_type` (LLaMaVocabType): The vocabulary type used by the tokenizer
+# - `vocab_pre` (Tuple[str, ...]): Tuple of pre-tokenizer pattern strings for this model
+# - `vocab_files` (Tuple[str, ...]): Tuple of file names required to extract the vocabulary and other metadata
+#
+# NOTES
 # - Possible algorithms are WordLevel, BPE, WordPiece, or Unigram
 # - Possible LLaMa tokenizer model types are: None, SPM, BPE, or WPM
 HF_MODEL_MAP = (
@@ -982,7 +993,16 @@ HF_MODEL_MAP = (
         "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.SPM,
         "vocab_pre": (),
-        "vocab_files": HF_TOKENIZER_DEFAULT_SPM,
+        "vocab_files": HF_TOKENIZER_SPM_FILES,
+    },
+    {
+        "model_repo": "tiiuae/falcon-7b",
+        "model_arch": MODEL_ARCH.FALCON,
+        "model_parts": 2,
+        "model_type": HFModelFileType.BIN,
+        "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": BPE_PRE_TOKENIZERS[MODEL_ARCH_NAMES[MODEL_ARCH.FALCON]],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
         "model_repo": "meta-llama/Meta-Llama-3-8B",
@@ -990,10 +1010,9 @@ HF_MODEL_MAP = (
         "model_parts": 4,
         "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
-        "vocab_pre": (
-            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-        ),
-        "vocab_files": HF_TOKENIZER_DEFAULT_BPE,
+        # PR#6920: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
+        "vocab_pre": BPE_PRE_TOKENIZERS[MODEL_ARCH_NAMES[MODEL_ARCH.LLAMA]],
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
         "model_repo": "microsoft/Phi-3-mini-4k-instruct",
@@ -1002,7 +1021,7 @@ HF_MODEL_MAP = (
         "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.SPM,
         "vocab_pre": (),
-        "vocab_files": HF_TOKENIZER_DEFAULT_SPM,
+        "vocab_files": HF_TOKENIZER_SPM_FILES,
     },
     {
         "model_repo": "deepseek-ai/deepseek-llm-7b-base",
@@ -1010,33 +1029,64 @@ HF_MODEL_MAP = (
         "model_parts": 2,
         "model_type": HFModelFileType.BIN,
         "vocab_type": LLaMaVocabType.BPE,
-        "vocab_pre": (),
-        "vocab_file": HF_TOKENIZER_DEFAULT_BPE,
+        "vocab_pre": (
+            "[\\r\\n]",
+            "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿǄ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿﬀ-ﬆﬓ-ﬗＡ-Ｚａ-ｚ𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+            "\\s?[!-/:-~！-／：-～‘-‟　-。]+",
+            "\\s+$",
+            "[一-龥ࠀ-一가-퟿]+",
+            "\\p{N}+",
+        ),
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
         "model_repo": "deepseek-ai/deepseek-coder-6.7b-base",
         "model_arch": MODEL_ARCH.LLAMA,
+        "model_parts": 2,
+        "model_type": HFModelFileType.SFT,
         "vocab_type": LLaMaVocabType.BPE,
-    },
-    {
-        "model_repo": "tiiuae/falcon-7b",
-        "model_arch": MODEL_ARCH.FALCON,
-        "vocab_type": LLaMaVocabType.BPE,
+        "vocab_pre": (
+            "[\\r\\n]",
+            "\\s?\\p{L}+",
+            "\\s?\\p{P}+",
+            "[一-龥ࠀ-一가-퟿]+",
+            "\\p{N}",
+        ),
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
         "model_repo": "BAAI/bge-small-en-v1.5",
         "model_arch": MODEL_ARCH.BERT,
+        "model_parts": 1,
+        "model_type": HFModelFileType.BIN,
         "vocab_type": LLaMaVocabType.WPM,
+        "vocab_pre": (),
+        "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
     {
         "model_repo": "mosaicml/mpt-7b",
"model_arch": MODEL_ARCH.MPT, + "model_parts": 2, + "model_type": HFModelFileType.BIN, "vocab_type": LLaMaVocabType.BPE, + "vocab_pre": ( + "\\s?\\p{L}+", + "\\s?\\p{P}+", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + ), + "vocab_files": HF_TOKENIZER_BPE_FILES, }, { "model_repo": "bigcode/starcoder2-3b", "model_arch": MODEL_ARCH.STARCODER2, + "model_parts": 1, + "model_type": HFModelFileType.SFT, "vocab_type": LLaMaVocabType.BPE, + "vocab_pre": ( + "\\p{N}", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + ), + "vocab_files": HF_TOKENIZER_BPE_FILES, }, { "model_repo": "openai-community/gpt2",