refactor: Simplify BPE pre-tokenizer mapping

teleprint-me 2024-05-22 16:57:29 -04:00
parent 12285b5325
commit 1957ca41f2
GPG key ID: B0D11345E65C4D48 (no known key found for this signature in database)


@@ -957,10 +957,10 @@ LLaMaVocabTypeNames: dict[LLaMaVocabType, str] = {
 #
 class HFModelFileType(IntEnum):
     UNK = auto()  # Unsupported file type
-    PT  = auto()  # PyTorch file type
+    SFT = auto()  # SafeTensor file type
     PTH = auto()  # PyTorch file type
     BIN = auto()  # Pickled file type
-    SFT = auto()  # SafeTensor file type
+    PT  = auto()  # PyTorch file type


 LLaMaVocabTypeNames: dict[LLaMaVocabType, str] = {
@@ -991,64 +991,53 @@ BPE_PRE_PROCESSOR_DEFAULT = (
     "[\\p{P}\\$\\+<=>\\^~\\|]+", BPE_PRE_TOKENIZER_DEFAULT, "\\p{N}+", "[0-9][0-9][0-9]",
 )

+# NOTE: GPT-2 is the standard default pre-tokenizer for all models
 BPE_PRE_TOKENIZERS = {
-    "default": GPT_PRE_TOKENIZER_DEFAULT,
-    # NOTE: LLAMA and DBRX (MODEL_ARCH_NAMES[MODEL_ARCH.DBRX]) are the same
-    MODEL_ARCH_NAMES[MODEL_ARCH.LLAMA]: (
+    # gpt2, olmo, phi (1, 1_5, 2, 3, ...)
+    "gpt2": (GPT_PRE_TOKENIZER_DEFAULT,),
+    # dbrx
+    "llama3": (
         "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
     ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.FALCON]: (
+    "falcon": (
         "[\\p{P}\\$\\+<=>\\^~\\|]+", BPE_PRE_TOKENIZER_DEFAULT, "[0-9][0-9][0-9]",
     ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.BAICHUAN]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.GROK]: (),
-    # NOTE: GPT-2 is the standard default pre-tokenizer for all models
-    MODEL_ARCH_NAMES[MODEL_ARCH.GPT2]: (
-        GPT_PRE_TOKENIZER_DEFAULT,
-    ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.GPTJ]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.GPTNEOX]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.MPT]: (
+    "mpt": (
         "\\s?\\p{L}+", "\\s?\\p{P}+", BPE_PRE_TOKENIZER_DEFAULT,
     ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.STARCODER]: (
+    # starcoder, refact, command-r
+    "starcoder": (
         "\\p{N}", BPE_PRE_TOKENIZER_DEFAULT,
     ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.REFACT]: (
-        "\\p{N}", BPE_PRE_TOKENIZER_DEFAULT,
-    ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.BERT]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.NOMIC_BERT]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.JINA_BERT_V2]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.BLOOM]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.STABLELM]: (
+    # qwen, qwen2, stablelm
+    "qwen": (
         "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
     ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.QWEN]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.QWEN2]: (
-        "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+    # NOTE: deepseek uses the 'llama' arch, but diverges with the pre-tok.
+    "deepseek": (
+        "[\r\n]",
+        "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿﬀ-ﬆﬓ-ﬗＡ-Ｚａ-ｚ𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+        "\\s?[!-/:-~！-／：-～‘-‟　-。]+",
+        "\\s+$",
+        "[一-龥ࠀ-一가-퟿]+",
+        "\\p{N}+",
     ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.QWEN2MOE]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.PHI2]: (GPT_PRE_TOKENIZER_DEFAULT,),
-    MODEL_ARCH_NAMES[MODEL_ARCH.PHI3]: (GPT_PRE_TOKENIZER_DEFAULT,),
-    MODEL_ARCH_NAMES[MODEL_ARCH.PLAMO]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.CODESHELL]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.ORION]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.INTERNLM2]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.MINICPM]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.GEMMA]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.STARCODER2]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.MAMBA]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.XVERSE]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.COMMAND_R]: (
-        "\\p{N}", BPE_PRE_TOKENIZER_DEFAULT,
-    ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.DBRX]: (
-        "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-    ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.OLMO]: (
-        GPT_PRE_TOKENIZER_DEFAULT,
+    "deepseek-coder": (
+        "[\r\n]",
+        "\\s?\\p{L}+",
+        "\\s?\\p{P}+",
+        "[一-龥ࠀ-一가-퟿]+",
+        "\\p{N}",
     ),
+    # NOTE: ONLY ADD MODELS ON AN AS-NEEDED BASIS.
+    # This will get out of control if not properly managed.
+    # This needs a proper solution. The short-term solution is to manually build a map here.
+    # A proper long-term solution would be to build a dynamic registry.
+    # The issue is that this requires a mapping or a database.
+    # Possible solutions are to use JSON, HDF5, or SQLite.
+    # Some of these mappings could be dynamically generated, but it's sketchy at best.
+    # Model versions should be included along with the model name to mitigate name conflicts.
+    # This entire setup is extremely fragile and will result in breaking changes in the future.
 }

 #
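
A minimal sketch of why the HFModelFileType reordering can matter, assuming callers probe candidate files in ascending enum value; the extension map and probing loop below are illustrative assumptions, not code from this diff:

from enum import IntEnum, auto

class HFModelFileType(IntEnum):
    UNK = auto()  # Unsupported file type
    SFT = auto()  # SafeTensor file type
    PTH = auto()  # PyTorch file type
    BIN = auto()  # Pickled file type
    PT  = auto()  # PyTorch file type

# Hypothetical extension map, not part of this commit.
MODEL_FILE_EXTENSIONS = {
    HFModelFileType.SFT: ".safetensors",
    HFModelFileType.PTH: ".pth",
    HFModelFileType.BIN: ".bin",
    HFModelFileType.PT: ".pt",
}

# Probing in ascending enum order now prefers SafeTensors over the
# pickle-based PyTorch formats when a checkpoint ships several file types.
for file_type in sorted(MODEL_FILE_EXTENSIONS):
    print(file_type.name, MODEL_FILE_EXTENSIONS[file_type])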
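The map is now keyed by pre-tokenizer name ("gpt2", "llama3", "falcon", ...) rather than by MODEL_ARCH_NAMES entries, so several architectures can share a single entry (see the "# gpt2, olmo, phi" and "# starcoder, refact, command-r" comments above). A minimal lookup sketch with the GPT-2 fallback; get_bpe_pre_tokenizer is a hypothetical helper and the one-entry table stands in for the full mapping:

# Placeholder standing in for the real constant defined earlier in the file.
GPT_PRE_TOKENIZER_DEFAULT = "<gpt-2 default pattern>"

BPE_PRE_TOKENIZERS: dict[str, tuple[str, ...]] = {
    # "gpt2" doubles as the fallback; the other entries are elided here.
    "gpt2": (GPT_PRE_TOKENIZER_DEFAULT,),
}

def get_bpe_pre_tokenizer(name: str) -> tuple[str, ...]:
    # Unknown pre-tokenizer names fall back to the GPT-2 default, matching
    # the NOTE that GPT-2 is the standard default pre-tokenizer for all models.
    return BPE_PRE_TOKENIZERS.get(name, BPE_PRE_TOKENIZERS["gpt2"])

print(get_bpe_pre_tokenizer("olmo"))  # no "olmo" key: falls back to "gpt2"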
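Each tuple holds regular expressions that split raw text into pre-tokens before BPE merges are applied. A runnable sketch using the "llama3" pattern from the diff and the third-party regex module (the stdlib re module does not support Unicode properties such as \p{L} and \p{N}); how the converter chains multiple patterns per tuple is not shown here:

import regex  # pip install regex

# The single "llama3" pattern from the mapping above, written as raw strings.
LLAMA3_PRE_TOKENIZER = (
    r"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])"
    r"|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}"
    r"| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
)

def pre_tokenize(text: str) -> list[str]:
    # Non-overlapping matches, taken in order, cover the whole input;
    # each match becomes one pre-token handed to the BPE merge stage.
    return regex.findall(LLAMA3_PRE_TOKENIZER, text)

print(pre_tokenize("Hello, world! It's 2024."))
# ['Hello', ',', ' world', '!', ' It', "'s", ' ', '202', '4', '.']
# \p{N}{1,3} splits digit runs into groups of at most three digits.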