refactor: Simplify BPE pre-tokenizer mapping

Author: teleprint-me, 2024-05-22 16:57:29 -04:00
Parent: 12285b5325
Commit: 1957ca41f2
GPG key ID: B0D11345E65C4D48 (no known key found for this signature in database)


@@ -957,10 +957,10 @@ LLaMaVocabTypeNames: dict[LLaMaVocabType, str] = {
 #
 class HFModelFileType(IntEnum):
     UNK = auto() # Unsupported file type
-    PT = auto() # PyTorch file type
-    SFT = auto() # SafeTensor file type
+    PTH = auto() # PyTorch file type
+    BIN = auto() # Pickled file type
+    SFT = auto() # SafeTensor file type
+    PT = auto() # PyTorch file type
 LLaMaVocabTypeNames: dict[LLaMaVocabType, str] = {
@@ -991,64 +991,53 @@ BPE_PRE_PROCESSOR_DEFAULT = (
     "[\\p{P}\\$\\+<=>\\^~\\|]+", BPE_PRE_TOKENIZER_DEFAULT, "\\p{N}+", "[0-9][0-9][0-9]",
 )
+# NOTE: GPT-2 is the standard default pre-tokenizer for all models
 BPE_PRE_TOKENIZERS = {
     "default": GPT_PRE_TOKENIZER_DEFAULT,
-    # NOTE: LLAMA and DBRX (MODEL_ARCH_NAMES[MODEL_ARCH.DBRX]) are the same
-    MODEL_ARCH_NAMES[MODEL_ARCH.LLAMA]: (
+    # gpt2, olmo, phi (1, 1_5, 2, 3, ...)
+    "gpt2": (GPT_PRE_TOKENIZER_DEFAULT,),
+    # dbrx
+    "llama3": (
         "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
     ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.FALCON]: (
+    "falcon": (
         "[\\p{P}\\$\\+<=>\\^~\\|]+", BPE_PRE_TOKENIZER_DEFAULT, "[0-9][0-9][0-9]",
     ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.BAICHUAN]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.GROK]: (),
-    # NOTE: GPT-2 is the standard default pre-tokenizer for all models
-    MODEL_ARCH_NAMES[MODEL_ARCH.GPT2]: (
-        GPT_PRE_TOKENIZER_DEFAULT,
-    ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.GPTJ]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.GPTNEOX]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.MPT]: (
+    "mpt": (
        "\\s?\\p{L}+", "\\s?\\p{P}+", BPE_PRE_TOKENIZER_DEFAULT,
     ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.STARCODER]: (
+    # starcoder, refact, command-r
+    "starcoder": (
         "\\p{N}", BPE_PRE_TOKENIZER_DEFAULT,
     ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.REFACT]: (
-        "\\p{N}", BPE_PRE_TOKENIZER_DEFAULT,
-    ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.BERT]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.NOMIC_BERT]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.JINA_BERT_V2]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.BLOOM]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.STABLELM]: (
+    # qwen, qwen2, stablelm
+    "qwen": (
         "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
     ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.QWEN]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.QWEN2]: (
-        "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+    # NOTE: deepseek uses the 'llama' arch, but diverges with the pre-tok.
+    "deepseek": (
+        "[\r\n]",
+        "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ--ℝℤΩℨK--ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA--z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+        "\\s?[!-/:-~---‟ -。]+",
+        "\\s+$",
+        "[一-龥ࠀ-一가-퟿]+",
+        "\\p{N}+",
     ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.QWEN2MOE]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.PHI2]: (GPT_PRE_TOKENIZER_DEFAULT,),
-    MODEL_ARCH_NAMES[MODEL_ARCH.PHI3]: (GPT_PRE_TOKENIZER_DEFAULT,),
-    MODEL_ARCH_NAMES[MODEL_ARCH.PLAMO]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.CODESHELL]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.ORION]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.INTERNLM2]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.MINICPM]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.GEMMA]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.STARCODER2]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.MAMBA]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.XVERSE]: (),
-    MODEL_ARCH_NAMES[MODEL_ARCH.COMMAND_R]: (
-        "\\p{N}", BPE_PRE_TOKENIZER_DEFAULT,
-    ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.DBRX]: (
-        "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-    ),
-    MODEL_ARCH_NAMES[MODEL_ARCH.OLMO]: (
-        GPT_PRE_TOKENIZER_DEFAULT,
+    "deepseek-coder": (
+        "[\r\n]",
+        "\\s?\\p{L}+",
+        "\\s?\\p{P}+",
+        "[一-龥ࠀ-一가-퟿]+",
+        "\\p{N}",
     ),
+    # NOTE: ONLY ADD MODELS ON A AS NEEDED BASIS.
+    # This will get out of control if not properly managed.
+    # This needs a proper solution. The short-term solution is to manually build a map here.
+    # A proper long-term solution would be to build a dynamic registry.
+    # The issue is that this requires a mapping or a database.
+    # Possible solutions are to use JSON, HDF5, or SQLite.
+    # Some of these mappings could be dynamically generated, but it's sketchy at best.
+    # Model versions should be included along with the model name to mitigate name conflicts.
+    # This entire setup is extremely fragile and will result in breaking changes in the future.
 }
 #
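
Note on usage: the new mapping is a plain name-to-regex-tuple table with a GPT-2 style "default" entry. Below is a minimal sketch of how a conversion script might resolve an entry from it; the helper name resolve_bpe_pre_tokenizer and the fallback behaviour are assumptions for illustration, not the repository's actual API.

# Sketch (assumption): pick the regex set for a model name from the mapping above.
def resolve_bpe_pre_tokenizer(model_name: str, pre_tokenizers: dict) -> tuple:
    entry = pre_tokenizers.get(model_name.strip().lower(), pre_tokenizers["default"])
    # "default" maps to a bare value while the named entries are tuples,
    # so normalize before handing the regexes to the pre-tokenizer.
    return entry if isinstance(entry, tuple) else (entry,)

# Hypothetical call site: regexes = resolve_bpe_pre_tokenizer("llama3", BPE_PRE_TOKENIZERS)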
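The closing NOTE block points at a longer-term direction: a dynamic registry backed by JSON, HDF5, or SQLite, keyed by model name plus version to avoid name conflicts. The sketch below is only an illustration of that idea under an assumed JSON schema; the class name, file name, and key format ("name" or "name:version") are invented here and are not part of the commit.

from __future__ import annotations

import json
from pathlib import Path


class BPEPreTokenizerRegistry:
    """Toy JSON-backed registry for pre-tokenizer regex sets (illustrative only)."""

    def __init__(self, path: Path) -> None:
        # Assumed schema: {"default": [...], "llama3": [...], "deepseek:v2": [...], ...}
        self._entries: dict[str, list[str]] = json.loads(path.read_text(encoding="utf-8"))

    def lookup(self, name: str, version: str | None = None) -> list[str]:
        # Prefer a versioned key to mitigate name conflicts, as the NOTE suggests.
        if version is not None:
            versioned = f"{name}:{version}"
            if versioned in self._entries:
                return self._entries[versioned]
        return self._entries.get(name, self._entries["default"])


# Hypothetical usage:
# registry = BPEPreTokenizerRegistry(Path("bpe_pre_tokenizers.json"))
# regexes = registry.lookup("deepseek", version="v2")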