From aed0573f68df3814baa929f34fed775c7dcdbd3f Mon Sep 17 00:00:00 2001 From: teleprint-me <77757836+teleprint-me@users.noreply.github.com> Date: Tue, 21 May 2024 00:14:26 -0400 Subject: [PATCH] proto: Add experimental vocab pre-tokenizer regular expressions --- gguf-py/gguf/constants.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 219d82e42..71fa305fb 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -973,6 +973,12 @@ class LLaMaModelType(IntEnum): SFT = auto() # SafeTensor file type +# NOTE: Tokenizers defaults to OpenAI GPT-2 Byte Level Reg-Exp +# The pattern is a Perl-compatible regular expression; the split boundaries it produces are technically arbitrary. +# https://github.com/openai/gpt-2/blob/master/src/encoder.py#L53 +# https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/pre_tokenizers/byte_level.rs#L40-L42 +LLAMA_TOKENIZER_DEFAULT_PRE = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+" + # # HuggingFace Model Map # @@ -983,8 +989,20 @@ class LLaMaModelType(IntEnum): # - Possible algorithms are WordLevel, BPE, WordPiece, or Unigram # - Possible LLaMa tokenizer model types are: None, SPM, BPE, or WPM HF_MODEL_MAP = ( -    {"model_arch": MODEL_ARCH.LLAMA, "vocab_type": LLaMaVocabType.SPM, "repo": "meta-llama/Llama-2-7b-hf", }, -    {"model_arch": MODEL_ARCH.LLAMA, "vocab_type": LLaMaVocabType.BPE, "repo": "meta-llama/Meta-Llama-3-8B", }, +    { +        "repo": "meta-llama/Llama-2-7b-hf", +        "model_arch": MODEL_ARCH.LLAMA, +        "vocab_type": LLaMaVocabType.SPM, +        "vocab_pre": [], +    }, +    { +        "model_arch": MODEL_ARCH.LLAMA, +        "vocab_type": LLaMaVocabType.BPE, +        "repo": "meta-llama/Meta-Llama-3-8B", +        "vocab_pre": [ +            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" +        ], +    }, {"model_arch": MODEL_ARCH.PHI3, "vocab_type": 
LLaMaVocabType.SPM, "repo": "microsoft/Phi-3-mini-4k-instruct", }, {"model_arch": MODEL_ARCH.LLAMA, "vocab_type": LLaMaVocabType.BPE, "repo": "deepseek-ai/deepseek-llm-7b-base", }, {"model_arch": MODEL_ARCH.LLAMA, "vocab_type": LLaMaVocabType.BPE, "repo": "deepseek-ai/deepseek-coder-6.7b-base", },