From b1c922fec79dfbc3e3140a84f39ef19d08f3411d Mon Sep 17 00:00:00 2001
From: teleprint-me <77757836+teleprint-me@users.noreply.github.com>
Date: Mon, 27 May 2024 00:06:39 -0400
Subject: [PATCH] feat: Add a proto sketch for handling model vocab metadata

---
 gguf-py/scripts/gguf-registry.py | 100 ++++++++++++++++++++++++++++---
 1 file changed, 91 insertions(+), 9 deletions(-)

diff --git a/gguf-py/scripts/gguf-registry.py b/gguf-py/scripts/gguf-registry.py
index 1f2dd9eb7..f7b2ea30c 100644
--- a/gguf-py/scripts/gguf-registry.py
+++ b/gguf-py/scripts/gguf-registry.py
@@ -54,6 +54,15 @@ HF_MODEL_MAP = (
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_SPM_FILES,
     },
+    {
+        "model_repo": "mistralai/Mistral-7B-Instruct-v0.1",
+        "model_arch": MODEL_ARCH.LLAMA,
+        "model_parts": 2,
+        "model_type": ModelFileType.SAFETENSORS,
+        "vocab_type": VocabType.SPM,
+        "vocab_pre": None,
+        "vocab_files": HF_TOKENIZER_SPM_FILES,
+    },
     {
         "model_repo": "mistralai/Mistral-7B-Instruct-v0.2",
         "model_arch": MODEL_ARCH.LLAMA,
@@ -63,6 +72,15 @@ HF_MODEL_MAP = (
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_SPM_FILES,
     },
+    { # NOTE: Mistral v0.3 has a 'tokenizer.model.v3' file
+        "model_repo": "mistralai/Mistral-7B-Instruct-v0.3",
+        "model_arch": MODEL_ARCH.LLAMA,
+        "model_parts": 3,
+        "model_type": ModelFileType.SAFETENSORS,
+        "vocab_type": VocabType.SPM,
+        "vocab_pre": None,
+        "vocab_files": HF_TOKENIZER_SPM_FILES,
+    },
     {
         "model_repo": "mistralai/Mixtral-8x7B-Instruct-v0.1",
         "model_arch": MODEL_ARCH.LLAMA,
@@ -219,15 +237,15 @@ HF_MODEL_MAP = (
         "vocab_pre": None,
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
-    { # NOTE: I don't have access to this model
-        "model_repo": "databricks/dbrx-base",
-        "model_arch": MODEL_ARCH.DBRX,
-        "model_parts": 0,
-        "model_type": ModelFileType.SAFETENSORS,
-        "vocab_type": VocabType.BPE,
-        "vocab_pre": None,
-        "vocab_files": HF_TOKENIZER_BPE_FILES,
-    },
+    # { # NOTE: I don't have access to this model
+    #     "model_repo": "databricks/dbrx-base",
+    #     "model_arch": MODEL_ARCH.DBRX,
+    #     "model_parts": 0,
+    #     "model_type": ModelFileType.SAFETENSORS,
+    #     "vocab_type": VocabType.BPE,
+    #     "vocab_pre": None,
+    #     "vocab_files": HF_TOKENIZER_BPE_FILES,
+    # },
     { # NOTE: RoBERTa post processor
         "model_repo": "jinaai/jina-embeddings-v2-base-es",
         "model_arch": MODEL_ARCH.JINA_BERT_V2,
@@ -274,3 +292,67 @@ HF_MODEL_MAP = (
         "vocab_files": HF_TOKENIZER_BPE_FILES,
     },
 )
+
+
+def get_arguments() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("auth_token", help="A huggingface read auth token")
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="Increase output verbosity."
+    )
+    parser.add_argument(
+        "--model-path",
+        default="models",
+        help="The models storage path. Default is 'models'.",
+    )
+    return parser.parse_args()
+
+
+args = get_arguments()
+
+if args.verbose:
+    logging.basicConfig(level=logging.DEBUG)
+else:
+    logging.basicConfig(level=logging.INFO)
+
+hub_model = HFHubModel(
+    auth_token=args.auth_token,
+    model_path=args.model_path,
+    logger=logger,
+)
+
+hub_tokenizer = HFHubTokenizer(
+    model_path=args.model_path,
+    logger=logger,
+)
+
+
+for model in HF_MODEL_MAP:
+
+    model_repo = model["model_repo"]
+    model_arch = model["model_arch"]
+    vocab_type = model["vocab_type"]
+
+    print(
+        "HUB_REPO:", model_repo,
+        "LLAMA_ARCH:", MODEL_ARCH_NAMES[model_arch]
+    )
+
+    hub_model.download_all_vocab_files(
+        model_repo=model_repo,
+        vocab_type=vocab_type,
+    )
+    # log the downloaded results
+    hub_tokenizer.log_tokenizer_json_info(model_repo)
+
+    normalizer = hub_tokenizer.get_normalizer(model_repo)
+    # extract the normalizer metadata
+
+    pre_tokenizer = hub_tokenizer.get_pre_tokenizer(model_repo)
+    # extract the pre-tokenizer metadata
+
+    added_tokens = hub_tokenizer.get_added_tokens(model_repo)
+    # extract the added tokens metadata
+
+    sha256sum = hub_tokenizer.get_tokenizer_json_hash(model_repo)
+    # use the hash to validate the model's vocabulary