chore: Move enums and model map to constants

teleprint-me 2024-05-20 14:51:03 -04:00
parent d9ba963cd4
commit 2fa2c7a86c
No known key found for this signature in database
GPG key ID: B0D11345E65C4D48
2 changed files with 51 additions and 46 deletions


@@ -954,6 +954,55 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
 }
+
+#
+# LLaMa Tokenizer Types
+#
+class LLaMaVocabType(IntEnum):
+    NON = auto()  # For models without vocab
+    SPM = auto()  # SentencePiece LLaMa tokenizer
+    BPE = auto()  # BytePair GPT-2 tokenizer
+    WPM = auto()  # WordPiece BERT tokenizer
+
+
+class LLaMaModelType(IntEnum):
+    UNK = auto()  # Unsupported file type
+    PTH = auto()  # PyTorch file type
+    SFT = auto()  # SafeTensor file type
+
+
+#
+# LLaMa Tokenizer Map
+#
+# NOTE:
+# - Repository paths are required
+# - Allow the user to specify the tokenizer model type themselves
+# - Use architecture types because they are explicitly defined
+# - Possible tokenizer model types are: SentencePiece, WordPiece, or BytePair
+HF_MODEL_MAP = (
+    {"model_arch": MODEL_ARCH.LLAMA, "vocab_type": LLaMaVocabType.SPM, "repo": "meta-llama/Llama-2-7b-hf", },
+    {"model_arch": MODEL_ARCH.LLAMA, "vocab_type": LLaMaVocabType.BPE, "repo": "meta-llama/Meta-Llama-3-8B", },
+    {"model_arch": MODEL_ARCH.PHI3, "vocab_type": LLaMaVocabType.SPM, "repo": "microsoft/Phi-3-mini-4k-instruct", },
+    {"model_arch": MODEL_ARCH.LLAMA, "vocab_type": LLaMaVocabType.BPE, "repo": "deepseek-ai/deepseek-llm-7b-base", },
+    {"model_arch": MODEL_ARCH.LLAMA, "vocab_type": LLaMaVocabType.BPE, "repo": "deepseek-ai/deepseek-coder-6.7b-base", },
+    {"model_arch": MODEL_ARCH.FALCON, "vocab_type": LLaMaVocabType.BPE, "repo": "tiiuae/falcon-7b", },
+    {"model_arch": MODEL_ARCH.BERT, "vocab_type": LLaMaVocabType.WPM, "repo": "BAAI/bge-small-en-v1.5", },
+    {"model_arch": MODEL_ARCH.MPT, "vocab_type": LLaMaVocabType.BPE, "repo": "mosaicml/mpt-7b", },
+    {"model_arch": MODEL_ARCH.STARCODER2, "vocab_type": LLaMaVocabType.BPE, "repo": "bigcode/starcoder2-3b", },
+    {"model_arch": MODEL_ARCH.GPT2, "vocab_type": LLaMaVocabType.BPE, "repo": "openai-community/gpt2", },
+    {"model_arch": MODEL_ARCH.REFACT, "vocab_type": LLaMaVocabType.BPE, "repo": "smallcloudai/Refact-1_6-base", },
+    {"model_arch": MODEL_ARCH.COMMAND_R, "vocab_type": LLaMaVocabType.BPE, "repo": "CohereForAI/c4ai-command-r-v01", },
+    {"model_arch": MODEL_ARCH.QWEN2, "vocab_type": LLaMaVocabType.BPE, "repo": "Qwen/Qwen1.5-7B", },
+    {"model_arch": MODEL_ARCH.OLMO, "vocab_type": LLaMaVocabType.BPE, "repo": "allenai/OLMo-1.7-7B-hf", },
+    {"model_arch": MODEL_ARCH.DBRX, "vocab_type": LLaMaVocabType.BPE, "repo": "databricks/dbrx-base", },
+    {"model_arch": MODEL_ARCH.JINA_BERT_V2, "vocab_type": LLaMaVocabType.WPM, "repo": "jinaai/jina-embeddings-v2-base-en", },
+    {"model_arch": MODEL_ARCH.JINA_BERT_V2, "vocab_type": LLaMaVocabType.BPE, "repo": "jinaai/jina-embeddings-v2-base-es", },
+    {"model_arch": MODEL_ARCH.JINA_BERT_V2, "vocab_type": LLaMaVocabType.BPE, "repo": "jinaai/jina-embeddings-v2-base-de", },
+    {"model_arch": MODEL_ARCH.PHI2, "vocab_type": LLaMaVocabType.BPE, "repo": "microsoft/phi-1", },
+    {"model_arch": MODEL_ARCH.STABLELM, "vocab_type": LLaMaVocabType.BPE, "repo": "stabilityai/stablelm-2-zephyr-1_6b", },
+    {"model_arch": MODEL_ARCH.LLAMA, "vocab_type": LLaMaVocabType.SPM, "repo": "mistralai/Mistral-7B-Instruct-v0.2", },
+    {"model_arch": MODEL_ARCH.LLAMA, "vocab_type": LLaMaVocabType.SPM, "repo": "mistralai/Mixtral-8x7B-Instruct-v0.1", },
+)
 
 # Aliases for backward compatibility.
 # general
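For reference, a minimal sketch of how the relocated constants can be queried once this lands. The helper below is hypothetical (not part of this commit), and it assumes the package is importable as gguf, with HF_MODEL_MAP and LLaMaVocabType exported from gguf.constants as defined above:

    from gguf.constants import HF_MODEL_MAP, LLaMaVocabType

    def resolve_vocab_type(repo: str) -> LLaMaVocabType:
        # Hypothetical helper: scan the registry for a matching repo path.
        for entry in HF_MODEL_MAP:
            if entry["repo"] == repo:
                return entry["vocab_type"]
        return LLaMaVocabType.NON  # unknown repos fall back to "no vocab"

    assert resolve_vocab_type("meta-llama/Llama-2-7b-hf") is LLaMaVocabType.SPM
    assert resolve_vocab_type("tiiuae/falcon-7b") is LLaMaVocabType.BPE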


@@ -2,56 +2,12 @@ import json
 import logging
 import os
 import pathlib
-from enum import IntEnum, auto
 from hashlib import sha256
 
 import requests
 from transformers import AutoTokenizer
 
-from .constants import MODEL_ARCH
+from .constants import HF_MODEL_MAP, LLaMaModelType, LLaMaVocabType
-
-
-class LLaMaVocabType(IntEnum):
-    NON = auto()  # For models without vocab
-    SPM = auto()  # SentencePiece LLaMa tokenizer
-    BPE = auto()  # BytePair GPT-2 tokenizer
-    WPM = auto()  # WordPiece BERT tokenizer
-
-
-class LLaMaModelType(IntEnum):
-    PTH = auto()  # PyTorch
-    SFT = auto()  # SafeTensor
-
-
-# NOTE:
-# - Repository paths are required
-# - Allow the user to specify the tokenizer model type themselves
-# - Use architecture types because they are explicitly defined
-# - Possible tokenizer model types are: SentencePiece, WordPiece, or BytePair
-MODELS = (
-    {"model_arch": MODEL_ARCH.LLAMA, "vocab_type": LLaMaVocabType.SPM, "repo": "meta-llama/Llama-2-7b-hf", },
-    {"model_arch": MODEL_ARCH.LLAMA, "vocab_type": LLaMaVocabType.BPE, "repo": "meta-llama/Meta-Llama-3-8B", },
-    {"model_arch": MODEL_ARCH.PHI3, "vocab_type": LLaMaVocabType.SPM, "repo": "microsoft/Phi-3-mini-4k-instruct", },
-    {"model_arch": MODEL_ARCH.LLAMA, "vocab_type": LLaMaVocabType.BPE, "repo": "deepseek-ai/deepseek-llm-7b-base", },
-    {"model_arch": MODEL_ARCH.LLAMA, "vocab_type": LLaMaVocabType.BPE, "repo": "deepseek-ai/deepseek-coder-6.7b-base", },
-    {"model_arch": MODEL_ARCH.FALCON, "vocab_type": LLaMaVocabType.BPE, "repo": "tiiuae/falcon-7b", },
-    {"model_arch": MODEL_ARCH.BERT, "vocab_type": LLaMaVocabType.WPM, "repo": "BAAI/bge-small-en-v1.5", },
-    {"model_arch": MODEL_ARCH.MPT, "vocab_type": LLaMaVocabType.BPE, "repo": "mosaicml/mpt-7b", },
-    {"model_arch": MODEL_ARCH.STARCODER2, "vocab_type": LLaMaVocabType.BPE, "repo": "bigcode/starcoder2-3b", },
-    {"model_arch": MODEL_ARCH.GPT2, "vocab_type": LLaMaVocabType.BPE, "repo": "openai-community/gpt2", },
-    {"model_arch": MODEL_ARCH.REFACT, "vocab_type": LLaMaVocabType.BPE, "repo": "smallcloudai/Refact-1_6-base", },
-    {"model_arch": MODEL_ARCH.COMMAND_R, "vocab_type": LLaMaVocabType.BPE, "repo": "CohereForAI/c4ai-command-r-v01", },
-    {"model_arch": MODEL_ARCH.QWEN2, "vocab_type": LLaMaVocabType.BPE, "repo": "Qwen/Qwen1.5-7B", },
-    {"model_arch": MODEL_ARCH.OLMO, "vocab_type": LLaMaVocabType.BPE, "repo": "allenai/OLMo-1.7-7B-hf", },
-    {"model_arch": MODEL_ARCH.DBRX, "vocab_type": LLaMaVocabType.BPE, "repo": "databricks/dbrx-base", },
-    {"model_arch": MODEL_ARCH.JINA_BERT_V2, "vocab_type": LLaMaVocabType.WPM, "repo": "jinaai/jina-embeddings-v2-base-en", },
-    {"model_arch": MODEL_ARCH.JINA_BERT_V2, "vocab_type": LLaMaVocabType.BPE, "repo": "jinaai/jina-embeddings-v2-base-es", },
-    {"model_arch": MODEL_ARCH.JINA_BERT_V2, "vocab_type": LLaMaVocabType.BPE, "repo": "jinaai/jina-embeddings-v2-base-de", },
-    {"model_arch": MODEL_ARCH.PHI2, "vocab_type": LLaMaVocabType.BPE, "repo": "microsoft/phi-1", },
-    {"model_arch": MODEL_ARCH.STABLELM, "vocab_type": LLaMaVocabType.BPE, "repo": "stabilityai/stablelm-2-zephyr-1_6b", },
-    {"model_arch": MODEL_ARCH.LLAMA, "vocab_type": LLaMaVocabType.SPM, "repo": "mistralai/Mistral-7B-Instruct-v0.2", },
-    {"model_arch": MODEL_ARCH.LLAMA, "vocab_type": LLaMaVocabType.SPM, "repo": "mistralai/Mixtral-8x7B-Instruct-v0.1", },
-)
 
 
 class HFHubRequest:
@@ -125,7 +81,7 @@ class HFHubBase:
         self.logger = logger
         self._hub = HFHubRequest(auth_token, logger)
-        self._models = list(MODELS)
+        self._models = list(HF_MODEL_MAP)
 
     @property
     def hub(self) -> HFHubRequest:
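Downstream, HFHubBase now seeds its model list from the shared constant instead of a module-local MODELS tuple. A rough usage sketch, with caveats: the module path is assumed, the constructor may take parameters beyond the auth_token and logger visible in this hunk, and _models is the attribute populated in __init__ above:

    import logging

    from gguf.huggingface_hub import HFHubBase  # assumed module path

    hub_base = HFHubBase(auth_token=None, logger=logging.getLogger(__name__))

    # _models is list(HF_MODEL_MAP), so each record carries arch, vocab type, and repo.
    for entry in hub_base._models:
        print(f"{entry['repo']}: {entry['vocab_type'].name}")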