convert : add t5 tokenizer tests
This commit is contained in:
parent
6dc9eb4040
commit
9eb5d5617d
27 changed files with 47 additions and 4 deletions
|
@@ -45,6 +45,7 @@ class TOKENIZER_TYPE(IntEnum):
|
|||
SPM = auto()
|
||||
BPE = auto()
|
||||
WPM = auto()
|
||||
UGM = auto()
|
||||
|
||||
|
||||
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
|
||||
|
@@ -85,6 +86,7 @@ models = [
|
|||
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
|
||||
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
|
||||
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
|
||||
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
|
||||
]
|
||||
|
||||
|
||||
|
@@ -106,9 +108,13 @@ def download_model(model):
|
|||
os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
|
||||
|
||||
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
|
||||
|
||||
if tokt == TOKENIZER_TYPE.SPM:
|
||||
files.append("tokenizer.model")
|
||||
|
||||
if tokt == TOKENIZER_TYPE.UGM:
|
||||
files.append("spiece.model")
|
||||
|
||||
for file in files:
|
||||
save_path = f"models/tokenizers/{name}/{file}"
|
||||
if os.path.isfile(save_path):
|
||||
|
@@ -131,7 +137,7 @@ for model in models:
|
|||
name = model["name"]
|
||||
tokt = model["tokt"]
|
||||
|
||||
- if tokt == TOKENIZER_TYPE.SPM:
|
||||
+ if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
|
||||
continue
|
||||
|
||||
# Skip if the tokenizer folder does not exist or there are other download issues previously
|
||||
|
@@ -262,6 +268,7 @@ tests = [
|
|||
"\n =",
|
||||
"' era",
|
||||
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
|
||||
"!!!!!!",
|
||||
"3",
|
||||
"33",
|
||||
"333",
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue