feat: Add example script to automate generating tokenizer model checksums and tests
parent 006bb60d27
commit 1a82573126

1 changed file with 131 additions and 4 deletions
@@ -7,8 +7,6 @@ import os
import sys
from pathlib import Path

from tqdm import tqdm

# Necessary to load the local gguf package
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
    sys.path.insert(0, str(Path(__file__).parent.parent))

@@ -18,7 +16,119 @@ from gguf.huggingface_hub import HFVocabRequest
logger = logging.getLogger("gguf-gen-pre")


# NOTE: It's impossible to catch all edge cases.
# The most naive way to handle this is to have a pre-compiled unicode list of all
# 1.1 million characters, since that set is finite and ISO standardized.
# This means we can predict the upper bound and apply known time-complexity
# solutions to discover the best way to resolve it.
def test_pre_tok_params() -> list[str]:
    return [
        "ü, ǖ, ǘ, ǚ, ǜ",  # diaeresis
        "綠, 女, 怒, 玉, 句",  # chinese characters
        "ied 4 ½ months",  # ordinal
        "¡Hola Mundo!",  # spanish
        "Olá Mundo!",  # portuguese
        "Selam Dünya!",  # turkish
        "Salam, dünýä!",  # turkmen
        "Γειά σου Κόσμε!",  # greek
        "हैलो वर्ल्ड!",  # hindi
        "สวัสดีชาวโลก!",  # thai
        "こんにちは世界!",  # japanese
        "你好世界!",  # chinese
        "Hàlo a Shaoghail!",  # gaelic
        "Chào thế giới!",  # vietnamese
        "Привет, мир!",  # russian
        "Здравей свят!",  # bulgarian
        "សួស្តីពិភពលោក!",  # khmer
        "Le rapide renard brun sauta par dessus le chien paresseux.",  # french
        "\tWil je een kopje thee?\n",  # dutch
        " Te gustaría algo de té ? ",  # spanish
        # NOTE: I expect right-to-left languages to fail
        "העלא וועלט!",  # yiddish (r-to-l)
        "سلام دنیا!",  # persian (r-to-l)
        "",  # Why? The empty string is a falsy value in Python: no symbols.
        " ",
        "  ",
        "   ",
        "\t",
        "\n",
        "\n\n",
        "\n\n\n",
        "\t\n",
        "Hello world",
        " Hello world",
        "Hello World",
        " Hello World",
        " Hello World!",
        "Hello, world!",
        " Hello, world!",
        " this is 🦙.cpp",
        "w048 7tuijk dsdfhu",
        "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
        "Hello",
        " Hello",
        "  Hello",
        "   Hello",
        "    Hello",
        "    Hello\n    Hello",
        " (",
        "\n =",
        "' era",
        "Hello, y'all! How are you 😁 局外人?苹果apple工作work3.14159天God~",
        "3",
        "33",
        "333",
        "3333",
        "33333",
        "333333",
        "3333333",
    ]
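
The NOTE above test_pre_tok_params() points at a pre-compiled Unicode list rather than chasing edge cases one by one. As a rough sketch of that idea (not part of this commit), the codespace is capped at 0x110000 codepoints, so Python's unicodedata module can enumerate the assigned subset up front:

import unicodedata

def assigned_codepoints() -> list[str]:
    # The Unicode codespace is finite: U+0000..U+10FFFF, ~1.1 million slots.
    # Category "Cn" marks unassigned codepoints; everything else is standardized.
    return [
        chr(cp)
        for cp in range(0x110000)
        if unicodedata.category(chr(cp)) != "Cn"
    ]

Because the upper bound is fixed, any per-character pass over this list has a known worst case, which is the predictability the NOTE is after.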


def test_pre_tok(hf_voc_req: HFVocabRequest) -> None:
    # NOTE: aggregate all models to their respective paths
    from transformers import AutoTokenizer

    params = test_pre_tok_params()
    for model in hf_voc_req.models:
        # set the model path, e.g. 'models/meta-llama/Llama-2-7b-hf'
        path = Path(f"{hf_voc_req.model_path}/{model['repo']}")
        # set the model name, e.g. 'llama-2-7b-hf'
        name = path.stem.lower()
        # model input encodings, e.g. 'models/meta-llama/Llama-2-7b-hf/ggml-vocab-llama-2-7b-hf.inp'
        inp = path / f"ggml-vocab-{name}.inp"
        # model output encodings, e.g. 'models/meta-llama/Llama-2-7b-hf/ggml-vocab-llama-2-7b-hf.out'
        out = path / f"ggml-vocab-{name}.out"
        # extracted tokenizer model, e.g. 'models/meta-llama/Llama-2-7b-hf/ggml-vocab-llama-2-7b-hf.gguf'
        final = path / f"ggml-vocab-{name}.gguf"

        # skip tokenizer folder if unavailable
        if not path.exists():
            logger.warning(f"skipped - {model['repo']} not found.")
            continue

        try:  # create the tokenizer
            tokenizer = AutoTokenizer.from_pretrained(path)
        except OSError as e:
            logger.error(f"{model['repo']} not found: {e}")
            continue  # skip this tokenizer model

        with open(inp, "w", encoding="utf-8") as f:
            for test in params:
                f.write(f"{test}")
                f.write("\n__ggml_vocab_test__\n")

        with open(out, "w", encoding="utf-8") as f:
            for test in params:
                encodings = tokenizer.encode(test, add_special_tokens=False)
                for encoding in encodings:
                    f.write(f" {encoding}")
                f.write("\n")

        logger.info(f"Tests for {model['repo']} written in {final}.*")

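For orientation, the two with-blocks above define a simple paired format: the .inp file holds each raw test string followed by a __ggml_vocab_test__ separator line, and the .out file holds one line per test string with a space before every token id. Schematically, with made-up ids standing in for real encodings:

# ggml-vocab-llama-2-7b-hf.inp
Hello world
__ggml_vocab_test__
 Hello world
__ggml_vocab_test__

# ggml-vocab-llama-2-7b-hf.out
 15043 3186
 259 15043 3186

A test harness can then split .inp on the marker and compare its own encodings line by line against .out.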

def generate_tokenizers(hf_voc_req: HFVocabRequest) -> None:
    # stub for --gen-toks: gguf vocab file generation is not implemented yet
    pass

@@ -29,7 +139,13 @@ def main():
        "-v", "--verbose", action="store_true", help="Increase output verbosity."
    )
    parser.add_argument(
-        "-m", "--model-path", default=None, help="The models storage path"
+        "-m", "--model-path", default=None, help="The models storage path. Default is 'models/'."
    )
    parser.add_argument(
        "-t", "--gen-tests", action="store_true", help="Generate the tokenizer tests. Default is False."
    )
    parser.add_argument(
        "-g", "--gen-toks", action="store_true", help="Generate the gguf vocab files. Default is False."
    )
    args = parser.parse_args()

@@ -42,5 +158,16 @@ def main():
        args.model_path, args.hf_auth_token, logger
    )

    hf_vocab_req.download_models()
    hf_vocab_req.generate_checksums()
    hf_vocab_req.log_pre_tokenizer_info()

    if args.gen_tests:
        test_pre_tok(hf_vocab_req)

    if args.gen_toks:
        generate_tokenizers(hf_vocab_req)


if __name__ == '__main__':
    main()
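
Taken together, a minimal end-to-end driver mirroring main() might look like the sketch below (the equivalent of running the script with --gen-tests). It assumes the HFVocabRequest constructor takes (model_path, hf_auth_token, logger) in that order, as the call site in the diff suggests, and it reuses the test_pre_tok() defined above:

import logging

from gguf.huggingface_hub import HFVocabRequest

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("gguf-gen-pre")

# Fetch the tokenizer models, checksum them, then emit the test fixtures.
hf_vocab_req = HFVocabRequest("models", None, logger)  # model_path, hf_auth_token, logger
hf_vocab_req.download_models()
hf_vocab_req.generate_checksums()
hf_vocab_req.log_pre_tokenizer_info()
test_pre_tok(hf_vocab_req)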