feat: Add example script to automate generating tokenizer model checksums and tests

teleprint-me 2024-05-18 20:49:22 -04:00
parent 006bb60d27
commit 1a82573126


@@ -7,8 +7,6 @@ import os
import sys
from pathlib import Path
from tqdm import tqdm
# Necessary to load the local gguf package
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
sys.path.insert(0, str(Path(__file__).parent.parent))
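These two lines follow the usual llama.cpp convention of preferring the in-tree gguf-py package over any installed copy. A quick sanity check (illustrative only, not part of this commit):

import gguf
print(gguf.__file__)  # should resolve to the repo-local package when NO_LOCAL_GGUF is unset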
@@ -18,7 +16,119 @@ from gguf.huggingface_hub import HFVocabRequest
logger = logging.getLogger("gguf-gen-pre")
# NOTE: It's impossible to catch all edge cases.
# The most naive way to handle this is to have a pre-compiled Unicode list of all ~1.1 million
# code points, as the set is finite and ISO-standardized.
# This means we can predict the upper bound and can apply known time-complexity solutions to
# discover the best way to resolve it.
def test_pre_tok_params() -> list[str]:
    return [
        "ü, ǖ, ǘ, ǚ, ǜ",  # diaeresis
        "綠, 女, 怒, 玉, 句",  # pinyin
        "ied 4 ½ months",  # ordinal
        "¡Hola Mundo!",  # spanish
        "Olá Mundo!",  # portuguese
        "Selam Dünya!",  # turkish
        "Salam, dünýä!",  # turkmen
        "Γειά σου Κόσμε!",  # greek
        "हैलो वर्ल्ड!",  # hindi
        "สวัสดีชาวโลก!",  # thai
        "こんにちは世界!",  # japanese
        "你好世界!",  # chinese
        "Hàlo a Shaoghail!",  # gaelic
        "Chào thế giới!",  # vietnamese
        "Привет, мир!",  # russian
        "Здравей свят!",  # bulgarian
        "សួស្តី​ពិភពលោក!",  # khmer
        "Le rapide renard brun sauta par dessus le chien paresseux.",  # french
        "\tWil je een kopje thee?\n",  # dutch
        " Te gustaría algo de té ? ",  # spanish
        # NOTE: I expect right-to-left languages to fail
        "העלא וועלט!",  # yiddish (r-to-l)
        "سلام دنیا!",  # persian (r-to-l)
        "",  # empty string; a falsy value in Python, no symbols
        " ",
        "  ",
        "   ",
        "\t",
        "\n",
        "\n\n",
        "\n\n\n",
        "\t\n",
        "Hello world",
        " Hello world",
        "Hello World",
        " Hello World",
        " Hello World!",
        "Hello, world!",
        " Hello, world!",
        " this is 🦙.cpp",
        "w048 7tuijk dsdfhu",
        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
        "Hello",
        " Hello",
        "  Hello",
        "   Hello",
        "    Hello",
        "    Hello\n    Hello",
        " (",
        "\n =",
        "' era",
        "Hello, y'all! How are you 😁 局外人?苹果apple工作work3.14159天God",
        "3",
        "33",
        "333",
        "3333",
        "33333",
        "333333",
        "3333333",
    ]
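The NOTE above mentions a pre-compiled list of all Unicode code points as the exhaustive alternative to hand-picked cases. A minimal sketch of that idea (the helper name is hypothetical, not part of this commit): the code space is finite, 0x0 through 0x10FFFF, so the input set and its upper bound are known in advance.

import unicodedata

def all_assigned_codepoints() -> list[str]:
    # the Unicode code space is finite: 0x0 .. 0x10FFFF (~1.1 million points)
    chars = []
    for cp in range(0x110000):
        ch = chr(cp)
        # skip surrogates ('Cs') and unassigned code points ('Cn')
        if unicodedata.category(ch) in ("Cs", "Cn"):
            continue
        chars.append(ch)
    return chars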
def test_pre_tok(hf_voc_req: HFVocabRequest) -> None:
    # NOTE: aggregate all models to their respective paths
    from transformers import AutoTokenizer

    params = test_pre_tok_params()
    for model in hf_voc_req.models:
        # set the model path, e.g. 'models/meta-llama/Llama-2-7b-hf'
        path = Path(f"{hf_voc_req.model_path}/{model['repo']}")
        # set the model name, e.g. 'llama-2-7b-hf'
        name = path.stem.lower()
        # model input encodings, e.g. 'models/meta-llama/Llama-2-7b-hf/ggml-vocab-llama-2-7b-hf.inp'
        inp = path / f"ggml-vocab-{name}.inp"
        # model output encodings, e.g. 'models/meta-llama/Llama-2-7b-hf/ggml-vocab-llama-2-7b-hf.out'
        out = path / f"ggml-vocab-{name}.out"
        # extracted tokenizer model
        final = path / f"ggml-vocab-{name}.gguf"
        # skip the tokenizer folder if unavailable
        if not path.exists():
            logger.warning(f"skipped - {model['repo']} not found.")
            continue
        try:  # create the tokenizer
            tokenizer = AutoTokenizer.from_pretrained(path)
        except OSError as e:
            logger.error(f"{model['repo']} not found: {e}")
            continue  # skip this tokenizer model
        with open(inp, "w", encoding="utf-8") as f:
            for test in params:
                f.write(f"{test}")
                f.write("\n__ggml_vocab_test__\n")
        with open(out, "w", encoding="utf-8") as f:
            for test in params:
                encodings = tokenizer.encode(test, add_special_tokens=False)
                for encoding in encodings:
                    f.write(f" {encoding}")
                f.write("\n")
        logger.info(f"Tests for {model['repo']} written to {final}.*")
def generate_tokenizers(hf_voc_req: HFVocabRequest) -> None:
    pass
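generate_tokenizers is still a stub in this commit. One hypothetical way to fill it in (an assumption, not the author's implementation) is to shell out to the repo's convert-hf-to-gguf.py with --vocab-only, which produces the ggml-vocab-*.gguf files that the paths in test_pre_tok expect:

import subprocess

def generate_tokenizers(hf_voc_req: HFVocabRequest) -> None:
    for model in hf_voc_req.models:
        path = Path(f"{hf_voc_req.model_path}/{model['repo']}")
        if not path.exists():
            logger.warning(f"skipped - {model['repo']} not found.")
            continue
        outfile = path / f"ggml-vocab-{path.stem.lower()}.gguf"
        # --vocab-only converts just the tokenizer, not the model weights
        subprocess.run(
            ["python", "convert-hf-to-gguf.py", str(path), "--vocab-only", "--outfile", str(outfile)],
            check=True,
        )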
@@ -29,7 +139,13 @@ def main():
"-v", "--verbose", action="store_true", help="A huggingface read auth token"
)
parser.add_argument(
"-m", "--model-path", default=None, help="The models storage path"
"-m", "--model-path", default=None, help="The models storage path. Default is 'models/'."
)
parser.add_argument(
"-t", "--gen-tests", action="store_true", help="Generate the tokenizer tests. Default is False."
)
parser.add_argument(
"-g", "--gen-toks", action="store_true", help="Generate the gguf vocab files. Default is False."
)
args = parser.parse_args()
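With these flags in place, a typical invocation might look like the line below (the script filename is illustrative; this excerpt does not name it):

python gguf-gen-pre.py --model-path models/ --gen-tests --gen-toks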
@@ -42,5 +158,16 @@ def main():
        args.model_path, args.hf_auth_token, logger
    )
    hf_vocab_req.download_models()
    hf_vocab_req.generate_checksums()
    hf_vocab_req.log_pre_tokenizer_info()

    if args.gen_tests:
        test_pre_tok(hf_vocab_req)
    if args.gen_toks:
        generate_tokenizers(hf_vocab_req)
if __name__ == '__main__':
    main()
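main() delegates the checksum step named in the commit title to HFVocabRequest.generate_checksums(), whose implementation is outside this diff. For readers without that class at hand, a minimal sketch of what such a step typically involves (hypothetical code, not the actual HFVocabRequest internals):

import hashlib

def sha256sum(path: Path) -> str:
    # hash the file in chunks so large tokenizer files stay memory-friendly
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()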