feat: Add example script automating tokenizer model checksum and test generation
This commit is contained in:
parent 006bb60d27
commit 1a82573126

1 changed file with 131 additions and 4 deletions
@@ -7,8 +7,6 @@ import os
 import sys
 from pathlib import Path
 
-from tqdm import tqdm
-
 # Necessary to load the local gguf package
 if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
     sys.path.insert(0, str(Path(__file__).parent.parent))
@@ -18,7 +16,119 @@ from gguf.huggingface_hub import HFVocabRequest
 logger = logging.getLogger("gguf-gen-pre")
 
 
-def test_pre_tok(content) -> None:
+# NOTE: It's impossible to catch all edge cases.
+# The most naive way to handle this is to have a pre-compiled Unicode list of all 1.1 million characters,
+# as it's finite and ISO-standardized.
+# This means we can predict the upper bound and can apply known time-complexity solutions to
+# discover the best way to resolve it.
+def test_pre_tok_params() -> list[str]:
+    return [
+        "ü, ǖ, ǘ, ǚ, ǜ",  # diaeresis
+        "綠, 女, 怒, 玉, 句",  # pinyin
+        "ied 4 ½ months",  # ordinal
+        "¡Hola Mundo!",  # spanish
+        "Olá Mundo!",  # portuguese
+        "Selam Dünya!",  # turkish
+        "Salam, dünýä!",  # turkmen
+        "Γειά σου Κόσμε!",  # greek
+        "हैलो वर्ल्ड!",  # hindi
+        "สวัสดีชาวโลก!",  # thai
+        "こんにちは世界!",  # japanese
+        "你好世界!",  # chinese
+        "Hàlo a Shaoghail!",  # gaelic
+        "Chào thế giới!",  # vietnamese
+        "Привет, мир!",  # russian
+        "Здравей свят!",  # bulgarian
+        "សួស្តីពិភពលោក!",  # khmer
+        "Le rapide renard brun sauta par dessus le chien paresseux.",  # french
+        "\tWil je een kopje thee?\n",  # dutch
+        " Te gustaría algo de té ? ",  # spanish
+        # NOTE: I expect right-to-left languages to fail
+        "העלא וועלט!",  # yiddish (r-to-l)
+        "سلام دنیا!",  # persian (r-to-l)
+        "",  # Why? This is a falsy value in Python; no symbols.
+        " ",
+        "  ",
+        "   ",
+        "\t",
+        "\n",
+        "\n\n",
+        "\n\n\n",
+        "\t\n",
+        "Hello world",
+        " Hello world",
+        "Hello World",
+        " Hello World",
+        " Hello World!",
+        "Hello, world!",
+        " Hello, world!",
+        " this is 🦙.cpp",
+        "w048 7tuijk dsdfhu",
+        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
+        "Hello",
+        " Hello",
+        "  Hello",
+        "   Hello",
+        "    Hello",
+        "    Hello\n    Hello",
+        " (",
+        "\n =",
+        "' era",
+        "Hello, y'all! How are you 😁 局外人?苹果apple工作work3.14159天God~",
+        "3",
+        "33",
+        "333",
+        "3333",
+        "33333",
+        "333333",
+        "3333333",
+    ]
+
+
+def test_pre_tok(hf_voc_req: HFVocabRequest) -> None:
+    # NOTE: aggregate all models to their respective paths
+    from transformers import AutoTokenizer
+
+    params = test_pre_tok_params()
+    for model in hf_voc_req.models:
+        # set the model path, e.g. 'models/meta-llama/Llama-2-7b-hf'
+        path = Path(f"{hf_voc_req.model_path}/{model['repo']}")
+        # set the model name, e.g. llama-2-7b-hf
+        name = path.stem.lower()
+        # model input encodings, e.g. 'models/meta-llama/Llama-2-7b-hf/ggml-vocab-llama-2-7b-hf.inp'
+        inp = path / f"ggml-vocab-{name}.inp"
+        # model output encodings, e.g. 'models/meta-llama/Llama-2-7b-hf/ggml-vocab-llama-2-7b-hf.out'
+        out = path / f"ggml-vocab-{name}.out"
+        # extracted tokenizer model
+        final = path / f"ggml-vocab-{name}.gguf"
+
+        # skip tokenizer folder if unavailable
+        if not path.exists():
+            logger.warning(f"skipped - {model['repo']} not found.")
+            continue
+
+        try:  # create the tokenizer
+            tokenizer = AutoTokenizer.from_pretrained(path)
+        except OSError as e:
+            logger.error(f"{model['repo']} not found: {e}")
+            continue  # skip this tokenizer model
+
+        with open(inp, "w", encoding="utf-8") as f:
+            for test in params:
+                f.write(f"{test}")
+                f.write("\n__ggml_vocab_test__\n")
+
+        with open(out, "w", encoding="utf-8") as f:
+            for test in params:
+                encodings = tokenizer.encode(test, add_special_tokens=False)
+                for encoding in encodings:
+                    f.write(f" {encoding}")
+                f.write("\n")
+
+        logger.info(f"Tests for {model['repo']} written to {path}/ggml-vocab-{name}.*")
+
+
+def generate_tokenizers(hf_voc_req: HFVocabRequest) -> None:
     pass
 
 
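Note, not part of the diff: the .inp/.out pair written by test_pre_tok above is a plain-text format — each test string in the .inp file is terminated by a "\n__ggml_vocab_test__\n" separator, and the matching line in the .out file carries the space-prefixed token ids. A minimal reader sketch under that assumption; read_vocab_tests is a hypothetical helper, not in this commit:

# A minimal sketch, not part of the commit: read a generated .inp/.out pair
# back and re-pair each input string with its token ids.
from pathlib import Path

SEPARATOR = "\n__ggml_vocab_test__\n"

def read_vocab_tests(path: Path, name: str) -> list[tuple[str, list[int]]]:
    # every test string is terminated by the separator, so the final
    # split chunk is empty and gets dropped
    raw = (path / f"ggml-vocab-{name}.inp").read_text(encoding="utf-8")
    inputs = raw.split(SEPARATOR)[:-1]
    # each output line is a run of space-prefixed token ids
    lines = (path / f"ggml-vocab-{name}.out").read_text(encoding="utf-8").splitlines()
    outputs = [[int(token_id) for token_id in line.split()] for line in lines]
    assert len(inputs) == len(outputs), "inp/out pair out of sync"
    return list(zip(inputs, outputs))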
@@ -29,7 +139,13 @@ def main():
         "-v", "--verbose", action="store_true", help="A huggingface read auth token"
     )
     parser.add_argument(
-        "-m", "--model-path", default=None, help="The models storage path"
+        "-m", "--model-path", default=None, help="The models storage path. Default is 'models/'."
+    )
+    parser.add_argument(
+        "-t", "--gen-tests", action="store_true", help="Generate the tokenizer tests. Default is False."
+    )
+    parser.add_argument(
+        "-g", "--gen-toks", action="store_true", help="Generate the gguf vocab files. Default is False."
     )
     args = parser.parse_args()
 
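Note, not part of the diff: with these flags a typical invocation might look like `python gguf-gen-pre.py -m models -t -g` — the script filename here is assumed from the logger name, and the HuggingFace read token is supplied through the existing auth-token argument used below.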
@@ -42,5 +158,16 @@ def main():
         args.model_path, args.hf_auth_token, logger
     )
 
+    hf_vocab_req.download_models()
+    hf_vocab_req.generate_checksums()
+    hf_vocab_req.log_pre_tokenizer_info()
+
+    if args.gen_tests:
+        test_pre_tok(hf_vocab_req)
+
+    if args.gen_toks:
+        generate_tokenizers(hf_vocab_req)
+
+
 if __name__ == '__main__':
     main()
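Note, not part of the diff: generate_tokenizers is left as a stub in this commit. One possible shape, sketched on the assumption that llama.cpp's existing convert-hf-to-gguf.py converter (with its --vocab-only and --outfile options) does the actual extraction and is run from the repository root:

# A sketch only, not the commit's implementation: shell out to the existing
# converter for each downloaded model to produce the standalone vocab gguf.
import subprocess
import sys

def generate_tokenizers(hf_voc_req: HFVocabRequest) -> None:
    for model in hf_voc_req.models:
        path = Path(f"{hf_voc_req.model_path}/{model['repo']}")
        if not path.exists():
            logger.warning(f"skipped - {model['repo']} not found.")
            continue
        name = path.stem.lower()
        outfile = path / f"ggml-vocab-{name}.gguf"
        # --vocab-only extracts just the tokenizer into a gguf file
        subprocess.run(
            [sys.executable, "convert-hf-to-gguf.py", str(path),
             "--vocab-only", "--outfile", str(outfile)],
            check=True,
        )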