feat: Add example script to automate generating tokenizer model checksums and tests

teleprint-me 2024-05-18 20:49:22 -04:00
parent 006bb60d27
commit 1a82573126


@@ -7,8 +7,6 @@ import os
import sys
from pathlib import Path
from tqdm import tqdm
# Necessary to load the local gguf package
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
sys.path.insert(0, str(Path(__file__).parent.parent))
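These two lines follow the usual llama.cpp convention of preferring the in-tree gguf-py package over any installed copy. A quick sanity check (illustrative only, not part of this commit):

import gguf
print(gguf.__file__)  # should resolve to the repo-local package when NO_LOCAL_GGUF is unset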
@@ -18,7 +16,119 @@ from gguf.huggingface_hub import HFVocabRequest
logger = logging.getLogger("gguf-gen-pre")
# NOTE: It's impossible to catch all edge cases.
# The most naive way to handle this is to have a pre-compiled Unicode list of all ~1.1 million
# code points, as the set is finite and ISO-standardized.
# This means we can predict the upper bound and can apply known time-complexity solutions to
# discover the best way to resolve it.
def test_pre_tok_params() -> list[str]:
    return [
        "ü, ǖ, ǘ, ǚ, ǜ",  # diaeresis
        "綠, 女, 怒, 玉, 句",  # pinyin
        "ied 4 ½ months",  # ordinal
        "¡Hola Mundo!",  # spanish
        "Olá Mundo!",  # portuguese
        "Selam Dünya!",  # turkish
        "Salam, dünýä!",  # turkmen
        "Γειά σου Κόσμε!",  # greek
        "हैलो वर्ल्ड!",  # hindi
        "สวัสดีชาวโลก!",  # thai
        "こんにちは世界!",  # japanese
        "你好世界!",  # chinese
        "Hàlo a Shaoghail!",  # gaelic
        "Chào thế giới!",  # vietnamese
        "Привет, мир!",  # russian
        "Здравей свят!",  # bulgarian
        "សួស្តី​ពិភពលោក!",  # khmer
        "Le rapide renard brun sauta par dessus le chien paresseux.",  # french
        "\tWil je een kopje thee?\n",  # dutch
        " Te gustaría algo de té ? ",  # spanish
        # NOTE: I expect right-to-left languages to fail
        "העלא וועלט!",  # yiddish (r-to-l)
        "سلام دنیا!",  # persian (r-to-l)
        "",  # empty string; a falsy value in Python, no symbols
        " ",
        "  ",
        "   ",
        "\t",
        "\n",
        "\n\n",
        "\n\n\n",
        "\t\n",
        "Hello world",
        " Hello world",
        "Hello World",
        " Hello World",
        " Hello World!",
        "Hello, world!",
        " Hello, world!",
        " this is 🦙.cpp",
        "w048 7tuijk dsdfhu",
        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
        "Hello",
        " Hello",
        "  Hello",
        "   Hello",
        "    Hello",
        "    Hello\n    Hello",
        " (",
        "\n =",
        "' era",
        "Hello, y'all! How are you 😁 局外人?苹果apple工作work3.14159天God",
        "3",
        "33",
        "333",
        "3333",
        "33333",
        "333333",
        "3333333",
    ]
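The NOTE above mentions a pre-compiled list of all Unicode code points as the exhaustive alternative to hand-picked cases. A minimal sketch of that idea (the helper name is hypothetical, not part of this commit): the code space is finite, 0x0 through 0x10FFFF, so the input set and its upper bound are known in advance.

import unicodedata

def all_assigned_codepoints() -> list[str]:
    # the Unicode code space is finite: 0x0 .. 0x10FFFF (~1.1 million points)
    chars = []
    for cp in range(0x110000):
        ch = chr(cp)
        # skip surrogates ('Cs') and unassigned code points ('Cn')
        if unicodedata.category(ch) in ("Cs", "Cn"):
            continue
        chars.append(ch)
    return chars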
def test_pre_tok(hf_voc_req: HFVocabRequest) -> None:
    # NOTE: aggregate all models to their respective paths
    from transformers import AutoTokenizer

    params = test_pre_tok_params()
    for model in hf_voc_req.models:
        # set the model path, e.g. 'models/meta-llama/Llama-2-7b-hf'
        path = Path(f"{hf_voc_req.model_path}/{model['repo']}")
        # set the model name, e.g. 'llama-2-7b-hf'
        name = path.stem.lower()
        # model input encodings, e.g. 'models/meta-llama/Llama-2-7b-hf/ggml-vocab-llama-2-7b-hf.inp'
        inp = path / f"ggml-vocab-{name}.inp"
        # model output encodings, e.g. 'models/meta-llama/Llama-2-7b-hf/ggml-vocab-llama-2-7b-hf.out'
        out = path / f"ggml-vocab-{name}.out"
        # extracted tokenizer model
        final = path / f"ggml-vocab-{name}.gguf"
        # skip the tokenizer folder if unavailable
        if not path.exists():
            logger.warning(f"skipped - {model['repo']} not found.")
            continue
        try:  # create the tokenizer
            tokenizer = AutoTokenizer.from_pretrained(path)
        except OSError as e:
            logger.error(f"{model['repo']} not found: {e}")
            continue  # skip this tokenizer model
        with open(inp, "w", encoding="utf-8") as f:
            for test in params:
                f.write(f"{test}")
                f.write("\n__ggml_vocab_test__\n")
        with open(out, "w", encoding="utf-8") as f:
            for test in params:
                encodings = tokenizer.encode(test, add_special_tokens=False)
                for encoding in encodings:
                    f.write(f" {encoding}")
                f.write("\n")
        logger.info(f"Tests for {model['repo']} written to {final}.*")
def generate_tokenizers(hf_voc_req: HFVocabRequest) -> None:
    pass
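generate_tokenizers is still a stub in this commit. One hypothetical way to fill it in (an assumption, not the author's implementation) is to shell out to the repo's convert-hf-to-gguf.py with --vocab-only, which produces the ggml-vocab-*.gguf files that the paths in test_pre_tok expect:

import subprocess

def generate_tokenizers(hf_voc_req: HFVocabRequest) -> None:
    for model in hf_voc_req.models:
        path = Path(f"{hf_voc_req.model_path}/{model['repo']}")
        if not path.exists():
            logger.warning(f"skipped - {model['repo']} not found.")
            continue
        outfile = path / f"ggml-vocab-{path.stem.lower()}.gguf"
        # --vocab-only converts just the tokenizer, not the model weights
        subprocess.run(
            ["python", "convert-hf-to-gguf.py", str(path), "--vocab-only", "--outfile", str(outfile)],
            check=True,
        )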
@@ -29,7 +139,13 @@ def main():
"-v", "--verbose", action="store_true", help="A huggingface read auth token"
)
parser.add_argument(
"-m", "--model-path", default=None, help="The models storage path"
"-m", "--model-path", default=None, help="The models storage path. Default is 'models/'."
)
parser.add_argument(
"-t", "--gen-tests", action="store_true", help="Generate the tokenizer tests. Default is False."
)
parser.add_argument(
"-g", "--gen-toks", action="store_true", help="Generate the gguf vocab files. Default is False."
)
args = parser.parse_args()
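With these flags in place, a typical invocation might look like the line below (the script filename is illustrative; this excerpt does not name it):

python gguf-gen-pre.py --model-path models/ --gen-tests --gen-toks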
@@ -42,5 +158,16 @@ def main():
        args.model_path, args.hf_auth_token, logger
    )
    hf_vocab_req.download_models()
    hf_vocab_req.generate_checksums()
    hf_vocab_req.log_pre_tokenizer_info()

    if args.gen_tests:
        test_pre_tok(hf_vocab_req)
    if args.gen_toks:
        generate_tokenizers(hf_vocab_req)
if __name__ == '__main__':
    main()
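main() delegates the checksum step named in the commit title to HFVocabRequest.generate_checksums(), whose implementation is outside this diff. For readers without that class at hand, a minimal sketch of what such a step typically involves (hypothetical code, not the actual HFVocabRequest internals):

import hashlib

def sha256sum(path: Path) -> str:
    # hash the file in chunks so large tokenizer files stay memory-friendly
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()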