diff --git a/gguf-py/scripts/gguf-gen-pre.py b/gguf-py/scripts/gguf-gen-pre.py
index 8af326cdb..d9b85f748 100644
--- a/gguf-py/scripts/gguf-gen-pre.py
+++ b/gguf-py/scripts/gguf-gen-pre.py
@@ -7,8 +7,6 @@ import os
 import sys
 from pathlib import Path
 
-from tqdm import tqdm
-
 # Necessary to load the local gguf package
 if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
     sys.path.insert(0, str(Path(__file__).parent.parent))
@@ -18,7 +16,119 @@ from gguf.huggingface_hub import HFVocabRequest
 logger = logging.getLogger("gguf-gen-pre")
 
 
-def test_pre_tok(content) -> None:
+# NOTE: It's impossible to catch all edge cases.
+# The most naive way to handle this is to have a pre-compiled unicode list of all 1.1 million characters,
+# as it's finite and ISO standardized.
+# This means we can predict the upper bound and can apply known time complexity solutions to
+# discover the best way to resolve it.
+def test_pre_tok_params() -> list[str]:
+    return [
+        "ü, ǖ, ǘ, ǚ, ǜ",  # diaeresis
+        "綠, 女, 怒, 玉, 句",  # pinyin
+        "ied 4 ½ months",  # ordinal
+        "¡Hola Mundo!",  # spanish
+        "Olá Mundo!",  # portuguese
+        "Selam Dünya!",  # turkish
+        "Salam, dünýä!",  # turkmen
+        "Γειά σου Κόσμε!",  # greek
+        "हैलो वर्ल्ड!",  # hindi
+        "สวัสดีชาวโลก!",  # thai
+        "こんにちは世界!",  # japanese
+        "你好世界!",  # chinese
+        "Hàlo a Shaoghail!",  # gaelic
+        "Chào thế giới!",  # vietnamese
+        "Привет, мир!",  # russian
+        "Здравей свят!",  # bulgarian
+        "សួស្តី​ពិភពលោក!",  # khmer
+        "Le rapide renard brun sauta par dessus le chien paresseux.",  # french
+        "\tWil je een kopje thee?\n",  # dutch
+        " Te gustaría algo de té ? ",  # spanish
+        # NOTE: I expect right-to-left languages to fail
+        "העלא וועלט!",  # yiddish (r-to-l)
+        "سلام دنیا!",  # persian (r-to-l)
+        "",  # NOTE: the empty string is a falsy value in Python, no symbols
+        " ",
+        "  ",
+        "   ",
+        "\t",
+        "\n",
+        "\n\n",
+        "\n\n\n",
+        "\t\n",
+        "Hello world",
+        " Hello world",
+        "Hello World",
+        " Hello World",
+        " Hello World!",
+        "Hello, world!",
+        " Hello, world!",
+        " this is 🦙.cpp",
+        "w048 7tuijk dsdfhu",
+        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
+        "Hello",
+        " Hello",
+        "  Hello",
+        "   Hello",
+        "    Hello",
+        "    Hello\n    Hello",
+        " (",
+        "\n =",
+        "' era",
+        "Hello, y'all! How are you 😁 局外人?苹果apple工作work3.14159天God~",
+        "3",
+        "33",
+        "333",
+        "3333",
+        "33333",
+        "333333",
+        "3333333",
+    ]
+
+
+def test_pre_tok(hf_voc_req: HFVocabRequest) -> None:
+    # NOTE: aggregate all models to their respective paths
+    from transformers import AutoTokenizer
+
+    params = test_pre_tok_params()
+    for model in hf_voc_req.models:
+        # set the model path, e.g. 'models/meta-llama/Llama-2-7b-hf'
+        path = Path(f"{hf_voc_req.model_path}/{model['repo']}")
+        # set the model name, e.g. 'llama-2-7b-hf'
+        name = path.stem.lower()
+        # model input encodings, e.g. 'models/meta-llama/Llama-2-7b-hf/ggml-vocab-llama-2-7b-hf.inp'
+        inp = path / f"ggml-vocab-{name}.inp"
+        # model output encodings, e.g. 'models/meta-llama/Llama-2-7b-hf/ggml-vocab-llama-2-7b-hf.out'
+        out = path / f"ggml-vocab-{name}.out"
+        # extracted tokenizer model
+        final = path / f"ggml-vocab-{name}.gguf"
+
+        # skip the tokenizer folder if unavailable
+        if not path.exists():
+            logger.warning(f"skipped - {model['repo']} not found.")
+            continue
+
+        try:  # create the tokenizer
+            tokenizer = AutoTokenizer.from_pretrained(path)
+        except OSError as e:
+            logger.error(f"{model['repo']} not found: {e}")
+            continue  # skip this tokenizer model
+
+        with open(inp, "w", encoding="utf-8") as f:
+            for test in params:
+                f.write(f"{test}")
+                f.write("\n__ggml_vocab_test__\n")
+
+        with open(out, "w", encoding="utf-8") as f:
+            for test in params:
+                encodings = tokenizer.encode(test, add_special_tokens=False)
+                for encoding in encodings:
+                    f.write(f" {encoding}")
+                f.write("\n")
+
+        logger.info(f"Tests for {model['repo']} written to {final}.*")
+
+
+def generate_tokenizers(hf_voc_req: HFVocabRequest) -> None:
     pass
 
 
@@ -29,7 +139,13 @@ def main():
         "-v", "--verbose", action="store_true", help="A huggingface read auth token"
     )
     parser.add_argument(
-        "-m", "--model-path", default=None, help="The models storage path"
+        "-m", "--model-path", default=None, help="The models storage path. Default is 'models/'."
+    )
+    parser.add_argument(
+        "-t", "--gen-tests", action="store_true", help="Generate the tokenizer tests. Default is False."
+    )
+    parser.add_argument(
+        "-g", "--gen-toks", action="store_true", help="Generate the gguf vocab files. Default is False."
     )
 
     args = parser.parse_args()
@@ -42,5 +158,16 @@ def main():
         args.model_path, args.hf_auth_token, logger
     )
 
+    hf_vocab_req.download_models()
+    hf_vocab_req.generate_checksums()
+    hf_vocab_req.log_pre_tokenizer_info()
+
+    if args.gen_tests:
+        test_pre_tok(hf_vocab_req)
+
+    if args.gen_toks:
+        generate_tokenizers(hf_vocab_req)
+
+
 if __name__ == '__main__':
     main()
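A quick way to sanity-check the generated pairs is to read them back and re-encode each test string with the same tokenizer. The sketch below is a hypothetical helper, not part of the patch: the `check_vocab_pair` name and the standalone-script framing are assumptions, while the `__ggml_vocab_test__` separator and the `ggml-vocab-<name>.inp`/`.out` layout come from `test_pre_tok()` above.

```python
# Hypothetical verification sketch (not part of the patch): re-read a
# generated .inp/.out pair and confirm the recorded encodings still match
# what the Hugging Face tokenizer produces for the same test strings.
import sys
from pathlib import Path

from transformers import AutoTokenizer

# Separator mirrors the one written by test_pre_tok() above.
SEPARATOR = "\n__ggml_vocab_test__\n"


def check_vocab_pair(model_dir: Path) -> bool:
    name = model_dir.stem.lower()
    inp = (model_dir / f"ggml-vocab-{name}.inp").read_text(encoding="utf-8")
    out = (model_dir / f"ggml-vocab-{name}.out").read_text(encoding="utf-8")

    # Every test string is terminated by the separator, so the final split
    # element is an empty string and gets dropped.
    tests = inp.split(SEPARATOR)[:-1]
    # One line of space-separated token ids per test string.
    expected = [[int(tok) for tok in line.split()] for line in out.splitlines()]

    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    for test, want in zip(tests, expected):
        got = tokenizer.encode(test, add_special_tokens=False)
        if got != want:
            return False
    return True


if __name__ == "__main__":
    # e.g. python check_vocab_pair.py models/meta-llama/Llama-2-7b-hf
    print(check_vocab_pair(Path(sys.argv[1])))
```

This mirrors how the `.inp`/`.out` pairs are presumably consumed on the C++ side: a mismatch at this stage points at the pre-tokenizer rather than the vocab itself.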