From 832b449cbd185257bf8a531e7f4f25e7d42c83f4 Mon Sep 17 00:00:00 2001
From: teleprint-me <77757836+teleprint-me@users.noreply.github.com>
Date: Sat, 18 May 2024 14:33:56 -0400
Subject: [PATCH] feat: Add pre-tokenizer CLI tooling

---
 gguf-py/scripts/gguf-gen-pre.py | 57 +++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 gguf-py/scripts/gguf-gen-pre.py

diff --git a/gguf-py/scripts/gguf-gen-pre.py b/gguf-py/scripts/gguf-gen-pre.py
new file mode 100644
index 000000000..8af326cdb
--- /dev/null
+++ b/gguf-py/scripts/gguf-gen-pre.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+"""Command-line entry point for the pre-tokenizer tooling (work in progress)."""
+from __future__ import annotations
+
+import argparse
+import logging
+import os
+import sys
+from pathlib import Path
+
+from tqdm import tqdm
+
+# Necessary to load the local gguf package
+if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
+    sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from gguf.huggingface_hub import HFVocabRequest
+
+logger = logging.getLogger("gguf-gen-pre")
+
+
+def test_pre_tok(content) -> None:
+    """Placeholder for the pre-tokenizer check; not implemented yet.
+
+    Args:
+        content: Currently ignored.
+    """
+
+
+def main() -> None:
+    """Parse the command line, configure logging, and build the vocab request.
+
+    The positional ``hf_auth_token`` is required. ``-v/--verbose`` switches the
+    log level from INFO to DEBUG. ``-m/--model-path`` defaults to None and is
+    passed to ``HFVocabRequest`` together with the token and the logger.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("hf_auth_token", help="A huggingface read auth token")
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="Enable debug-level logging"
+    )
+    parser.add_argument(
+        "-m", "--model-path", default=None, help="The models storage path"
+    )
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+
+    # NOTE: the request object is constructed here but nothing in this script
+    # reads it yet.
+    hf_vocab_req = HFVocabRequest(
+        args.model_path, args.hf_auth_token, logger
+    )
+
+
+if __name__ == '__main__':
+    main()