feat: Add example script automating tokenizer model checksum and test generation
This commit is contained in:
parent 006bb60d27
commit 1a82573126

1 changed file with 131 additions and 4 deletions
@@ -7,8 +7,6 @@ import os
 import sys
 from pathlib import Path
 
-from tqdm import tqdm
-
 # Necessary to load the local gguf package
 if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
     sys.path.insert(0, str(Path(__file__).parent.parent))
@@ -18,7 +16,119 @@ from gguf.huggingface_hub import HFVocabRequest
 logger = logging.getLogger("gguf-gen-pre")
 
 
-def test_pre_tok(content) -> None:
+# NOTE: It's impossible to catch all edge cases.
+# The most naive way to handle this is to have a pre-compiled Unicode list of all 1.1 million characters,
+# as it's finite and ISO-standardized.
+# This means we can predict the upper bound and can apply known time-complexity solutions to
+# discover the best way to resolve it.
+def test_pre_tok_params() -> list[str]:
+    return [
+        "ü, ǖ, ǘ, ǚ, ǜ",  # diaeresis
+        "綠, 女, 怒, 玉, 句",  # pinyin
+        "ied 4 ½ months",  # ordinal
+        "¡Hola Mundo!",  # spanish
+        "Olá Mundo!",  # portuguese
+        "Selam Dünya!",  # turkish
+        "Salam, dünýä!",  # turkmen
+        "Γειά σου Κόσμε!",  # greek
+        "हैलो वर्ल्ड!",  # hindi
+        "สวัสดีชาวโลก!",  # thai
+        "こんにちは世界!",  # japanese
+        "你好世界!",  # chinese
+        "Hàlo a Shaoghail!",  # gaelic
+        "Chào thế giới!",  # vietnamese
+        "Привет, мир!",  # russian
+        "Здравей свят!",  # bulgarian
+        "សួស្តីពិភពលោក!",  # khmer
+        "Le rapide renard brun sauta par dessus le chien paresseux.",  # french
+        "\tWil je een kopje thee?\n",  # dutch
+        " Te gustaría algo de té ? ",  # spanish
+        # NOTE: I expect right-to-left languages to fail
+        "העלא וועלט!",  # yiddish (r-to-l)
+        "سلام دنیا!",  # persian (r-to-l)
+        "",  # Why? This is a falsy value in Python; no symbols.
+        " ",
+        "  ",
+        "   ",
+        "\t",
+        "\n",
+        "\n\n",
+        "\n\n\n",
+        "\t\n",
+        "Hello world",
+        " Hello world",
+        "Hello World",
+        " Hello World",
+        " Hello World!",
+        "Hello, world!",
+        " Hello, world!",
+        " this is 🦙.cpp",
+        "w048 7tuijk dsdfhu",
+        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
+        "Hello",
+        " Hello",
+        "  Hello",
+        "   Hello",
+        "    Hello",
+        "    Hello\n    Hello",
+        " (",
+        "\n =",
+        "' era",
+        "Hello, y'all! How are you 😁 局外人?苹果apple工作work3.14159天God~",
+        "3",
+        "33",
+        "333",
+        "3333",
+        "33333",
+        "333333",
+        "3333333",
+    ]
+
+
+def test_pre_tok(hf_voc_req: HFVocabRequest) -> None:
+    # NOTE: aggregate all models to their respective paths
+    from transformers import AutoTokenizer
+
+    params = test_pre_tok_params()
+    for model in hf_voc_req.models:
+        # set the model path, e.g. 'models/meta-llama/Llama-2-7b-hf'
+        path = Path(f"{hf_voc_req.model_path}/{model['repo']}")
+        # set the model name, e.g. llama-2-7b-hf
+        name = path.stem.lower()
+        # model input encodings, e.g. 'models/meta-llama/Llama-2-7b-hf/ggml-vocab-llama-2-7b-hf.inp'
+        inp = path / f"ggml-vocab-{name}.inp"
+        # model output encodings, e.g. 'models/meta-llama/Llama-2-7b-hf/ggml-vocab-llama-2-7b-hf.out'
+        out = path / f"ggml-vocab-{name}.out"
+        # extracted tokenizer model
+        final = path / f"ggml-vocab-{name}.gguf"
+
+        # skip tokenizer folder if unavailable
+        if not path.exists():
+            logger.warning(f"skipped - {model['repo']} not found.")
+            continue
+
+        try:  # create the tokenizer
+            tokenizer = AutoTokenizer.from_pretrained(path)
+        except OSError as e:
+            logger.error(f"{model['repo']} not found: {e}")
+            continue  # skip this tokenizer model
+
+        with open(inp, "w", encoding="utf-8") as f:
+            for test in params:
+                f.write(f"{test}")
+                f.write("\n__ggml_vocab_test__\n")
+
+        with open(out, "w", encoding="utf-8") as f:
+            for test in params:
+                encodings = tokenizer.encode(test, add_special_tokens=False)
+                for encoding in encodings:
+                    f.write(f" {encoding}")
+                f.write("\n")
+
+        logger.info(f"Tests for {model['repo']} written to {path}/ggml-vocab-{name}.*")
+
+
+def generate_tokenizers(hf_voc_req: HFVocabRequest) -> None:
     pass
 
 
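Note, not part of the diff: the .inp/.out pair written by test_pre_tok above is a plain-text format — each test string in the .inp file is terminated by a "\n__ggml_vocab_test__\n" separator, and the matching line in the .out file carries the space-prefixed token ids. A minimal reader sketch under that assumption; read_vocab_tests is a hypothetical helper, not in this commit:

# A minimal sketch, not part of the commit: read a generated .inp/.out pair
# back and re-pair each input string with its token ids.
from pathlib import Path

SEPARATOR = "\n__ggml_vocab_test__\n"

def read_vocab_tests(path: Path, name: str) -> list[tuple[str, list[int]]]:
    # every test string is terminated by the separator, so the final
    # split chunk is empty and gets dropped
    raw = (path / f"ggml-vocab-{name}.inp").read_text(encoding="utf-8")
    inputs = raw.split(SEPARATOR)[:-1]
    # each output line is a run of space-prefixed token ids
    lines = (path / f"ggml-vocab-{name}.out").read_text(encoding="utf-8").splitlines()
    outputs = [[int(token_id) for token_id in line.split()] for line in lines]
    assert len(inputs) == len(outputs), "inp/out pair out of sync"
    return list(zip(inputs, outputs))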
@@ -29,7 +139,13 @@ def main():
         "-v", "--verbose", action="store_true", help="A huggingface read auth token"
     )
     parser.add_argument(
-        "-m", "--model-path", default=None, help="The models storage path"
+        "-m", "--model-path", default=None, help="The models storage path. Default is 'models/'."
+    )
+    parser.add_argument(
+        "-t", "--gen-tests", action="store_true", help="Generate the tokenizer tests. Default is False."
+    )
+    parser.add_argument(
+        "-g", "--gen-toks", action="store_true", help="Generate the gguf vocab files. Default is False."
     )
     args = parser.parse_args()
 
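Note, not part of the diff: with these flags a typical invocation might look like `python gguf-gen-pre.py -m models -t -g` — the script filename here is assumed from the logger name, and the HuggingFace read token is supplied through the existing auth-token argument used below.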
@@ -42,5 +158,16 @@ def main():
         args.model_path, args.hf_auth_token, logger
     )
 
+    hf_vocab_req.download_models()
+    hf_vocab_req.generate_checksums()
+    hf_vocab_req.log_pre_tokenizer_info()
+
+    if args.gen_tests:
+        test_pre_tok(hf_vocab_req)
+
+    if args.gen_toks:
+        generate_tokenizers(hf_vocab_req)
+
+
 if __name__ == '__main__':
     main()
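Note, not part of the diff: generate_tokenizers is left as a stub in this commit. One possible shape, sketched on the assumption that llama.cpp's existing convert-hf-to-gguf.py converter (with its --vocab-only and --outfile options) does the actual extraction and is run from the repository root:

# A sketch only, not the commit's implementation: shell out to the existing
# converter for each downloaded model to produce the standalone vocab gguf.
import subprocess
import sys

def generate_tokenizers(hf_voc_req: HFVocabRequest) -> None:
    for model in hf_voc_req.models:
        path = Path(f"{hf_voc_req.model_path}/{model['repo']}")
        if not path.exists():
            logger.warning(f"skipped - {model['repo']} not found.")
            continue
        name = path.stem.lower()
        outfile = path / f"ggml-vocab-{name}.gguf"
        # --vocab-only extracts just the tokenizer into a gguf file
        subprocess.run(
            [sys.executable, "convert-hf-to-gguf.py", str(path),
             "--vocab-only", "--outfile", str(outfile)],
            check=True,
        )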