From 5cad62bce41546ddae8908eeb5bb06476f4c5bd8 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 26 Aug 2023 15:55:23 +0300
Subject: [PATCH] tests : write a Python tokenizer test (wip)

---
 tests/test-tokenizer-0.cpp | 40 +++++++++++++++++++++++-------------------
 tests/test-tokenizer-0.py  | 18 ++++++++++++++++++
 2 files changed, 39 insertions(+), 19 deletions(-)
 create mode 100644 tests/test-tokenizer-0.py

diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index 7e9ac9188..4bed054d6 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -16,36 +16,38 @@ static std::string unescape_whitespace(llama_context* ctx, const std::vector<llama_token>& tokens) {
 
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
     static std::map<std::string, std::vector<llama_token>> _k_tests = {
-        { " ",      {1, 259, }, },
-        { "  ",     { 1, 1678, }, },
-        { "   ",    { 1, 268, }, },
-        { "\t",     { 1, 29871, 12, }, },
-        { "\n",     { 1, 29871, 13, }, },
-        { "\t\n",   { 1, 29871, 12, 13, }, },
+        { " ",                      { 1, 259, }, },
+        { "  ",                     { 1, 1678, }, },
+        { "   ",                    { 1, 268, }, },
+        { "\t",                     { 1, 29871, 12, }, },
+        { "\n",                     { 1, 29871, 13, }, },
+        { "\t\n",                   { 1, 29871, 12, 13, }, },
         { "Hello world",            { 1, 15043, 3186, }, },
         { " Hello world",           { 1, 29871, 15043, 3186, }, },
         { "Hello World",            { 1, 15043, 2787, }, },
         { " Hello World",           { 1, 29871, 15043, 2787, }, },
         { " Hello World!",          { 1, 29871, 15043, 2787, 29991, }, },
+        { "Hello, world!",          { 1, 15043, 29892, 3186, 29991, }, },
+        { " Hello, world!",         { 1, 29871, 15043, 29892, 3186, 29991, }, },
         { " this is πŸ¦™.cpp",        { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
         { "w048 7tuijk dsdfhu",     { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
         { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ",     { 1, 1538, 4851, 665, 1386, 29713, 1305, }, },
         { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", { 1, 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161,
-            146, 228, 162, 133, 228, 161, 153, 228, 161, 186,
-            31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228,
-            161, 136, 228, 161, 132, 228, 161, 158, 228, 161,
-            136, 228, 162, 132, 228, 161, 140, }, },
+                                      146, 228, 162, 133, 228, 161, 153, 228, 161, 186,
+                                      31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228,
+                                      161, 136, 228, 161, 132, 228, 161, 158, 228, 161,
+                                      136, 228, 162, 132, 228, 161, 140, }, },
         { "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", { 1, 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871,
-            243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598,
-            313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681,
-            313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
-        { "Hello",                  { 1, 15043 }, },
-        { " Hello",                 { 1, 29871, 15043 }, },
-        { "  Hello",                { 1, 259, 15043 }, },
-        { "   Hello",               { 1, 1678, 15043 }, },
-        { "    Hello",              { 1, 268, 15043 }, },
-        { "    Hello\n    Hello",   { 1, 268, 15043, 13, 1678, 15043 }, },
+                                      243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598,
+                                      313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681,
+                                      313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
+        { "Hello",                  { 1, 15043, }, },
+        { " Hello",                 { 1, 29871, 15043, }, },
+        { "  Hello",                { 1, 259, 15043, }, },
+        { "   Hello",               { 1, 1678, 15043, }, },
+        { "    Hello",              { 1, 268, 15043, }, },
+        { "    Hello\n    Hello",   { 1, 268, 15043, 13, 1678, 15043, }, },
     };
 
     return _k_tests;
 }
diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py
new file mode 100644
index 000000000..d21f8b5a1
--- /dev/null
+++ b/tests/test-tokenizer-0.py
@@ -0,0 +1,18 @@
+import os
+import sys
+import argparse
+
+from sentencepiece import SentencePieceProcessor
+
+parser = argparse.ArgumentParser()
+parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+args = parser.parse_args()
+
+dir_tokenizer = args.dir_tokenizer
+
+tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')
+
+text = 'Hello, world!'
+print(text)
+print(tokenizer.encode(text, add_bos=True))
+print(tokenizer.decode(tokenizer.encode(text, add_bos=True)))
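
A quick manual check of the new script against the C++ expectations (the model directory below is only a placeholder; any directory containing a LLaMA 'tokenizer.model' works):

    python3 tests/test-tokenizer-0.py /path/to/llama-model-dir

Assuming the same LLaMA tokenizer the C++ test targets, the three printed lines should be the input text, the ids [1, 15043, 29892, 3186, 29991] matching the new "Hello, world!" entry added to k_tests(), and the decoded round-trip "Hello, world!" again.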