tests : add test-tokenizer-0.sh
parent c4ec9c0d3d
commit ce7d3a0442
5 changed files with 110 additions and 240 deletions
tests/test-tokenizer-0-bpe.py (deleted)
@@ -1,117 +0,0 @@
# tests with BPE tokenizer
#
# sample usage:
#
#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/Meta-Llama-3-8B-Instruct/
#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/falcon-7b/
#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/deepseek-coder-6.7b-instruct/
#

import argparse

from transformers import AutoTokenizer

parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok", help="path to a text file to tokenize")
args = parser.parse_args()

dir_tokenizer = args.dir_tokenizer

tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)

tests = [
    "",
    " ",
    "  ",
    "   ",
    "\t",
    "\n",
    "\n\n",
    "\n\n\n",
    "\t\n",
    "Hello world",
    " Hello world",
    "Hello World",
    " Hello World",
    " Hello World!",
    "Hello, world!",
    " Hello, world!",
    " this is 🦙.cpp",
    "w048 7tuijk dsdfhu",
    "нещо на Български",
    "កាន់តែពិសេសអាចខលចេញ",
    "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
    "Hello",
    " Hello",
    "  Hello",
    "   Hello",
    "    Hello",
    "    Hello\n    Hello",
    " (",
    "\n =",
    "' era",
    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
    "3",
    "33",
    "333",
    "3333",
    "33333",
    "333333",
    "3333333",
    "33333333",
    "333333333",
]

for text in tests:
    print('text: ', text)
    print(tokenizer.encode(text))
    print(tokenizer.decode(tokenizer.encode(text)))

print("\n\ntests for C++:\n")
for text in tests:
    res = tokenizer.encode(text)

    k = text.replace('\n', '\\n')
    k = k.replace('\t', '\\t')
    k = '"' + k + '"'
    print("{ %-24s, { " % k, end='')
    for x in res:
        print("%7d," % x, end='')
    print(" }, },")

print(tokenizer.encode('hello'))
print(tokenizer.encode('world'))
print(tokenizer.encode(' world'))
print(tokenizer.encode('hello world'))

fname_tok = args.fname_tok
if fname_tok:
    print('tokenizing file: ', fname_tok)
    fname_out = fname_tok + '.tok'
    with open(fname_tok, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        s = ''.join(lines)
        res = tokenizer.encode(s)
        # write to file
        with open(fname_out, 'w', encoding='utf-8') as f:
            for x in res:
                # LLaMA v3 for some reason strips the space for these tokens (and others)
                # if x == 662:
                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
                # elif x == 1174:
                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
                # elif x == 2564:
                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
                # elif x == 758:
                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
                # elif x == 949:
                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
                # elif x == 5354:
                #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
                # else:
                #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
                f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
            print('len(res): ', len(res))
            print('len(lines): ', len(lines))
    print('results written to: ', fname_out)
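Each row printed by the "tests for C++" loop above was intended to be pasted into the C++ test table; a row has the schematic form { "Hello world"           , {   <id>,   <id>, }, }, where the IDs depend on the vocabulary being tested (placeholders here, not real token IDs).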
tests/test-tokenizer-0-spm.py (deleted)
@@ -1,114 +0,0 @@
# tests with SPM tokenizer
#
# sample usage:
#
#   python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/Llama-2-7b-hf/
#   python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/CodeLlama-34b-Instruct-hf/
#

import argparse

from sentencepiece import SentencePieceProcessor

parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok", help="path to a text file to tokenize")
args = parser.parse_args()

dir_tokenizer = args.dir_tokenizer

tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')

tests = [
    "",
    " ",
    "  ",
    "   ",
    "\t",
    "\n",
    "\n\n",
    "\n\n\n",
    "\t\n",
    "Hello world",
    " Hello world",
    "Hello World",
    " Hello World",
    " Hello World!",
    "Hello, world!",
    " Hello, world!",
    " this is 🦙.cpp",
    "w048 7tuijk dsdfhu",
    "нещо на Български",
    "កាន់តែពិសេសអាចខលចេញ",
    "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
    "Hello",
    " Hello",
    "  Hello",
    "   Hello",
    "    Hello",
    "    Hello\n    Hello",
    " (",
    "\n =",
    "' era",
    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
    "3",
    "33",
    "333",
    "3333",
    "33333",
    "333333",
    "3333333",
    "33333333",
    "333333333",
]

for text in tests:
    print('text: ', text)
    print('\nwith bos:')
    print(tokenizer.encode(text, add_bos=True))
    print(tokenizer.decode(tokenizer.encode(text, add_bos=True)))
    print('\nwithout bos:')
    print(tokenizer.encode(text, add_bos=False))
    print(tokenizer.decode(tokenizer.encode(text, add_bos=False)))

print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello'
print("'" + tokenizer.id_to_piece(29871) + "'") # '_'
print("'" + tokenizer.decode([15043]) + "'") # 'Hello'
print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello'
print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello'
print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello'

print("\n\ntests for C++:\n")
for text in tests:
    res = tokenizer.encode(text, add_bos=False)

    k = text.replace('\n', '\\n')
    k = k.replace('\t', '\\t')
    k = '"' + k + '"'
    print("{ %-24s, { " % k, end='')
    for x in res:
        print("%7d," % x, end='')
    print(" }, },")

print(tokenizer.encode('hello'))
print(tokenizer.encode('world'))
print(tokenizer.encode(' world'))
print(tokenizer.encode('hello world'))

fname_tok = args.fname_tok
if fname_tok:
    print('tokenizing file: ', fname_tok)
    fname_out = fname_tok + '.tok'
    with open(fname_tok, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        s = ''.join(lines)
        res = tokenizer.encode(s, add_bos=True)
        # write to file
        with open(fname_out, 'w', encoding='utf-8') as f:
            for x in res:
                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
            print('len(res): ', len(res))
            print('len(lines): ', len(lines))
    print('results written to: ', fname_out)
@ -55,8 +55,10 @@
|
||||||
// return _k_tests;
|
// return _k_tests;
|
||||||
//}
|
//}
|
||||||
|
|
||||||
static std::map<std::string, std::vector<llama_token>> read_tests(const std::string & fname_inp, const std::string & fname_out) {
|
using llama_tests = std::map<std::string, std::vector<llama_token>>;
|
||||||
std::map<std::string, std::vector<llama_token>> tests;
|
|
||||||
|
static llama_tests read_tests(const std::string & fname_inp, const std::string & fname_out) {
|
||||||
|
llama_tests tests;
|
||||||
|
|
||||||
std::ifstream ifs_inp(fname_inp);
|
std::ifstream ifs_inp(fname_inp);
|
||||||
if (!ifs_inp) {
|
if (!ifs_inp) {
|
||||||
|
@ -175,12 +177,20 @@ int main(int argc, char **argv) {
|
||||||
|
|
||||||
bool success = true;
|
bool success = true;
|
||||||
|
|
||||||
const auto k_tests = read_tests(fname_inp, fname_out);
|
const auto k_tests = [&]() -> llama_tests {
|
||||||
|
if (!fname_text.empty()) {
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
if (k_tests.empty()) {
|
const auto res = read_tests(fname_inp, fname_out);
|
||||||
fprintf(stderr, "%s : error: no tests found\n", __func__);
|
|
||||||
return 1;
|
if (res.empty()) {
|
||||||
}
|
fprintf(stderr, "%s : error: no tests found\n", __func__);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}();
|
||||||
|
|
||||||
const bool add_special = false;
|
const bool add_special = false;
|
||||||
|
|
||||||
|
@ -238,7 +248,17 @@ int main(int argc, char **argv) {
|
||||||
|
|
||||||
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
|
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
|
||||||
|
|
||||||
const std::vector<llama_token> res = llama_tokenize(ctx, text, add_special);
|
std::vector<llama_token> res;
|
||||||
|
|
||||||
|
{
|
||||||
|
const auto t_start = ggml_time_us();
|
||||||
|
|
||||||
|
res = llama_tokenize(ctx, text, add_special);
|
||||||
|
|
||||||
|
const auto t_end = ggml_time_us();
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : tokenized in %.3f ms (cpp)\n", __func__, (t_end - t_start) / 1000.0);
|
||||||
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
|
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
|
||||||
|
|
||||||
|
@ -252,7 +272,8 @@ int main(int argc, char **argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
for (const auto & tok : res) {
|
for (const auto & tok : res) {
|
||||||
ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
|
//ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
|
||||||
|
ofs << tok << "\n";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
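Net effect of these hunks: the built-in test table is skipped when a text file is passed, the C++ tool reports its own tokenization time, and it now writes one bare token ID per line instead of "<id> '<piece>'" pairs, matching the output of the new Python script below, so the two streams can be compared with a plain diff. A minimal sketch of that check, with placeholder IDs (in practice the two programs write <input>.tok and <input>.tokcpp themselves):

# placeholder ids, not from a real vocabulary
printf '9906\n1917\n198\n' > wiki.test.raw.tok      # as written by tests/test-tokenizer-0.py
printf '9906\n1917\n198\n' > wiki.test.raw.tokcpp   # as written by the C++ tool
diff wiki.test.raw.tok wiki.test.raw.tokcpp && echo "tokenizers agree"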
tests/test-tokenizer-0.py (new file)
@@ -0,0 +1,46 @@
import time
import argparse

from transformers import AutoTokenizer

parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok", help="path to a text file to tokenize", required=True)
args = parser.parse_args()

dir_tokenizer = args.dir_tokenizer
fname_tok = args.fname_tok

tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)

print('tokenizing file: ', fname_tok)
fname_out = fname_tok + '.tok'
with open(fname_tok, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    s = ''.join(lines)
    t_start = time.time()
    res = tokenizer.encode(s, add_special_tokens=False)
    t_end = time.time()
    print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)')
    with open(fname_out, 'w', encoding='utf-8') as f:
        for x in res:
            # LLaMA v3 for some reason strips the space for these tokens (and others)
            # if x == 662:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 1174:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 2564:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 758:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 949:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # elif x == 5354:
            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
            # else:
            #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
            # f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
            f.write(str(x) + '\n')
        print('len(res): ', len(res))
        print('len(lines): ', len(lines))
print('results written to: ', fname_out)
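Run standalone, the script writes <input>.tok next to the input file; a hypothetical invocation (the model directory is one of the sample paths from the deleted scripts, and wiki.test.raw stands in for any UTF-8 text file):

python3 tests/test-tokenizer-0.py ~/Data/huggingface/Meta-Llama-3-8B-Instruct/ --fname-tok ./wiki.test.raw
# prints the "(py)" timing line and writes ./wiki.test.raw.tok, one token id per line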
tests/test-tokenizer-0.sh (new executable file)
@@ -0,0 +1,34 @@
#!/bin/bash
#
# Usage:
#
#   test-tokenizer-0.sh <name> <input>
#

if [ $# -ne 2 ]; then
    printf "Usage: $0 <name> <input>\n"
    exit 1
fi

name=$1
input=$2

make -j tests/test-tokenizer-0

printf "Testing %s on %s ...\n" $name $input

python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1
cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"

./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"

diff $input.tok $input.tokcpp > /dev/null 2>&1

if [ $? -eq 0 ]; then
    printf "Tokenization is correct!\n"
else
    diff $input.tok $input.tokcpp | head -n 32

    printf "Tokenization differs!\n"
fi
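A hypothetical end-to-end run; per the script above, <name> must match both a Hugging Face tokenizer directory under ./models/tokenizers/ and a GGUF vocab at ./models/ggml-vocab-<name>.gguf:

# 'llama-bpe' and the input file are stand-ins; substitute whatever
# tokenizer/vocab pair you have set up locally
./tests/test-tokenizer-0.sh llama-bpe ./wiki.test.raw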