tests : add option to tokenize text files

ggml-ci
Georgi Gerganov 2023-08-26 19:21:22 +03:00
parent 70005bd5c9
commit e4324cbd4d
2 changed files with 61 additions and 1 deletion


@@ -5,6 +5,7 @@
 #include <string>
 #include <map>
 #include <vector>
+#include <fstream>

 // generate using test-tokenizer-0.py
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
@@ -41,12 +42,17 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
 int main(int argc, char **argv) {
     if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
         return 1;
     }

     const std::string fname = argv[1];
+
+    std::string fname_text;
+    if (argc > 2) {
+        fname_text = argv[2];
+    }

     fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());

     llama_model * model;
@@ -131,6 +137,42 @@ int main(int argc, char **argv) {
         }
     }

+    if (!fname_text.empty()) {
+        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
+
+        std::string text;
+        {
+            std::ifstream ifs(fname_text);
+            if (!ifs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
+                return 1;
+            }
+            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
+        }
+
+        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
+
+        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
+
+        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
+
+        {
+            const std::string fname_out = fname_text + ".tokcpp";
+
+            std::ofstream ofs(fname_out);
+            if (!ofs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+                return 1;
+            }
+
+            for (const auto & tok : res) {
+                ofs << tok << " ";
+            }
+
+            ofs << "\n";
+        }
+    }
+
     llama_free_model(model);
     llama_free(ctx);
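The new code path in the C++ test reads the whole text file into memory, tokenizes it with the BOS token prepended (the 'true' argument to llama_tokenize), and writes the space-separated token IDs next to the input as '<text-file>.tokcpp'. A possible invocation, assuming the test binary is built as test-tokenizer-0 and using illustrative paths (neither the binary location nor the file names are specified by this commit):

./bin/test-tokenizer-0 ./models/ggml-vocab.bin ./prompt.txt    (writes ./prompt.txt.tokcpp)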


@@ -6,6 +6,7 @@ from sentencepiece import SentencePieceProcessor
 parser = argparse.ArgumentParser()
 parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+parser.add_argument("--fname-tok", help="path to a text file to tokenize")
 args = parser.parse_args()

 dir_tokenizer = args.dir_tokenizer
@@ -68,3 +69,20 @@ for text in tests:
     for x in res:
         print("%7d," % x, end='')
     print(" }, },")
+
+fname_tok = args.fname_tok
+if fname_tok:
+    print('tokenizing file: ', fname_tok)
+    fname_out = fname_tok + '.tok'
+    with open(fname_tok, 'r') as f:
+        lines = f.readlines()
+    s = ''.join(lines)
+    res = tokenizer.encode(s, add_bos=True)
+    # write to file
+    with open(fname_out, 'w') as f:
+        for x in res:
+            f.write(str(x) + ' ')
+        f.write('\n')
+    print('len(res): ', len(res))
+    print('len(lines): ', len(lines))
+    print('results written to: ', fname_out)
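The Python script mirrors the C++ change: when run with --fname-tok (e.g. python3 test-tokenizer-0.py dir_tokenizer --fname-tok prompt.txt), it tokenizes the same file with the SentencePiece reference tokenizer and writes the IDs to '<file>.tok'. The evident purpose is to compare the two dumps; below is a minimal comparison sketch, not part of this commit, assuming both tools were already run on the same prompt.txt (file names are assumptions):

# compare the two token dumps; both tools write space-separated token IDs
def read_tokens(path):
    with open(path) as f:
        return [int(t) for t in f.read().split()]

cpp_tokens = read_tokens('prompt.txt.tokcpp')  # produced by the C++ test
ref_tokens = read_tokens('prompt.txt.tok')     # produced by test-tokenizer-0.py

if cpp_tokens == ref_tokens:
    print('tokenizations match: %d tokens' % len(cpp_tokens))
else:
    # report the first position where the two tokenizations diverge
    for i, (a, b) in enumerate(zip(cpp_tokens, ref_tokens)):
        if a != b:
            print('first mismatch at position %d: %d vs %d' % (i, a, b))
            break
    print('lengths: %d vs %d' % (len(cpp_tokens), len(ref_tokens)))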