tests : add option to tokenize text files

ggml-ci
Georgi Gerganov 2023-08-26 19:21:22 +03:00
parent 70005bd5c9
commit e4324cbd4d
2 changed files with 61 additions and 1 deletion


@@ -5,6 +5,7 @@
 #include <string>
 #include <map>
 #include <vector>
+#include <fstream>

 // generate using test-tokenizer-0.py
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
@@ -41,12 +42,17 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
 int main(int argc, char **argv) {
     if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
         return 1;
     }

     const std::string fname = argv[1];
+
+    std::string fname_text;
+    if (argc > 2) {
+        fname_text = argv[2];
+    }

     fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());

     llama_model * model;
@@ -131,6 +137,42 @@ int main(int argc, char **argv) {
         }
     }

+    if (!fname_text.empty()) {
+        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
+
+        std::string text;
+        {
+            std::ifstream ifs(fname_text);
+            if (!ifs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
+                return 1;
+            }
+            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
+        }
+
+        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
+
+        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
+
+        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
+
+        {
+            const std::string fname_out = fname_text + ".tokcpp";
+
+            std::ofstream ofs(fname_out);
+            if (!ofs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+                return 1;
+            }
+
+            for (const auto & tok : res) {
+                ofs << tok << " ";
+            }
+
+            ofs << "\n";
+        }
+    }
+
     llama_free_model(model);
     llama_free(ctx);
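The new code path in the C++ test reads the whole text file into memory, tokenizes it with the BOS token prepended (the 'true' argument to llama_tokenize), and writes the space-separated token IDs next to the input as '<text-file>.tokcpp'. A possible invocation, assuming the test binary is built as test-tokenizer-0 and using illustrative paths (neither the binary location nor the file names are specified by this commit):

./bin/test-tokenizer-0 ./models/ggml-vocab.bin ./prompt.txt    (writes ./prompt.txt.tokcpp)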


@@ -6,6 +6,7 @@ from sentencepiece import SentencePieceProcessor
 parser = argparse.ArgumentParser()
 parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+parser.add_argument("--fname-tok", help="path to a text file to tokenize")
 args = parser.parse_args()

 dir_tokenizer = args.dir_tokenizer
@@ -68,3 +69,20 @@ for text in tests:
     for x in res:
         print("%7d," % x, end='')
     print(" }, },")
+
+fname_tok = args.fname_tok
+if fname_tok:
+    print('tokenizing file: ', fname_tok)
+    fname_out = fname_tok + '.tok'
+    with open(fname_tok, 'r') as f:
+        lines = f.readlines()
+    s = ''.join(lines)
+    res = tokenizer.encode(s, add_bos=True)
+    # write to file
+    with open(fname_out, 'w') as f:
+        for x in res:
+            f.write(str(x) + ' ')
+        f.write('\n')
+    print('len(res): ', len(res))
+    print('len(lines): ', len(lines))
+    print('results written to: ', fname_out)
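The Python script mirrors the C++ change: when run with --fname-tok (e.g. python3 test-tokenizer-0.py dir_tokenizer --fname-tok prompt.txt), it tokenizes the same file with the SentencePiece reference tokenizer and writes the IDs to '<file>.tok'. The evident purpose is to compare the two dumps; below is a minimal comparison sketch, not part of this commit, assuming both tools were already run on the same prompt.txt (file names are assumptions):

# compare the two token dumps; both tools write space-separated token IDs
def read_tokens(path):
    with open(path) as f:
        return [int(t) for t in f.read().split()]

cpp_tokens = read_tokens('prompt.txt.tokcpp')  # produced by the C++ test
ref_tokens = read_tokens('prompt.txt.tok')     # produced by test-tokenizer-0.py

if cpp_tokens == ref_tokens:
    print('tokenizations match: %d tokens' % len(cpp_tokens))
else:
    # report the first position where the two tokenizations diverge
    for i, (a, b) in enumerate(zip(cpp_tokens, ref_tokens)):
        if a != b:
            print('first mismatch at position %d: %d vs %d' % (i, a, b))
            break
    print('lengths: %d vs %d' % (len(cpp_tokens), len(ref_tokens)))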