tests : add option to tokenize text files

ggml-ci

commit e4324cbd4d (parent 70005bd5c9)
2 changed files with 61 additions and 1 deletion
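The C++ test now takes an optional second argument (usage: "test-tokenizer-0 vocab-file [text-file]"): when a text file is given, its whole contents are tokenized and the token ids are written to <text-file>.tokcpp. The Python script gains the matching --fname-tok option and writes its ids to <text-file>.tok, so the two tokenizers can be compared on identical input.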
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -5,6 +5,7 @@
 #include <string>
 #include <map>
 #include <vector>
+#include <fstream>
 
 // generate using test-tokenizer-0.py
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
@@ -41,12 +42,17 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
 
 int main(int argc, char **argv) {
     if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
         return 1;
     }
 
     const std::string fname = argv[1];
 
+    std::string fname_text;
+    if (argc > 2) {
+        fname_text = argv[2];
+    }
+
     fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
 
     llama_model * model;
@@ -131,6 +137,42 @@ int main(int argc, char **argv) {
         }
     }
 
+    if (!fname_text.empty()) {
+        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
+
+        std::string text;
+        {
+            std::ifstream ifs(fname_text);
+            if (!ifs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
+                return 1;
+            }
+            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
+        }
+
+        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
+
+        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
+
+        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
+
+        {
+            const std::string fname_out = fname_text + ".tokcpp";
+
+            std::ofstream ofs(fname_out);
+            if (!ofs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+                return 1;
+            }
+
+            for (const auto & tok : res) {
+                ofs << tok << " ";
+            }
+
+            ofs << "\n";
+        }
+    }
+
     llama_free_model(model);
     llama_free(ctx);
 
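The output format is deliberately trivial: token ids separated by single spaces on one line. A minimal sketch of a reader for the .tokcpp file the C++ test produces (the input path below is a hypothetical example):

    # read the space-separated token ids written by test-tokenizer-0
    # 'prompt.txt.tokcpp' is a hypothetical example path
    with open('prompt.txt.tokcpp', 'r') as f:
        tokens = [int(t) for t in f.read().split()]
    print('read', len(tokens), 'tokens')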
--- a/tests/test-tokenizer-0.py
+++ b/tests/test-tokenizer-0.py
@@ -6,6 +6,7 @@ from sentencepiece import SentencePieceProcessor
 
 parser = argparse.ArgumentParser()
 parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
+parser.add_argument("--fname-tok", help="path to a text file to tokenize")
 args = parser.parse_args()
 
 dir_tokenizer = args.dir_tokenizer
@@ -68,3 +69,20 @@ for text in tests:
     for x in res:
         print("%7d," % x, end='')
     print(" }, },")
+
+fname_tok = args.fname_tok
+if fname_tok:
+    print('tokenizing file: ', fname_tok)
+    fname_out = fname_tok + '.tok'
+    with open(fname_tok, 'r') as f:
+        lines = f.readlines()
+        s = ''.join(lines)
+        res = tokenizer.encode(s, add_bos=True)
+        # write to file
+        with open(fname_out, 'w') as f:
+            for x in res:
+                f.write(str(x) + ' ')
+            f.write('\n')
+        print('len(res): ', len(res))
+        print('len(lines): ', len(lines))
+    print('results written to: ', fname_out)
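With both files in place, checking that the C++ and Python tokenizers agree on the same input reduces to comparing two lists of ids. A minimal comparison sketch, assuming both tools were run on the same text file (both paths are hypothetical examples):

    # compare the ids written by the C++ test (.tokcpp) and the
    # Python script (.tok); both paths are hypothetical examples
    def read_tokens(path):
        with open(path) as f:
            return [int(t) for t in f.read().split()]

    cpp = read_tokens('prompt.txt.tokcpp')
    py  = read_tokens('prompt.txt.tok')

    if cpp == py:
        print('tokenizations match:', len(cpp), 'tokens')
    else:
        # report the first index where the two runs disagree
        n = min(len(cpp), len(py))
        i = next((i for i in range(n) if cpp[i] != py[i]), n)
        print('first mismatch at token index', i)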