Minor improvements in GPT2 tokenizer (#3567)
* Fixing minor bugs in bpe_gpt2_preprocess
* Don't add bos token in test
This commit is contained in:
parent
c5b49360d0
commit
233fc1c69f
5 changed files with 17 additions and 20 deletions
|
@ -36,6 +36,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
|||
{ " Hello" , { 258, 23090, }, },
|
||||
{ " Hello" , { 466, 23090, }, },
|
||||
{ " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
|
||||
{ "\n =" , { 1212, 40, }, },
|
||||
{ "' era" , { 18, 4932, }, },
|
||||
};
|
||||
|
||||
return _k_tests;
|
||||
|
@ -155,7 +157,7 @@ int main(int argc, char **argv) {
|
|||
|
||||
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
|
||||
|
||||
const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
|
||||
const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
|
||||
|
||||
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
|
||||
|
||||
|
@ -169,10 +171,8 @@ int main(int argc, char **argv) {
|
|||
}
|
||||
|
||||
for (const auto & tok : res) {
|
||||
ofs << tok << " ";
|
||||
ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
|
||||
}
|
||||
|
||||
ofs << "\n";
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue