unicode : support \p{N}, \p{L} and \p{P} natively

This commit is contained in:
Georgi Gerganov 2024-04-27 17:48:38 +03:00
parent ce5485aee0
commit 91eaa414bf
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
7 changed files with 94 additions and 26 deletions

View file

@ -111,7 +111,7 @@ if fname_tok:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# else:
# f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
print('len(res): ', len(res))
print('len(lines): ', len(lines))
print('results written to: ', fname_out)

View file

@ -183,7 +183,7 @@ int main(int argc, char **argv) {
}
for (const auto & tok : res) {
ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
}
}