unicode : support \p{N}, \p{L} and \p{P} natively

This commit is contained in:
Georgi Gerganov 2024-04-27 17:48:38 +03:00
parent ce5485aee0
commit 91eaa414bf
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
7 changed files with 94 additions and 26 deletions

View file

@@ -111,7 +111,7 @@ if fname_tok:
# f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# else:
# f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
print('len(res): ', len(res))
print('len(lines): ', len(lines))
print('results written to: ', fname_out)