Minor improvements in GPT2 tokenizer (#3567)

* Fixing minor bugs in bpe_gpt2_preprocess

* Don't add bos token in test
goerch, 2023-10-10 18:59:52 +02:00 (committed by GitHub)
commit 233fc1c69f, parent c5b49360d0
5 changed files with 17 additions and 20 deletions
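
For context: bpe_gpt2_preprocess is llama.cpp's hand-written reimplementation of the GPT-2 pre-tokenization step, which splits input into contraction, word, number, punctuation and whitespace pieces before the byte-level BPE merges run. Below is a minimal Python sketch of that split using the reference pattern from openai/gpt-2's encoder.py; treating it as equivalent to the C++ code is an assumption here.

import regex as re  # third-party 'regex' module, needed for \p{L} / \p{N}

# Reference GPT-2 pre-tokenization pattern from openai/gpt-2 encoder.py.
gpt2_split = re.compile(
    r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

# The two test strings added in the diff below exercise corners of this split:
print(gpt2_split.findall("\n ="))   # ['\n', ' ='] - the space attaches to '=', the newline stays alone
print(gpt2_split.findall("' era"))  # ["'", ' era'] - the quote splits off, since "'e" is no contraction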

@@ -41,6 +41,8 @@ tests = [
         "   Hello",
         "    Hello",
         "    Hello\n    Hello",
+        "\n =",
+        "' era",
     ]
 
 for text in tests:
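
Both added strings probe splits a hand-rolled pre-tokenizer can get wrong: "\n =" mixes a newline, a space and punctuation, while "' era" opens with an apostrophe that must not be fused to the next word as a contraction. A quick round-trip check against the Hugging Face side of the comparison (assuming this is the AutoTokenizer-based variant of the script; 'gpt2' stands in for the tokenizer directory the script takes as an argument):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('gpt2')  # stand-in for the script's tokenizer dir

for text in ["\n =", "' era"]:
    ids = tok.encode(text)
    # GPT-2's byte-level BPE is lossless and adds no BOS/EOS by default,
    # so decoding must reproduce the input exactly.
    assert tok.decode(ids) == text, (text, ids)
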
@@ -69,15 +71,14 @@ fname_tok = args.fname_tok
 if fname_tok:
     print('tokenizing file: ', fname_tok)
     fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r') as f:
+    with open(fname_tok, 'r', encoding='utf-8') as f:
         lines = f.readlines()
     s = ''.join(lines)
     res = tokenizer.encode(s)
     # write to file
-    with open(fname_out, 'w') as f:
+    with open(fname_out, 'w', encoding='utf-8') as f:
         for x in res:
-            f.write(str(x) + ' ')
-        f.write('\n')
+            f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
     print('len(res): ', len(res))
     print('len(lines): ', len(lines))
     print('results written to: ', fname_out)
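
Two things change in this hunk. The explicit encoding='utf-8' makes the script independent of the platform's locale encoding; without it, open() on Windows defaults to the active code page, and non-ASCII test data can raise UnicodeDecodeError. And the output loop now writes one token per line alongside its decoded piece, so the .tok file can be diffed token by token against the C++ tokenizer's output instead of being a single line of ids. A sketch of the resulting format (the ids are the usual GPT-2 ids for "Hello world", shown for illustration only):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('gpt2')  # stand-in for the script's tokenizer dir
for x in tok.encode("Hello world"):
    print(str(x) + ' \'' + tok.decode(x) + '\'')

# Expected output with the GPT-2 vocabulary:
# 15496 'Hello'
# 995 ' world'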