This commit is contained in:
Georgi Gerganov 2024-04-27 11:33:16 +03:00
parent ad929833cb
commit 4434c9d6c2
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
2 changed files with 19 additions and 4 deletions

View file

@@ -96,6 +96,21 @@ if fname_tok:
# Dump every entry of `res` to `fname_out`, one per line, formatted as
#     <id> '<decoded text>'
#
# NOTE: LLaMA v3 for some reason strips the space for some tokens, among
# them ids 662, 1174, 2564, 758, 949 and 5354 (and others). A workaround,
# currently disabled, special-cased exactly those ids and wrote them with
# an extra space after the opening quote:
#     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
# Every other id used the plain form that is emitted below.
with open(fname_out, 'w', encoding='utf-8') as f:
    f.writelines(
        str(x) + " '" + tokenizer.decode(x) + "'\n"
        for x in res
    )

# Report the sizes of `res` and `lines`.
print('len(res): ', len(res))
print('len(lines): ', len(lines))