py : cleanup the code
- use f-strings where possible - drop first param of encode/decode functions since "utf-8" is the default
This commit is contained in:
parent
9733104be5
commit
cbef542879
6 changed files with 27 additions and 29 deletions
|
@ -49,7 +49,7 @@ def write_header(f_out, header):
|
|||
def write_tokens(fout, tokenizer):
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
if tokenizer.is_unknown(i):
|
||||
text = " \u2047 ".encode("utf-8")
|
||||
text = " \u2047 ".encode()
|
||||
elif tokenizer.is_control(i):
|
||||
text = b""
|
||||
elif tokenizer.is_byte(i):
|
||||
|
@ -60,13 +60,13 @@ def write_tokens(fout, tokenizer):
|
|||
byte_value = int(piece[3:-1], 16)
|
||||
text = struct.pack("B", byte_value)
|
||||
else:
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
||||
|
||||
# TODO: GPT4All - add extra <pad> token
|
||||
text = "<pad>".encode("utf-8")
|
||||
text = "<pad>".encode()
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", 0.0))
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue