py : cleanup the code

- use f-strings where possible
- drop first param of encode/decode functions since "utf-8" is the default
2023-03-29 21:31:24 +02:00 · 2023-03-29 21:31:24 +02:00 · cbef542879
commit cbef542879
parent 9733104be5
6 changed files with 27 additions and 29 deletions
--- a/convert-gpt4all-to-ggml.py
+++ b/convert-gpt4all-to-ggml.py
@@ -49,7 +49,7 @@ def write_header(f_out, header):
 def write_tokens(fout, tokenizer):
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode("utf-8")
+            text = " \u2047 ".encode()
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
@@ -60,13 +60,13 @@ def write_tokens(fout, tokenizer):
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))

    # TODO: GPT4All - add extra <pad> token
-    text = "<pad>".encode("utf-8")
+    text = "<pad>".encode()
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
    fout.write(struct.pack("f", 0.0))