convert-hf-to-gguf-update.py: use triple quoted f-string instead

2024-05-02 01:47:41 +10:00 · 2024-05-02 01:47:41 +10:00 · 154ad1236e
commit 154ad1236e
parent 6d42f3d773
1 changed file with 41 additions and 40 deletions
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -146,46 +146,47 @@ for model in models:
    src_ifs += f"            # ref: {model['repo']}\n"
    src_ifs += f"            res = \"{name}\"\n"
-src_func = "" # noqa: E222
+src_func = f"""
-src_func +=  "    def get_vocab_base_pre(self, tokenizer) -> str:\n" # noqa: E222
+    def get_vocab_base_pre(self, tokenizer) -> str:
-src_func +=  "        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n" # noqa: E222
+        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
-src_func +=  "        # is specific for the BPE pre-tokenizer used by the model\n" # noqa: E222
+        # is specific for the BPE pre-tokenizer used by the model
-src_func +=  "        # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n" # noqa: E222
+        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
-src_func +=  "        # use in llama.cpp to implement the same pre-tokenizer\n" # noqa: E222
+        # use in llama.cpp to implement the same pre-tokenizer
-src_func +=  "\n" # noqa: E222
+
-src_func += f"        chktxt = {repr(chktxt)}\n" # noqa: E222
+        chktxt = {repr(chktxt)}
-src_func +=  "\n" # noqa: E222
+
-src_func +=  "        chktok = tokenizer.encode(chktxt)\n" # noqa: E222
+        chktok = tokenizer.encode(chktxt)
-src_func +=  "        chkhsh = sha256(str(chktok).encode()).hexdigest()\n" # noqa: E222
+        chkhsh = sha256(str(chktok).encode()).hexdigest()
-src_func +=  "\n" # noqa: E222
+
-src_func +=  "        print(f\"chktok: {chktok}\")\n" # noqa: E222
+        print(f"chktok: {{chktok}}")
-src_func +=  "        print(f\"chkhsh: {chkhsh}\")\n" # noqa: E222
+        print(f"chkhsh: {{chkhsh}}")
-src_func +=  "\n" # noqa: E222
+
-src_func +=  "        res = None\n" # noqa: E222
+        res = None
-src_func +=  "\n" # noqa: E222
+
-src_func +=  "        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n" # noqa: E222
+        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
-src_func +=  "        #       or pull the latest version of the model from Huggingface\n" # noqa: E222
+        #       or pull the latest version of the model from Huggingface
-src_func +=  "        #       don't edit the hashes manually!\n" # noqa: E222
+        #       don't edit the hashes manually!
-src_func += f"{src_ifs}\n" # noqa: E222
+{src_ifs}
-src_func +=  "        if res is None:\n" # noqa: E222
+        if res is None:
-src_func +=  "            print(\"\\n\")\n" # noqa: E222
+            print("\\n")
-src_func +=  "            print(\"**************************************************************************************\")\n" # noqa: E222
+            print("**************************************************************************************")
-src_func +=  "            print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n" # noqa: E222
+            print("** WARNING: The BPE pre-tokenizer was not recognized!")
-src_func +=  "            print(\"**          There are 2 possible reasons for this:\")\n" # noqa: E222
+            print("**          There are 2 possible reasons for this:")
-src_func +=  "            print(\"**          - the model has not been added to convert-hf-to-gguf-update.py yet\")\n" # noqa: E222
+            print("**          - the model has not been added to convert-hf-to-gguf-update.py yet")
-src_func +=  "            print(\"**          - the pre-tokenization config has changed upstream\")\n" # noqa: E222
+            print("**          - the pre-tokenization config has changed upstream")
-src_func +=  "            print(\"**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n" # noqa: E222
+            print("**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
-src_func +=  "            print(\"** ref:     https://github.com/ggerganov/llama.cpp/pull/6920\")\n" # noqa: E222
+            print("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
-src_func +=  "            print(\"**\")\n" # noqa: E222
+            print("**")
-src_func +=  "            print(f\"** chkhsh:  {chkhsh}\")\n" # noqa: E222
+            print(f"** chkhsh:  {{chkhsh}}")
-src_func +=  "            print(\"**************************************************************************************\")\n" # noqa: E222
+            print("**************************************************************************************")
-src_func +=  "            print(\"\\n\")\n" # noqa: E222
+            print("\\n")
-src_func +=  "            raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n" # noqa: E222
+            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
-src_func +=  "\n" # noqa: E222
+
-src_func +=  "        print(f\"tokenizer.ggml.pre: {res}\")\n" # noqa: E222
+        print(f"tokenizer.ggml.pre: {{repr(res)}}")
-src_func +=  "        print(f\"chkhsh: {chkhsh}\")\n" # noqa: E222
+        print(f"chkhsh: {{chkhsh}}")
-src_func +=  "\n" # noqa: E222
+
-src_func +=  "        return res\n" # noqa: E222
+        return res
+"""
 print(src_func) # noqa: NP100