convert-hf-to-gguf-update.py: use triple quoted f-string instead
This commit is contained in:
parent
6d42f3d773
commit
154ad1236e
1 changed file with 41 additions and 40 deletions
|
@@ -146,46 +146,47 @@ for model in models:
|
||||||
src_ifs += f" # ref: {model['repo']}\n"
|
src_ifs += f" # ref: {model['repo']}\n"
|
||||||
src_ifs += f" res = \"{name}\"\n"
|
src_ifs += f" res = \"{name}\"\n"
|
||||||
|
|
||||||
src_func = "" # noqa: E222
|
src_func = f"""
|
||||||
src_func += " def get_vocab_base_pre(self, tokenizer) -> str:\n" # noqa: E222
|
def get_vocab_base_pre(self, tokenizer) -> str:
|
||||||
src_func += " # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n" # noqa: E222
|
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
||||||
src_func += " # is specific for the BPE pre-tokenizer used by the model\n" # noqa: E222
|
# is specific for the BPE pre-tokenizer used by the model
|
||||||
src_func += " # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n" # noqa: E222
|
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
|
||||||
src_func += " # use in llama.cpp to implement the same pre-tokenizer\n" # noqa: E222
|
# use in llama.cpp to implement the same pre-tokenizer
|
||||||
src_func += "\n" # noqa: E222
|
|
||||||
src_func += f" chktxt = {repr(chktxt)}\n" # noqa: E222
|
chktxt = {repr(chktxt)}
|
||||||
src_func += "\n" # noqa: E222
|
|
||||||
src_func += " chktok = tokenizer.encode(chktxt)\n" # noqa: E222
|
chktok = tokenizer.encode(chktxt)
|
||||||
src_func += " chkhsh = sha256(str(chktok).encode()).hexdigest()\n" # noqa: E222
|
chkhsh = sha256(str(chktok).encode()).hexdigest()
|
||||||
src_func += "\n" # noqa: E222
|
|
||||||
src_func += " print(f\"chktok: {chktok}\")\n" # noqa: E222
|
print(f"chktok: {{chktok}}")
|
||||||
src_func += " print(f\"chkhsh: {chkhsh}\")\n" # noqa: E222
|
print(f"chkhsh: {{chkhsh}}")
|
||||||
src_func += "\n" # noqa: E222
|
|
||||||
src_func += " res = None\n" # noqa: E222
|
res = None
|
||||||
src_func += "\n" # noqa: E222
|
|
||||||
src_func += " # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n" # noqa: E222
|
# NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
|
||||||
src_func += " # or pull the latest version of the model from Huggingface\n" # noqa: E222
|
# or pull the latest version of the model from Huggingface
|
||||||
src_func += " # don't edit the hashes manually!\n" # noqa: E222
|
# don't edit the hashes manually!
|
||||||
src_func += f"{src_ifs}\n" # noqa: E222
|
{src_ifs}
|
||||||
src_func += " if res is None:\n" # noqa: E222
|
if res is None:
|
||||||
src_func += " print(\"\\n\")\n" # noqa: E222
|
print("\\n")
|
||||||
src_func += " print(\"**************************************************************************************\")\n" # noqa: E222
|
print("**************************************************************************************")
|
||||||
src_func += " print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n" # noqa: E222
|
print("** WARNING: The BPE pre-tokenizer was not recognized!")
|
||||||
src_func += " print(\"** There are 2 possible reasons for this:\")\n" # noqa: E222
|
print("** There are 2 possible reasons for this:")
|
||||||
src_func += " print(\"** - the model has not been added to convert-hf-to-gguf-update.py yet\")\n" # noqa: E222
|
print("** - the model has not been added to convert-hf-to-gguf-update.py yet")
|
||||||
src_func += " print(\"** - the pre-tokenization config has changed upstream\")\n" # noqa: E222
|
print("** - the pre-tokenization config has changed upstream")
|
||||||
src_func += " print(\"** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n" # noqa: E222
|
print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
|
||||||
src_func += " print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n" # noqa: E222
|
print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
|
||||||
src_func += " print(\"**\")\n" # noqa: E222
|
print("**")
|
||||||
src_func += " print(f\"** chkhsh: {chkhsh}\")\n" # noqa: E222
|
print(f"** chkhsh: {{chkhsh}}")
|
||||||
src_func += " print(\"**************************************************************************************\")\n" # noqa: E222
|
print("**************************************************************************************")
|
||||||
src_func += " print(\"\\n\")\n" # noqa: E222
|
print("\\n")
|
||||||
src_func += " raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n" # noqa: E222
|
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
|
||||||
src_func += "\n" # noqa: E222
|
|
||||||
src_func += " print(f\"tokenizer.ggml.pre: {res}\")\n" # noqa: E222
|
print(f"tokenizer.ggml.pre: {{repr(res)}}")
|
||||||
src_func += " print(f\"chkhsh: {chkhsh}\")\n" # noqa: E222
|
print(f"chkhsh: {{chkhsh}}")
|
||||||
src_func += "\n" # noqa: E222
|
|
||||||
src_func += " return res\n" # noqa: E222
|
return res
|
||||||
|
"""
|
||||||
|
|
||||||
print(src_func) # noqa: NP100
|
print(src_func) # noqa: NP100
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue