convert-hf-to-gguf-update.py: write get_vocab_base_pre() into convert-hf-to-gguf.py

The update script rewrites the text between the "# Marker: Start get_vocab_base_pre"
and "# Marker: End get_vocab_base_pre" comments in convert-hf-to-gguf.py with the
generated src_func, instead of printing it for manual copy-paste.

Review notes:
- the target file is read and written as UTF-8 explicitly rather than with the
  platform default encoding
- the script exits with an error when no marker pair is found, so it never logs
  "was updated" for an untouched file
- indentation of context lines was restored from a whitespace-collapsed copy of
  this patch -- TODO confirm against the target tree (or apply with
  --ignore-whitespace)
- the post-image blob id on the first "index" line was not recomputed

diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index 14aa0c45a..bbd66464b 100755
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -20,11 +20,13 @@
 # - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
-# TODO: automate the update of convert-hf-to-gguf.py
 #
 
 import logging
 import os
+import pathlib
+import re
+
 import requests
 import sys
 import json
@@ -135,7 +137,6 @@ for model in models:
         download_file_with_auth(url, token, save_path)
 
 # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
-# TODO: auto-update convert-hf-to-gguf.py with the generated function
 
 src_ifs = ""
 for model in models:
@@ -224,11 +225,22 @@ src_func = f"""
         return res
 """
 
-print(src_func)  # noqa: NP100
+convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
+convert_py = convert_py_pth.read_text(encoding="utf-8")
+convert_py, n_replaced = re.subn(
+    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
+    lambda m: m.group(1) + src_func + m.group(3),
+    convert_py,
+    flags=re.DOTALL | re.MULTILINE,
+)
 
-logger.info("\n")
-logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
-logger.info("\n")
+# fail loudly instead of reporting success when the marker comments are missing
+if n_replaced == 0:
+    logger.error("--- get_vocab_base_pre markers not found in convert-hf-to-gguf.py")
+    sys.exit(1)
+
+convert_py_pth.write_text(convert_py, encoding="utf-8")
+
+logger.info("+++ convert-hf-to-gguf.py was updated")
 
 # generate tests for each tokenizer model
 
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index cd875fa4a..17cec08f7 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -402,6 +402,7 @@ class Model:
     # NOTE: this function is generated by convert-hf-to-gguf-update.py
     #       do not modify it manually!
     # ref:  https://github.com/ggerganov/llama.cpp/pull/6920
+    # Marker: Start get_vocab_base_pre
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
         # is specific for the BPE pre-tokenizer used by the model
@@ -489,6 +490,7 @@ class Model:
         logger.debug(f"chkhsh: {chkhsh}")
 
         return res
+        # Marker: End get_vocab_base_pre
 
     def _set_vocab_gpt2(self) -> None:
         tokens, toktypes, tokpre = self.get_vocab_base()