convert-hf-to-gguf-update: automate updating

Aarni Koskela 2024-05-17 13:43:12 +03:00
parent 9afdffe70e
commit 86016b7d8d
2 changed files with 16 additions and 6 deletions

convert-hf-to-gguf-update.py

@@ -20,11 +20,13 @@
 #   - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
-# TODO: automate the update of convert-hf-to-gguf.py
 #

 import logging
 import os
+import pathlib
+import re
+
 import requests
 import sys
 import json
@@ -135,7 +137,6 @@ for model in models:
     download_file_with_auth(url, token, save_path)

 # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
-# TODO: auto-update convert-hf-to-gguf.py with the generated function

 src_ifs = ""
 for model in models:
@@ -224,11 +225,18 @@ src_func = f"""
         return res
 """

-print(src_func) # noqa: NP100
+convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
+convert_py = convert_py_pth.read_text()
+convert_py = re.sub(
+    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
+    lambda m: m.group(1) + src_func + m.group(3),
+    convert_py,
+    flags=re.DOTALL | re.MULTILINE,
+)
+
+convert_py_pth.write_text(convert_py)

-logger.info("\n")
-logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
-logger.info("\n")
+logger.info("+++ convert-hf-to-gguf.py was updated")

 # generate tests for each tokenizer model
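
The splice above is a marker-delimited regex replacement: everything between the Start/End marker comments in convert-hf-to-gguf.py is swapped for the freshly generated get_vocab_base_pre() source, and the markers themselves (groups 1 and 3) are kept so the next run can update the file again. A minimal standalone sketch of the same technique (the demo file name and stand-in contents are invented for illustration; the regex and flags mirror the diff):

import pathlib
import re

# Stand-in for convert-hf-to-gguf.py, containing the marker comments.
demo_pth = pathlib.Path("demo.py")
demo_pth.write_text(
    "# Marker: Start get_vocab_base_pre\n"
    "    old generated code\n"
    "    # Marker: End get_vocab_base_pre\n"
)

src_func = "\n    new generated code\n"

# re.DOTALL lets the non-greedy (.+?) span newlines, so it consumes the
# whole old body between the two markers. Using a lambda instead of a
# template string keeps any backslashes in src_func from being read as
# group references or escapes.
new_text = re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
    lambda m: m.group(1) + src_func + m.group(3),
    demo_pth.read_text(),
    flags=re.DOTALL | re.MULTILINE,
)
demo_pth.write_text(new_text)

After running this, demo.py contains the new body between unchanged markers, which is exactly why the generated function in convert-hf-to-gguf.py can be regenerated repeatedly without manual copy-pasting.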

convert-hf-to-gguf.py

@@ -402,6 +402,7 @@ class Model:
     # NOTE: this function is generated by convert-hf-to-gguf-update.py
     #       do not modify it manually!
     # ref:  https://github.com/ggerganov/llama.cpp/pull/6920
+    # Marker: Start get_vocab_base_pre
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
         # is specific for the BPE pre-tokenizer used by the model
@@ -489,6 +490,7 @@ class Model:
         logger.debug(f"chkhsh: {chkhsh}")

         return res
+    # Marker: End get_vocab_base_pre

     def _set_vocab_gpt2(self) -> None:
         tokens, toktypes, tokpre = self.get_vocab_base()
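
For context on the function being regenerated: per the comment in the hunk above, get_vocab_base_pre() identifies a model's BPE pre-tokenizer by encoding a fixed test string and hashing the resulting tokens into the chkhsh value it logs. A minimal sketch of that fingerprinting idea, with the helper name and the exact hashing details assumed rather than taken from this diff:

from hashlib import sha256

def pre_tokenizer_fingerprint(tokenizer, chktxt: str) -> str:
    # Encode a fixed, deliberately tricky test string; tokenizers with
    # different pre-tokenization rules should produce different token IDs.
    chktok = tokenizer.encode(chktxt)
    # Hash the token ID list into a short, stable string that can be
    # compared against the known values in get_vocab_base_pre().
    return sha256(str(chktok).encode()).hexdigest()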