convert-hf-to-gguf-update: automate updating
This commit is contained in:
parent
9afdffe70e
commit
86016b7d8d
2 changed files with 16 additions and 6 deletions
|
@ -20,11 +20,13 @@
|
|||
# - Update llama.cpp with the new pre-tokenizer if necessary
|
||||
#
|
||||
# TODO: generate tokenizer tests for llama.cpp
|
||||
# TODO: automate the update of convert-hf-to-gguf.py
|
||||
#
|
||||
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
|
||||
import requests
|
||||
import sys
|
||||
import json
|
||||
|
@ -135,7 +137,6 @@ for model in models:
|
|||
download_file_with_auth(url, token, save_path)
|
||||
|
||||
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
|
||||
# TODO: auto-update convert-hf-to-gguf.py with the generated function
|
||||
|
||||
src_ifs = ""
|
||||
for model in models:
|
||||
|
@ -224,11 +225,18 @@ src_func = f"""
|
|||
return res
|
||||
"""
|
||||
|
||||
print(src_func) # noqa: NP100
|
||||
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
|
||||
convert_py = convert_py_pth.read_text()
|
||||
convert_py = re.sub(
|
||||
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
|
||||
lambda m: m.group(1) + src_func + m.group(3),
|
||||
convert_py,
|
||||
flags=re.DOTALL | re.MULTILINE,
|
||||
)
|
||||
|
||||
logger.info("\n")
|
||||
logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
|
||||
logger.info("\n")
|
||||
convert_py_pth.write_text(convert_py)
|
||||
|
||||
logger.info("+++ convert-hf-to-gguf.py was updated")
|
||||
|
||||
# generate tests for each tokenizer model
|
||||
|
||||
|
|
|
@ -402,6 +402,7 @@ class Model:
|
|||
# NOTE: this function is generated by convert-hf-to-gguf-update.py
|
||||
# do not modify it manually!
|
||||
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
|
||||
# Marker: Start get_vocab_base_pre
|
||||
def get_vocab_base_pre(self, tokenizer) -> str:
|
||||
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
||||
# is specific for the BPE pre-tokenizer used by the model
|
||||
|
@ -489,6 +490,7 @@ class Model:
|
|||
logger.debug(f"chkhsh: {chkhsh}")
|
||||
|
||||
return res
|
||||
# Marker: End get_vocab_base_pre
|
||||
|
||||
def _set_vocab_gpt2(self) -> None:
|
||||
tokens, toktypes, tokpre = self.get_vocab_base()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue