convert-hf-to-gguf-update: automate updating
commit 86016b7d8d (parent 9afdffe70e)
2 changed files with 16 additions and 6 deletions
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -20,11 +20,13 @@
 # - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
-# TODO: automate the update of convert-hf-to-gguf.py
 #
 
 import logging
 import os
+import pathlib
+import re
+
 import requests
 import sys
 import json
@@ -135,7 +137,6 @@ for model in models:
     download_file_with_auth(url, token, save_path)
 
 # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
-# TODO: auto-update convert-hf-to-gguf.py with the generated function
 
 src_ifs = ""
 for model in models:
@@ -224,11 +225,18 @@ src_func = f"""
         return res
 """
 
-print(src_func) # noqa: NP100
+convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
+convert_py = convert_py_pth.read_text()
+convert_py = re.sub(
+    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
+    lambda m: m.group(1) + src_func + m.group(3),
+    convert_py,
+    flags=re.DOTALL | re.MULTILINE,
+)
 
-logger.info("\n")
-logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
-logger.info("\n")
+convert_py_pth.write_text(convert_py)
+
+logger.info("+++ convert-hf-to-gguf.py was updated")
 
 # generate tests for each tokenizer model
 
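The substantive change above is marker-based regeneration: instead of printing the generated get_vocab_base_pre() and asking the user to copy-paste it, the update script now splices it into convert-hf-to-gguf.py with re.sub, between a Start/End marker pair. Below is a minimal, self-contained sketch of the same technique; the file name, marker text, and generated body are illustrative stand-ins, not values from this commit.

import pathlib
import re

# Illustrative target file containing a marked region (a stand-in for
# convert-hf-to-gguf.py and its get_vocab_base_pre markers).
target = pathlib.Path("example.py")
target.write_text(
    "# Marker: Start block\n"
    "    old body\n"
    "    # Marker: End block\n"
)

generated = "\n    new generated body\n"  # stand-in for src_func

# Keep the markers (groups 1 and 3) and replace everything between them.
# re.DOTALL lets the lazy ".+?" match across newlines and stop at the
# first end marker.
updated = re.sub(
    r"(# Marker: Start block)(.+?)( +# Marker: End block)",
    lambda m: m.group(1) + generated + m.group(3),
    target.read_text(),
    flags=re.DOTALL,
)
target.write_text(updated)

Using a callable replacement (the lambda, as the commit itself does) rather than a plain replacement string avoids re.sub re-interpreting backslashes in the generated source as backreferences.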
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -402,6 +402,7 @@ class Model:
     # NOTE: this function is generated by convert-hf-to-gguf-update.py
     # do not modify it manually!
     # ref: https://github.com/ggerganov/llama.cpp/pull/6920
+    # Marker: Start get_vocab_base_pre
    def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
         # is specific for the BPE pre-tokenizer used by the model
@@ -489,6 +490,7 @@ class Model:
         logger.debug(f"chkhsh: {chkhsh}")
 
         return res
+    # Marker: End get_vocab_base_pre
 
     def _set_vocab_gpt2(self) -> None:
         tokens, toktypes, tokpre = self.get_vocab_base()
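The markers delimit the region that convert-hf-to-gguf-update.py overwrites on each run, so manual edits inside it are lost. The fingerprinting idea described in the comments above can be sketched as follows; the probe string and toy tokenizer are illustrative stand-ins (the real function encodes a fixed test string with the model's Hugging Face tokenizer and compares the hash against a list of known pre-tokenizers).

from hashlib import sha256

def toy_encode(text: str) -> list[int]:
    # Stand-in for tokenizer.encode(); a real BPE pre-tokenizer splits the
    # text by its own rules before assigning token ids.
    return [len(word) for word in text.split()]

# Hash the token ids produced for a fixed probe string: tokenizers that
# pre-tokenize differently yield different id sequences and thus different
# hashes, so the hash identifies the pre-tokenizer.
chktxt = "Hello World!\n\nexample text 123"  # illustrative probe, not the real one
chktok = toy_encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
print(chkhsh)  # compared against the table of known hashes to choose `res`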