py : switch to snake_case (#8305)
* py : switch to snake_case ggml-ci * cont ggml-ci * cont ggml-ci * cont : fix link * gguf-py : use snake_case in scripts entrypoint export * py : rename requirements for convert_legacy_llama.py Needed for scripts/check-requirements.sh --------- Co-authored-by: Francis Couture-Harpin <git@compilade.net>
This commit is contained in:
parent
f09b7cb609
commit
e235b267a2
32 changed files with 69 additions and 104 deletions
|
@ -2,7 +2,7 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# This script downloads the tokenizer models of the specified models from Huggingface and
|
||||
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
|
||||
# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
|
||||
#
|
||||
# This is necessary in order to analyze the type of pre-tokenizer used by the model and
|
||||
# provide the necessary information to llama.cpp via the GGUF header in order to implement
|
||||
|
@ -15,9 +15,9 @@
|
|||
# - Add a new model to the "models" list
|
||||
# - Run the script with your huggingface token:
|
||||
#
|
||||
# python3 convert-hf-to-gguf-update.py <huggingface_token>
|
||||
# python3 convert_hf_to_gguf_update.py <huggingface_token>
|
||||
#
|
||||
# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
|
||||
# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
|
||||
# - Update llama.cpp with the new pre-tokenizer if necessary
|
||||
#
|
||||
# TODO: generate tokenizer tests for llama.cpp
|
||||
|
@ -37,7 +37,7 @@ from enum import IntEnum, auto
|
|||
from transformers import AutoTokenizer
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
logger = logging.getLogger("convert-hf-to-gguf-update")
|
||||
logger = logging.getLogger("convert_hf_to_gguf_update")
|
||||
sess = requests.Session()
|
||||
|
||||
|
||||
|
@ -56,10 +56,10 @@ if len(sys.argv) == 2:
|
|||
token = sys.argv[1]
|
||||
if not token.startswith("hf_"):
|
||||
logger.info("Huggingface token seems invalid")
|
||||
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
|
||||
logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
|
||||
sys.exit(1)
|
||||
else:
|
||||
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
|
||||
logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
|
||||
sys.exit(1)
|
||||
|
||||
# TODO: add models here, base models preferred
|
||||
|
@ -134,7 +134,7 @@ for model in models:
|
|||
logger.error(f"Failed to download model {model['name']}. Error: {e}")
|
||||
|
||||
|
||||
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
|
||||
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
|
||||
|
||||
src_ifs = ""
|
||||
for model in models:
|
||||
|
@ -201,7 +201,7 @@ src_func = f"""
|
|||
|
||||
res = None
|
||||
|
||||
# NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
|
||||
# NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
|
||||
# or pull the latest version of the model from Huggingface
|
||||
# don't edit the hashes manually!
|
||||
{src_ifs}
|
||||
|
@ -210,9 +210,9 @@ src_func = f"""
|
|||
logger.warning("**************************************************************************************")
|
||||
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
|
||||
logger.warning("** There are 2 possible reasons for this:")
|
||||
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
|
||||
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
|
||||
logger.warning("** - the pre-tokenization config has changed upstream")
|
||||
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
|
||||
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
|
||||
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
|
||||
logger.warning("**")
|
||||
logger.warning(f"** chkhsh: {{chkhsh}}")
|
||||
|
@ -226,7 +226,7 @@ src_func = f"""
|
|||
return res
|
||||
"""
|
||||
|
||||
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
|
||||
convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
|
||||
convert_py = convert_py_pth.read_text(encoding="utf-8")
|
||||
convert_py = re.sub(
|
||||
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
|
||||
|
@ -237,7 +237,7 @@ convert_py = re.sub(
|
|||
|
||||
convert_py_pth.write_text(convert_py, encoding="utf-8")
|
||||
|
||||
logger.info("+++ convert-hf-to-gguf.py was updated")
|
||||
logger.info("+++ convert_hf_to_gguf.py was updated")
|
||||
|
||||
# generate tests for each tokenizer model
|
||||
|
||||
|
@ -343,6 +343,6 @@ logger.info("\nRun the following commands to generate the vocab files for testin
|
|||
for model in models:
|
||||
name = model["name"]
|
||||
|
||||
print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
|
||||
print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
|
||||
|
||||
logger.info("\n")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue