*.py: fix flake8 warnings

brian khuu 2024-04-30 02:36:00 +10:00
parent 5e5e74e3b8
commit fcc5a5e0fe
6 changed files with 101 additions and 93 deletions

.flake8

@@ -1,4 +1,4 @@
 [flake8]
 max-line-length = 125
 ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
-exclude = examples/*,examples/*/**,*/**/__init__.py,convert-hf-to-gguf-update.py
+exclude = examples/*,examples/*/**,*/**/__init__.py

convert-hf-to-gguf-update.py

@@ -21,6 +21,7 @@
 # TODO: automate the update of convert-hf-to-gguf.py
 #
+import logging
 import os
 import requests
 import sys
@@ -28,12 +29,17 @@ import json
 from hashlib import sha256
 from enum import IntEnum, auto
+from transformers import AutoTokenizer
+
+logger = logging.getLogger("convert-hf-to-gguf-update")
+

 class TOKENIZER_TYPE(IntEnum):
     SPM = auto()
     BPE = auto()
     WPM = auto()

 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
 chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
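The script now emits all of its output through a named module logger rather than bare print() calls. Below is a minimal sketch of how such a logger is typically wired up and used; the basicConfig call and the chosen level are assumptions, since the one-time configuration is not visible in this hunk:

import logging

logger = logging.getLogger("convert-hf-to-gguf-update")

# Assumed one-time setup (not part of this diff): emit INFO and above
# to stderr using the default record format.
logging.basicConfig(level=logging.INFO)

logger.info("starting tokenizer download")  # replaces print(...)
logger.warning("something looks off")       # higher-severity channel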
@@ -41,7 +47,7 @@
 if len(sys.argv) == 2:
     token = sys.argv[1]
 else:
-    print("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
     sys.exit(1)

 # TODO: add models here, base models preferred
@@ -62,15 +68,17 @@ models = [
 if not os.path.exists("models/tokenizers"):
     os.makedirs("models/tokenizers")

 def download_file_with_auth(url, token, save_path):
     headers = {"Authorization": f"Bearer {token}"}
     response = requests.get(url, headers=headers)
     if response.status_code == 200:
         with open(save_path, 'wb') as f:
             f.write(response.content)
-        print(f"File {save_path} downloaded successfully")
+        logger.info(f"File {save_path} downloaded successfully")
     else:
-        print(f"Failed to download file. Status code: {response.status_code}")
+        logger.info(f"Failed to download file. Status code: {response.status_code}")

 # download the tokenizer models
 for model in models:
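For context, a hypothetical call to the helper above; the URL, token, and save path are illustrative values only. Note that the failure branch also logs at info level; logger.error would be the stricter choice, but the commit keeps a single level throughout this script.

# Hypothetical usage of download_file_with_auth; every value is an example.
download_file_with_auth(
    url="https://huggingface.co/some-org/some-model/raw/main/tokenizer.json",
    token="hf_xxxxxxxx",  # a Hugging Face access token
    save_path="models/tokenizers/some-model/tokenizer.json",
)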
@@ -81,10 +89,10 @@ for model in models:
     if not os.path.exists(f"models/tokenizers/{name}"):
         os.makedirs(f"models/tokenizers/{name}")
     else:
-        print(f"Directory models/tokenizers/{name} already exists - skipping")
+        logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
         continue

-    print(f"Downloading {name} to models/tokenizers/{name}")
+    logger.info(f"Downloading {name} to models/tokenizers/{name}")

     url = f"{repo}/raw/main/config.json"
     save_path = f"models/tokenizers/{name}/config.json"
@@ -115,76 +123,75 @@ for model in models:
         continue

     # create the tokenizer
-    from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

     chktok = tokenizer.encode(chktxt)
     chkhsh = sha256(str(chktok).encode()).hexdigest()

-    print(f"model: {name}")
-    print(f"tokt: {tokt}")
-    print(f"repo: {model['repo']}")
-    print(f"chktok: {chktok}")
-    print(f"chkhsh: {chkhsh}")
+    logger.info(f"model: {name}")
+    logger.info(f"tokt: {tokt}")
+    logger.info(f"repo: {model['repo']}")
+    logger.info(f"chktok: {chktok}")
+    logger.info(f"chkhsh: {chkhsh}")

     # print the "pre_tokenizer" content from the tokenizer.json
     with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
         cfg = json.load(f)
         pre_tokenizer = cfg["pre_tokenizer"]
-        print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))

-    print(f"\n")
+    logger.info("")

     src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
     src_ifs += f"            # ref: {model['repo']}\n"
     src_ifs += f"            res = \"{name}\"\n"
src_func = "" src_func = "" # noqa: E222
src_func += " def get_vocab_base_pre(self, tokenizer) -> str:\n" src_func += " def get_vocab_base_pre(self, tokenizer) -> str:\n" # noqa: E222
src_func += " # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n" src_func += " # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n" # noqa: E222
src_func += " # is specific for the BPE pre-tokenizer used by the model\n" src_func += " # is specific for the BPE pre-tokenizer used by the model\n" # noqa: E222
src_func += " # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n" src_func += " # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n" # noqa: E222
src_func += " # use in llama.cpp to implement the same pre-tokenizer\n" src_func += " # use in llama.cpp to implement the same pre-tokenizer\n" # noqa: E222
src_func += "\n" src_func += "\n" # noqa: E222
src_func += f" chktxt = {repr(chktxt)}\n" src_func += f" chktxt = {repr(chktxt)}\n" # noqa: E222
src_func += "\n" src_func += "\n" # noqa: E222
src_func += " chktok = tokenizer.encode(chktxt)\n" src_func += " chktok = tokenizer.encode(chktxt)\n" # noqa: E222
src_func += " chkhsh = sha256(str(chktok).encode()).hexdigest()\n" src_func += " chkhsh = sha256(str(chktok).encode()).hexdigest()\n" # noqa: E222
src_func += "\n" src_func += "\n" # noqa: E222
src_func += " print(f\"chktok: {chktok}\")\n" src_func += " print(f\"chktok: {chktok}\")\n" # noqa: E222
src_func += " print(f\"chkhsh: {chkhsh}\")\n" src_func += " print(f\"chkhsh: {chkhsh}\")\n" # noqa: E222
src_func += "\n" src_func += "\n" # noqa: E222
src_func += " res = None\n" src_func += " res = None\n" # noqa: E222
src_func += "\n" src_func += "\n" # noqa: E222
src_func += " # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n" src_func += " # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n" # noqa: E222
src_func += " # or pull the latest version of the model from Huggingface\n" src_func += " # or pull the latest version of the model from Huggingface\n" # noqa: E222
src_func += " # don't edit the hashes manually!\n" src_func += " # don't edit the hashes manually!\n" # noqa: E222
src_func += f"{src_ifs}\n" src_func += f"{src_ifs}\n" # noqa: E222
src_func += " if res is None:\n" src_func += " if res is None:\n" # noqa: E222
src_func += " print(\"\\n\")\n" src_func += " print(\"\\n\")\n" # noqa: E222
src_func += " print(\"**************************************************************************************\")\n" src_func += " print(\"**************************************************************************************\")\n" # noqa: E222
src_func += " print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n" src_func += " print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n" # noqa: E222
src_func += " print(\"** There are 2 possible reasons for this:\")\n" src_func += " print(\"** There are 2 possible reasons for this:\")\n" # noqa: E222
src_func += " print(\"** - the model has not been added to convert-hf-to-gguf-update.py yet\")\n" src_func += " print(\"** - the model has not been added to convert-hf-to-gguf-update.py yet\")\n" # noqa: E222
src_func += " print(\"** - the pre-tokenization config has changed upstream\")\n" src_func += " print(\"** - the pre-tokenization config has changed upstream\")\n" # noqa: E222
src_func += " print(\"** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n" src_func += " print(\"** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n" # noqa: E222
src_func += " print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n" src_func += " print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n" # noqa: E222
src_func += " print(\"**\")\n" src_func += " print(\"**\")\n" # noqa: E222
src_func += " print(f\"** chkhsh: {chkhsh}\")\n" src_func += " print(f\"** chkhsh: {chkhsh}\")\n" # noqa: E222
src_func += " print(\"**************************************************************************************\")\n" src_func += " print(\"**************************************************************************************\")\n" # noqa: E222
src_func += " print(\"\\n\")\n" src_func += " print(\"\\n\")\n" # noqa: E222
src_func += " raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n" src_func += " raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n" # noqa: E222
src_func += "\n" src_func += "\n" # noqa: E222
src_func += " print(f\"tokenizer.ggml.pre: {res}\")\n" src_func += " print(f\"tokenizer.ggml.pre: {res}\")\n" # noqa: E222
src_func += " print(f\"chkhsh: {chkhsh}\")\n" src_func += " print(f\"chkhsh: {chkhsh}\")\n" # noqa: E222
src_func += "\n" src_func += "\n" # noqa: E222
src_func += " return res\n" src_func += " return res\n" # noqa: E222
-print(src_func)
+print(src_func)  # noqa: NP100

-print("\n")
-print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
-print("\n")
+logger.info("\n")
+logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
+logger.info("\n")

 # generate tests for each tokenizer model
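Each line of the src_func builder above now carries # noqa: E222 so flake8 stops flagging the aligned assignments ("multiple spaces after operator"). Below is a sketch of an alternative that avoids per-line suppressions by rendering the generated get_vocab_base_pre() from a single f-string template; this is an assumed refactor, not what the commit does, and chktxt/src_ifs are the variables built earlier in this script:

# Assumed alternative: one triple-quoted template instead of many aligned
# `src_func +=` lines, so no per-line "# noqa: E222" markers are needed.
src_func = f'''
    def get_vocab_base_pre(self, tokenizer) -> str:
        # hash the tokenization of chktxt to identify the BPE pre-tokenizer
        chktxt = {chktxt!r}

        chktok = tokenizer.encode(chktxt)
        chkhsh = sha256(str(chktok).encode()).hexdigest()

        res = None
{src_ifs}
        if res is None:
            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

        return res
'''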
@@ -250,7 +257,6 @@ for model in models:
     tokt = model["tokt"]

     # create the tokenizer
-    from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

     with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
@@ -265,15 +271,15 @@ for model in models:
                 f.write(f" {r}")
             f.write("\n")

-    print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
+    logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")

 # generate commands for creating vocab files

-print("\nRun the following commands to generate the vocab files for testing:\n")
+logger.info("\nRun the following commands to generate the vocab files for testing:\n")

 for model in models:
     name = model["name"]
-    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
+    logger.info(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")

-print("\n")
+logger.info("\n")

convert-hf-to-gguf.py

@@ -276,8 +276,8 @@ class Model(ABC):
         chktok = tokenizer.encode(chktxt)
         chkhsh = sha256(str(chktok).encode()).hexdigest()

-        print(f"chktok: {chktok}")
-        print(f"chkhsh: {chkhsh}")
+        logger.debug(f"chktok: {chktok}")
+        logger.debug(f"chkhsh: {chkhsh}")

         res = None
@@ -310,22 +310,22 @@ class Model(ABC):
             res = "gpt-2"

         if res is None:
-            print("\n")
-            print("**************************************************************************************")
-            print("** WARNING: The BPE pre-tokenizer was not recognized!")
-            print("** There are 2 possible reasons for this:")
-            print("** - the model has not been added to convert-hf-to-gguf-update.py yet")
-            print("** - the pre-tokenization config has changed upstream")
-            print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
-            print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
-            print("**")
-            print(f"** chkhsh: {chkhsh}")
-            print("**************************************************************************************")
-            print("\n")
+            logger.warning("\n")
+            logger.warning("**************************************************************************************")
+            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
+            logger.warning("** There are 2 possible reasons for this:")
+            logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
+            logger.warning("** - the pre-tokenization config has changed upstream")
+            logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
+            logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
+            logger.warning("**")
+            logger.warning(f"** chkhsh: {chkhsh}")
+            logger.warning("**************************************************************************************")
+            logger.warning("\n")

             raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

-        print(f"tokenizer.ggml.pre: {res}")
-        print(f"chkhsh: {chkhsh}")
+        logger.debug(f"tokenizer.ggml.pre: {res}")
+        logger.debug(f"chkhsh: {chkhsh}")

         return res
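The levels chosen here are deliberate: per-model token dumps go to debug, while the unrecognized-pre-tokenizer banner goes to warning so it survives normal verbosity. A small sketch of the effect, assuming a basicConfig root setup and logger name (neither is shown in this diff):

import logging

logging.basicConfig(level=logging.INFO)           # assumed app-level setup
logger = logging.getLogger("convert-hf-to-gguf")  # name is an assumption

logger.debug("chktok: [1, 2, 3]")  # suppressed at INFO: diagnostic detail only
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")  # shown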

gguf-py/gguf/gguf_reader.py

@@ -4,6 +4,7 @@
 #
 from __future__ import annotations

+import logging
 import os
 from collections import OrderedDict
 from typing import Any, Literal, NamedTuple, TypeVar, Union
@@ -27,6 +28,7 @@ from gguf.constants import (
     GGUFValueType,
 )

+logger = logging.getLogger(__name__)

 READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION]
@@ -142,7 +144,7 @@ class GGUFReader:
             # TODO: add option to generate error on duplicate keys
             # raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')
-            print(f'Warning: Duplicate key {field.name} at offset {field.offset}')
+            logger.warning(f'Duplicate key {field.name} at offset {field.offset}')
             self.fields[field.name + '_{}'.format(field.offset)] = field
         else:
             self.fields[field.name] = field
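Unlike the scripts, the library module names its logger with __name__, so the logger inherits the module's import path and host applications can tune gguf's verbosity in isolation. A sketch, assuming the module's import path is gguf.gguf_reader:

import logging

# With logger = logging.getLogger(__name__) inside the module, the logger
# name resolves to the (assumed) import path "gguf.gguf_reader", so an
# application can silence just the reader's duplicate-key warnings:
logging.getLogger("gguf.gguf_reader").setLevel(logging.ERROR)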

tests/test-tokenizer-0-bpe.py

@@ -12,7 +12,7 @@ import argparse
 from transformers import AutoTokenizer

-logger = logging.getLogger("convert")
+logger = logging.getLogger("test-tokenizer-0-bpe")

 parser = argparse.ArgumentParser()
 parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")

tests/test-tokenizer-0-spm.py

@@ -12,7 +12,7 @@ import argparse
 from sentencepiece import SentencePieceProcessor

-logger = logging.getLogger("test-tokenizer-0-llama")
+logger = logging.getLogger("test-tokenizer-0-spm")

 parser = argparse.ArgumentParser()
 parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
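Both test scripts now log under names matching their filenames, which keeps their records distinguishable when run together. A sketch of the conventional pairing with argparse; the --verbose flag and the basicConfig call are assumptions, not part of this hunk:

import argparse
import logging

logger = logging.getLogger("test-tokenizer-0-spm")

parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args()

# Map the (assumed) flag onto a root logging level.
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
logger.info("tokenizer dir: %s", args.dir_tokenizer)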