convert-*.py: parse model card in metadata util. Add license_link and license_name to kv store

This commit is contained in:
brian khuu 2024-06-02 12:27:28 +10:00
parent 5c263cb257
commit b36e391b87
5 changed files with 105 additions and 33 deletions

View file

@ -57,7 +57,6 @@ class Model:
lazy: bool lazy: bool
part_names: list[str] part_names: list[str]
is_safetensors: bool is_safetensors: bool
model_card: dict[str, Any]
hparams: dict[str, Any] hparams: dict[str, Any]
block_count: int block_count: int
tensor_map: gguf.TensorNameMap tensor_map: gguf.TensorNameMap
@ -85,7 +84,6 @@ class Model:
self.is_safetensors = len(self.part_names) > 0 self.is_safetensors = len(self.part_names) > 0
if not self.is_safetensors: if not self.is_safetensors:
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin") self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
self.model_card = Model.load_model_card(dir_model)
self.hparams = Model.load_hparams(self.dir_model) self.hparams = Model.load_hparams(self.dir_model)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
@ -249,8 +247,8 @@ class Model:
self.gguf_writer.add_url(self.metadata.url) self.gguf_writer.add_url(self.metadata.url)
if self.metadata.description is not None: if self.metadata.description is not None:
self.gguf_writer.add_description(self.metadata.description) self.gguf_writer.add_description(self.metadata.description)
if self.metadata.licence is not None: if self.metadata.license is not None:
self.gguf_writer.add_licence(self.metadata.licence) self.gguf_writer.add_license(self.metadata.license)
if self.metadata.source_url is not None: if self.metadata.source_url is not None:
self.gguf_writer.add_source_url(self.metadata.source_url) self.gguf_writer.add_source_url(self.metadata.source_url)
if self.metadata.source_hf_repo is not None: if self.metadata.source_hf_repo is not None:
@ -439,11 +437,6 @@ class Model:
return part_names return part_names
@staticmethod
def load_model_card(dir_model: Path):
with open(dir_model / "README.md", "r", encoding="utf-8") as f:
return frontmatter.load(f)
@staticmethod @staticmethod
def load_hparams(dir_model: Path): def load_hparams(dir_model: Path):
with open(dir_model / "config.json", "r", encoding="utf-8") as f: with open(dir_model / "config.json", "r", encoding="utf-8") as f:
@ -3611,9 +3604,10 @@ def main() -> None:
else: else:
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
metadata = gguf.Metadata.load(args.metadata)
dir_model = args.model dir_model = args.model
metadata = gguf.Metadata.load(args.metadata, dir_model)
if not dir_model.is_dir(): if not dir_model.is_dir():
logger.error(f'Error: {args.model} is not a directory') logger.error(f'Error: {args.model} is not a directory')
sys.exit(1) sys.exit(1)

View file

@ -1260,10 +1260,12 @@ def main(args_in: list[str] | None = None) -> None:
else: else:
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
dir_model = args.model
metadata = gguf.Metadata.load(args.metadata) metadata = gguf.Metadata.load(args.metadata)
if args.get_outfile: if args.get_outfile:
model_plus = load_some_model(args.model) model_plus = load_some_model(dir_model)
params = Params.load(model_plus) params = Params.load(model_plus)
model = convert_model_names(model_plus.model, params, args.skip_unknown) model = convert_model_names(model_plus.model, params, args.skip_unknown)
model_params_count = per_model_weight_count_estimation(model_plus.model.items(), params.n_experts) model_params_count = per_model_weight_count_estimation(model_plus.model.items(), params.n_experts)
@ -1275,14 +1277,14 @@ def main(args_in: list[str] | None = None) -> None:
raise ValueError("--vocab-only does not make sense with --no-vocab") raise ValueError("--vocab-only does not make sense with --no-vocab")
if args.dump_single: if args.dump_single:
model_plus = lazy_load_file(args.model) model_plus = lazy_load_file(dir_model)
do_dump_model(model_plus) do_dump_model(model_plus)
return return
if not args.vocab_only: if not args.vocab_only:
model_plus = load_some_model(args.model) model_plus = load_some_model(dir_model)
else: else:
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) model_plus = ModelPlus(model = {}, paths = [dir_model / 'dummy'], format = 'none', vocab = None)
model_params_count = per_model_weight_count_estimation(model_plus.model.items(), params.n_experts) model_params_count = per_model_weight_count_estimation(model_plus.model.items(), params.n_experts)
logger.info(f"model parameters count : {model_params_count} ({gguf.model_weight_count_rounded_notation(model_params_count)})") logger.info(f"model parameters count : {model_params_count} ({gguf.model_weight_count_rounded_notation(model_params_count)})")
@ -1318,7 +1320,7 @@ def main(args_in: list[str] | None = None) -> None:
logger.info(f"params = {params}") logger.info(f"params = {params}")
model_parent_path = model_plus.paths[0].parent model_parent_path = model_plus.paths[0].parent
vocab_path = Path(args.vocab_dir or args.model or model_parent_path) vocab_path = Path(args.vocab_dir or dir_model or model_parent_path)
vocab_factory = VocabFactory(vocab_path) vocab_factory = VocabFactory(vocab_path)
vocab_types = None if args.no_vocab else args.vocab_type.split(",") vocab_types = None if args.no_vocab else args.vocab_type.split(",")
vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path) vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)

View file

@ -31,6 +31,8 @@ class Keys:
URL = "general.url" URL = "general.url"
DESCRIPTION = "general.description" DESCRIPTION = "general.description"
LICENSE = "general.license" LICENSE = "general.license"
LICENSE_NAME = "general.license.name"
LICENSE_LINK = "general.license.link"
SOURCE_URL = "general.source.url" SOURCE_URL = "general.source.url"
SOURCE_HF_REPO = "general.source.huggingface.repository" SOURCE_HF_REPO = "general.source.huggingface.repository"
FILE_TYPE = "general.file_type" FILE_TYPE = "general.file_type"

View file

@ -454,6 +454,12 @@ class GGUFWriter:
def add_license(self, license: str) -> None: def add_license(self, license: str) -> None:
self.add_string(Keys.General.LICENSE, license) self.add_string(Keys.General.LICENSE, license)
def add_license_name(self, license: str) -> None:
    """Write the license name as a string entry under ``Keys.General.LICENSE_NAME``.

    Args:
        license: The license name to store.
    """
    key = Keys.General.LICENSE_NAME
    self.add_string(key, license)
def add_license_link(self, license: str) -> None:
    """Write the license link as a string entry under ``Keys.General.LICENSE_LINK``.

    Args:
        license: The license link to store (the parameter keeps the name
            ``license`` for compatibility with existing keyword callers).
    """
    key = Keys.General.LICENSE_LINK
    self.add_string(key, license)
def add_source_url(self, url: str) -> None: def add_source_url(self, url: str) -> None:
self.add_string(Keys.General.SOURCE_URL, url) self.add_string(Keys.General.SOURCE_URL, url)

View file

@ -1,6 +1,7 @@
from __future__ import annotations from __future__ import annotations
import json import json
import frontmatter
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
@ -11,6 +12,7 @@ from .constants import Keys
@dataclass @dataclass
class Metadata: class Metadata:
# Authorship Metadata to be written to GGUF KV Store
name: Optional[str] = None name: Optional[str] = None
basename: Optional[str] = None basename: Optional[str] = None
finetune: Optional[str] = None finetune: Optional[str] = None
@ -18,32 +20,98 @@ class Metadata:
version: Optional[str] = None version: Optional[str] = None
url: Optional[str] = None url: Optional[str] = None
description: Optional[str] = None description: Optional[str] = None
licence: Optional[str] = None license: Optional[str] = None
license_name: Optional[str] = None
license_link: Optional[str] = None
source_url: Optional[str] = None source_url: Optional[str] = None
source_hf_repo: Optional[str] = None source_hf_repo: Optional[str] = None
@staticmethod
def load(metadata_override_path: Optional[Path] = None, model_path: Optional[Path] = None) -> Metadata:
    """Collect authorship metadata for the GGUF KV store.

    Values are gathered from the model directory and then overridden by the
    user-supplied metadata file, in this order of increasing priority:

    1. the model card (``README.md`` front matter) in ``model_path``,
    2. ``config.json`` in ``model_path`` (``_name_or_path``),
    3. the model directory name (fallback for the model name only),
    4. the JSON override file at ``metadata_override_path``.

    Args:
        metadata_override_path: Optional path to a JSON file whose keys are
            the ``Keys.General.*`` KV names; present keys replace whatever
            was detected from the model directory.
        model_path: Optional path to the model directory. Defaults to
            ``None`` so that callers which only pass the override path keep
            working.

    Returns:
        A populated ``Metadata`` instance; fields that could not be
        determined stay ``None``.
    """
    metadata = Metadata()

    # Model card front matter, if the model folder has one.
    # Reference Model Card Metadata: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
    model_card = Metadata.load_model_card(model_path)

    # Prefer the name from a single-entry "model-index"; read it from the
    # same key that is tested so a card without "model_name" cannot raise.
    model_index = model_card.get("model-index")
    if (
        isinstance(model_index, list)
        and len(model_index) == 1
        and isinstance(model_index[0], dict)
        and "name" in model_index[0]
    ):
        metadata.name = model_index[0].get("name")
    elif "model_name" in model_card:
        # Not part of the Hugging Face model card standard, but some model
        # creators use it.
        metadata.name = model_card.get("model_name")

    metadata.license = model_card.get("license")
    metadata.license_name = model_card.get("license_name")
    metadata.license_link = model_card.get("license_link")

    # Hugging Face parameters (config.json), if available.
    hf_params = Metadata.load_huggingface_parameters(model_path)
    hf_name_or_path = hf_params.get("_name_or_path")
    if hf_name_or_path is not None:
        hf_leaf_name = Path(hf_name_or_path).name
        if metadata.name is None:
            metadata.name = hf_leaf_name
        if metadata.source_hf_repo is None:
            metadata.source_hf_repo = hf_leaf_name

    # Use the directory name as the last-resort model name.
    if metadata.name is None and model_path is not None and model_path.exists():
        metadata.name = model_path.name

    # Metadata override: keys present in the override file win, everything
    # else keeps the value detected above.
    # This is based on LLM_KV_NAMES mapping in llama.cpp
    metadata_override = Metadata.load_metadata_override(metadata_override_path)
    metadata.name = metadata_override.get(Keys.General.NAME, metadata.name)
    metadata.basename = metadata_override.get(Keys.General.BASENAME, metadata.basename)
    metadata.finetune = metadata_override.get(Keys.General.FINETUNE, metadata.finetune)
    metadata.author = metadata_override.get(Keys.General.AUTHOR, metadata.author)
    metadata.version = metadata_override.get(Keys.General.VERSION, metadata.version)
    metadata.url = metadata_override.get(Keys.General.URL, metadata.url)
    metadata.description = metadata_override.get(Keys.General.DESCRIPTION, metadata.description)
    metadata.license = metadata_override.get(Keys.General.LICENSE, metadata.license)
    metadata.license_name = metadata_override.get(Keys.General.LICENSE_NAME, metadata.license_name)
    metadata.license_link = metadata_override.get(Keys.General.LICENSE_LINK, metadata.license_link)
    metadata.source_url = metadata_override.get(Keys.General.SOURCE_URL, metadata.source_url)
    metadata.source_hf_repo = metadata_override.get(Keys.General.SOURCE_HF_REPO, metadata.source_hf_repo)

    return metadata
@staticmethod
def load_metadata_override(metadata_override_path: Path):
if metadata_override_path is None or not metadata_override_path.exists():
return {}
with open(metadata_override_path, "r", encoding="utf-8") as f:
return json.load(f)
@staticmethod
def load_model_card(model_path: Path):
if model_path is None or not model_path.exists():
return {}
model_card_path = model_path / "README.md"
if not model_card_path.exists():
return {}
with open(model_card_path, "r", encoding="utf-8") as f:
return frontmatter.load(f)
@staticmethod
def load_huggingface_parameters(model_path: Path):
if model_path is None or not model_path.exists():
return {}
config_path = model_path / "config.json"
if not config_path.exists():
return {}
with open(config_path, "r", encoding="utf-8") as f:
return json.load(f)