From 7a70fcd85e785ad6ee7a8568b56366d3a6c4f9a0 Mon Sep 17 00:00:00 2001
From: brian khuu
Date: Thu, 1 Aug 2024 22:03:22 +1000
Subject: [PATCH] py: add more authorship metadata from model card

---
Review note: the two helper functions take the *name* of the Metadata
attribute and store the result with setattr(), so values found in the model
card actually end up on the metadata object. The post-image blob id on the
"index" line below predates this adjustment.

 gguf-py/gguf/metadata.py | 133 +++++++++++++++++++++------------------
 1 file changed, 72 insertions(+), 61 deletions(-)

diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py
index 15189f717..30e063051 100644
--- a/gguf-py/gguf/metadata.py
+++ b/gguf-py/gguf/metadata.py
@@ -284,20 +284,71 @@ class Metadata:
         ########################
         if model_card is not None:
 
-            if "model_name" in model_card and metadata.name is None:
-                # Not part of huggingface model card standard but notice some model creator using it
-                # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
-                metadata.name = model_card.get("model_name")
+            def use_model_card_metadata(metadata_key: str, model_card_key: str):
+                # Fill a scalar metadata field from the model card unless it is already set.
+                # The field is addressed by attribute name: assigning to a parameter would
+                # only rebind a local variable and leave the metadata object untouched.
+                if model_card_key in model_card and getattr(metadata, metadata_key) is None:
+                    setattr(metadata, metadata_key, model_card.get(model_card_key))
 
-            if "model_creator" in model_card and metadata.author is None:
-                # Not part of huggingface model card standard but notice some model creator using it
-                # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
-                metadata.author = model_card.get("model_creator")
+            def use_array_model_card_metadata(metadata_key: str, model_card_key: str):
+                # Note: Will append rather than replace if already exist
+                tags_value = model_card.get(model_card_key, None)
+                if tags_value is None:
+                    return
 
-            if "model_type" in model_card and metadata.basename is None:
-                # Not part of huggingface model card standard but notice some model creator using it
-                # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
-                metadata.basename = model_card.get("model_type")
+                current_value = getattr(metadata, metadata_key)
+                if current_value is None:
+                    current_value = []
+
+                if isinstance(tags_value, str):
+                    current_value.append(tags_value)
+                elif isinstance(tags_value, list):
+                    current_value.extend(tags_value)
+
+                # Store the list back so that a freshly created one is not lost
+                setattr(metadata, metadata_key, current_value)
+
+            # LLAMA.cpp's direct internal convention
+            # (Definitely not part of hugging face formal/informal standard)
+            #########################################
+            use_model_card_metadata("name", "name")
+            use_model_card_metadata("author", "author")
+            use_model_card_metadata("version", "version")
+            use_model_card_metadata("organization", "organization")
+            use_model_card_metadata("description", "description")
+            use_model_card_metadata("finetune", "finetune")
+            use_model_card_metadata("basename", "basename")
+            use_model_card_metadata("size_label", "size_label")
+            use_model_card_metadata("source_url", "url")
+            use_model_card_metadata("source_doi", "doi")
+            use_model_card_metadata("source_uuid", "uuid")
+            use_model_card_metadata("source_repo_url", "repo_url")
+
+            # LLAMA.cpp's huggingface style convention
+            # (Definitely not part of hugging face formal/informal standard... but with model_ appended to match their style)
+            ###########################################
+            use_model_card_metadata("name", "model_name")
+            use_model_card_metadata("author", "model_author")
+            use_model_card_metadata("version", "model_version")
+            use_model_card_metadata("organization", "model_organization")
+            use_model_card_metadata("description", "model_description")
+            use_model_card_metadata("finetune", "model_finetune")
+            use_model_card_metadata("basename", "model_basename")
+            use_model_card_metadata("size_label", "model_size_label")
+            use_model_card_metadata("source_url", "model_url")
+            use_model_card_metadata("source_doi", "model_doi")
+            use_model_card_metadata("source_uuid", "model_uuid")
+            use_model_card_metadata("source_repo_url", "model_repo_url")
+
+            # Hugging Face Direct Convention
+            #################################
+
+            # Not part of huggingface model card standard but notice some model creator using it
+            # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
+            use_model_card_metadata("name", "model_name")
+            use_model_card_metadata("author", "model_creator")
+            use_model_card_metadata("basename", "model_type")
 
             if "base_model" in model_card:
                 # This represents the parent models that this is based on
@@ -329,58 +380,18 @@ class Metadata:
                         base_model["repo_url"] = f"https://huggingface.co/{org_component}/{model_full_name_component}"
                     metadata.base_models.append(base_model)
 
-            if "license" in model_card and metadata.license is None:
-                metadata.license = model_card.get("license")
+            use_model_card_metadata("license", "license")
+            use_model_card_metadata("license_name", "license_name")
+            use_model_card_metadata("license_link", "license_link")
 
-            if "license_name" in model_card and metadata.license_name is None:
-                metadata.license_name = model_card.get("license_name")
+            use_array_model_card_metadata("tags", "tags")
+            use_array_model_card_metadata("tags", "pipeline_tag")
 
-            if "license_link" in model_card and metadata.license_link is None:
-                metadata.license_link = model_card.get("license_link")
+            use_array_model_card_metadata("languages", "languages")
+            use_array_model_card_metadata("languages", "language")
 
-            tags_value = model_card.get("tags", None)
-            if tags_value is not None:
-
-                if metadata.tags is None:
-                    metadata.tags = []
-
-                if isinstance(tags_value, str):
-                    metadata.tags.append(tags_value)
-                elif isinstance(tags_value, list):
-                    metadata.tags.extend(tags_value)
-
-            pipeline_tags_value = model_card.get("pipeline_tag", None)
-            if pipeline_tags_value is not None:
-
-                if metadata.tags is None:
-                    metadata.tags = []
-
-                if isinstance(pipeline_tags_value, str):
-                    metadata.tags.append(pipeline_tags_value)
-                elif isinstance(pipeline_tags_value, list):
-                    metadata.tags.extend(pipeline_tags_value)
-
-            language_value = model_card.get("languages", model_card.get("language", None))
-            if language_value is not None:
-
-                if metadata.languages is None:
-                    metadata.languages = []
-
-                if isinstance(language_value, str):
-                    metadata.languages.append(language_value)
-                elif isinstance(language_value, list):
-                    metadata.languages.extend(language_value)
-
-            dataset_value = model_card.get("datasets", model_card.get("dataset", None))
-            if dataset_value is not None:
-
-                if metadata.datasets is None:
-                    metadata.datasets = []
-
-                if isinstance(dataset_value, str):
-                    metadata.datasets.append(dataset_value)
-                elif isinstance(dataset_value, list):
-                    metadata.datasets.extend(dataset_value)
+            use_array_model_card_metadata("datasets", "datasets")
+            use_array_model_card_metadata("datasets", "dataset")
 
         # Hugging Face Parameter Heuristics
         ####################################