metadata: Detailed Dataset Authorship Metadata (#8875)

The converter script can now read the following two fields as detailed base model and dataset sources.
This was done to make it easier for Hugging Face to integrate detailed metadata as needed.

 -  base_model_sources (List[dict], optional)
 -  dataset_sources (List[dict], optional)

Datasets are now represented as:

 - general.dataset.count
 - general.dataset.{id}.name
 - general.dataset.{id}.author
 - general.dataset.{id}.version
 - general.dataset.{id}.organization
 - general.dataset.{id}.description
 - general.dataset.{id}.url
 - general.dataset.{id}.doi
 - general.dataset.{id}.uuid
 - general.dataset.{id}.repo_url

This also adds the following metadata to the base model:

 - general.base_model.{id}.description
This commit is contained in:
Brian 2024-11-13 21:10:38 +11:00 committed by GitHub
parent 2e82ffa4af
commit a0ec17b32e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 233 additions and 25 deletions

View file

@ -840,6 +840,8 @@ class OutputFile:
self.gguf.add_base_model_version(key, base_model_entry["version"])
if "organization" in base_model_entry:
self.gguf.add_base_model_organization(key, base_model_entry["organization"])
if "description" in base_model_entry:
self.gguf.add_base_model_description(key, base_model_entry["description"])
if "url" in base_model_entry:
self.gguf.add_base_model_url(key, base_model_entry["url"])
if "doi" in base_model_entry:
@ -849,12 +851,32 @@ class OutputFile:
if "repo_url" in base_model_entry:
self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])
if metadata.datasets is not None:
self.gguf.add_dataset_count(len(metadata.datasets))
for key, dataset_entry in enumerate(metadata.datasets):
if "name" in dataset_entry:
self.gguf.add_dataset_name(key, dataset_entry["name"])
if "author" in dataset_entry:
self.gguf.add_dataset_author(key, dataset_entry["author"])
if "version" in dataset_entry:
self.gguf.add_dataset_version(key, dataset_entry["version"])
if "organization" in dataset_entry:
self.gguf.add_dataset_organization(key, dataset_entry["organization"])
if "description" in dataset_entry:
self.gguf.add_dataset_description(key, dataset_entry["description"])
if "url" in dataset_entry:
self.gguf.add_dataset_url(key, dataset_entry["url"])
if "doi" in dataset_entry:
self.gguf.add_dataset_doi(key, dataset_entry["doi"])
if "uuid" in dataset_entry:
self.gguf.add_dataset_uuid(key, dataset_entry["uuid"])
if "repo_url" in dataset_entry:
self.gguf.add_dataset_repo_url(key, dataset_entry["repo_url"])
if metadata.tags is not None:
self.gguf.add_tags(metadata.tags)
if metadata.languages is not None:
self.gguf.add_languages(metadata.languages)
if metadata.datasets is not None:
self.gguf.add_datasets(metadata.datasets)
def add_meta_arch(self, params: Params) -> None:
# Metadata About The Neural Architecture Itself