metadata: Detailed Dataset Authorship Metadata (#8875)
The converter script can now read the following two fields as detailed base model and dataset sources. This was done to make it easier for Hugging Face to integrate detailed metadata as needed.
- base_model_sources (List[dict], optional)
- dataset_sources (List[dict], optional)

Datasets are now represented as:
- general.dataset.count
- general.dataset.{id}.name
- general.dataset.{id}.author
- general.dataset.{id}.version
- general.dataset.{id}.organization
- general.dataset.{id}.description
- general.dataset.{id}.url
- general.dataset.{id}.doi
- general.dataset.{id}.uuid
- general.dataset.{id}.repo_url

This also adds the following metadata to the base model:
- general.base_model.{id}.description
This commit is contained in:
parent
2e82ffa4af
commit
a0ec17b32e
5 changed files with 233 additions and 25 deletions
|
@ -568,6 +568,9 @@ class GGUFWriter:
|
|||
def add_base_model_organization(self, source_id: int, organization: str) -> None:
    """Record the organization string for the base model entry ``source_id``.

    The key is ``Keys.General.BASE_MODEL_ORGANIZATION`` formatted with
    ``id=source_id``; the value is handed to ``self.add_string``.
    """
    key = Keys.General.BASE_MODEL_ORGANIZATION.format(id=source_id)
    self.add_string(key, organization)
|
||||
|
||||
def add_base_model_description(self, source_id: int, description: str) -> None:
    """Record the description string for the base model entry ``source_id``.

    The key is ``Keys.General.BASE_MODEL_DESCRIPTION`` formatted with
    ``id=source_id``; the value is handed to ``self.add_string``.
    """
    key = Keys.General.BASE_MODEL_DESCRIPTION.format(id=source_id)
    self.add_string(key, description)
|
||||
|
||||
def add_base_model_url(self, source_id: int, url: str) -> None:
    """Record the URL string for the base model entry ``source_id``.

    The key is ``Keys.General.BASE_MODEL_URL`` formatted with
    ``id=source_id``; the value is handed to ``self.add_string``.
    """
    key = Keys.General.BASE_MODEL_URL.format(id=source_id)
    self.add_string(key, url)
|
||||
|
||||
|
@ -580,15 +583,42 @@ class GGUFWriter:
|
|||
def add_base_model_repo_url(self, source_id: int, repo_url: str) -> None:
    """Record the repository URL string for the base model entry ``source_id``.

    The key is ``Keys.General.BASE_MODEL_REPO_URL`` formatted with
    ``id=source_id``; the value is handed to ``self.add_string``.
    """
    key = Keys.General.BASE_MODEL_REPO_URL.format(id=source_id)
    self.add_string(key, repo_url)
|
||||
|
||||
def add_dataset_count(self, source_count: int) -> None:
    """Record ``source_count`` under ``Keys.General.DATASET_COUNT``.

    The value is passed to ``self.add_uint32``.
    """
    key = Keys.General.DATASET_COUNT
    self.add_uint32(key, source_count)
|
||||
|
||||
def add_dataset_name(self, source_id: int, name: str) -> None:
    """Record the name string for the dataset entry ``source_id``.

    The key is ``Keys.General.DATASET_NAME`` formatted with
    ``id=source_id``; the value is handed to ``self.add_string``.
    """
    key = Keys.General.DATASET_NAME.format(id=source_id)
    self.add_string(key, name)
|
||||
|
||||
def add_dataset_author(self, source_id: int, author: str) -> None:
    """Record the author string for the dataset entry ``source_id``.

    The key is ``Keys.General.DATASET_AUTHOR`` formatted with
    ``id=source_id``; the value is handed to ``self.add_string``.
    """
    key = Keys.General.DATASET_AUTHOR.format(id=source_id)
    self.add_string(key, author)
|
||||
|
||||
def add_dataset_version(self, source_id: int, version: str) -> None:
    """Record the version string for the dataset entry ``source_id``.

    The key is ``Keys.General.DATASET_VERSION`` formatted with
    ``id=source_id``; the value is handed to ``self.add_string``.
    """
    key = Keys.General.DATASET_VERSION.format(id=source_id)
    self.add_string(key, version)
|
||||
|
||||
def add_dataset_organization(self, source_id: int, organization: str) -> None:
    """Record the organization string for the dataset entry ``source_id``.

    The key is ``Keys.General.DATASET_ORGANIZATION`` formatted with
    ``id=source_id``; the value is handed to ``self.add_string``.
    """
    key = Keys.General.DATASET_ORGANIZATION.format(id=source_id)
    self.add_string(key, organization)
|
||||
|
||||
def add_dataset_description(self, source_id: int, description: str) -> None:
    """Record the description string for the dataset entry ``source_id``.

    The key is ``Keys.General.DATASET_DESCRIPTION`` formatted with
    ``id=source_id``; the value is handed to ``self.add_string``.
    """
    key = Keys.General.DATASET_DESCRIPTION.format(id=source_id)
    self.add_string(key, description)
|
||||
|
||||
def add_dataset_url(self, source_id: int, url: str) -> None:
    """Record the URL string for the dataset entry ``source_id``.

    The key is ``Keys.General.DATASET_URL`` formatted with
    ``id=source_id``; the value is handed to ``self.add_string``.
    """
    key = Keys.General.DATASET_URL.format(id=source_id)
    self.add_string(key, url)
|
||||
|
||||
def add_dataset_doi(self, source_id: int, doi: str) -> None:
    """Record the DOI string for the dataset entry ``source_id``.

    The key is ``Keys.General.DATASET_DOI`` formatted with
    ``id=source_id``; the value is handed to ``self.add_string``.
    """
    key = Keys.General.DATASET_DOI.format(id=source_id)
    self.add_string(key, doi)
|
||||
|
||||
def add_dataset_uuid(self, source_id: int, uuid: str) -> None:
    """Record the UUID string for the dataset entry ``source_id``.

    The key is ``Keys.General.DATASET_UUID`` formatted with
    ``id=source_id``; the value is handed to ``self.add_string``.
    """
    key = Keys.General.DATASET_UUID.format(id=source_id)
    self.add_string(key, uuid)
|
||||
|
||||
def add_dataset_repo_url(self, source_id: int, repo_url: str) -> None:
    """Record the repository URL string for the dataset entry ``source_id``.

    The key is ``Keys.General.DATASET_REPO_URL`` formatted with
    ``id=source_id``; the value is handed to ``self.add_string``.
    """
    key = Keys.General.DATASET_REPO_URL.format(id=source_id)
    self.add_string(key, repo_url)
|
||||
|
||||
def add_tags(self, tags: Sequence[str]) -> None:
    """Record ``tags`` under ``Keys.General.TAGS`` via ``self.add_array``."""
    key = Keys.General.TAGS
    self.add_array(key, tags)
|
||||
|
||||
def add_languages(self, languages: Sequence[str]) -> None:
    """Record ``languages`` under ``Keys.General.LANGUAGES`` via ``self.add_array``."""
    key = Keys.General.LANGUAGES
    self.add_array(key, languages)
|
||||
|
||||
def add_datasets(self, datasets: Sequence[str]) -> None:
    """Record ``datasets`` under ``Keys.General.DATASETS`` via ``self.add_array``."""
    key = Keys.General.DATASETS
    self.add_array(key, datasets)
|
||||
|
||||
def add_tensor_data_layout(self, layout: str) -> None:
    """Record the tensor data layout string for this writer's architecture.

    The key is ``Keys.LLM.TENSOR_DATA_LAYOUT`` formatted with
    ``arch=self.arch``; the value is handed to ``self.add_string``.
    """
    key = Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch)
    self.add_string(key, layout)
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue