gguf-py : fix some metadata name extraction edge cases (#8591)

* gguf-py : fix some metadata name extraction edge cases

* convert_lora : use the lora dir for the model card path

* gguf-py : more metadata edge cases fixes

Multiple finetune versions are now joined together,
and the removal of the basename annotation on trailing versions
is more robust.
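
For illustration, a minimal sketch of the new joining logic (the annotated
parts below are hand-written, mirroring the get_model_id_components change
further down):

    # e.g. "Mini-InternVL-Chat-2B-V1-5" ends with two version parts
    name_parts = ["Mini", "InternVL", "Chat", "2B", "V1", "5"]
    name_types = [{"basename"}, {"basename"}, {"finetune"}, {"size_label"}, {"version"}, {"version"}]

    # all non-basename version parts are joined, instead of keeping only the last one
    version = "-".join(v for v, t in zip(name_parts, name_types)
                       if "version" in t and "basename" not in t) or None
    print(version)  # V1-5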

* gguf-py : add more name metadata extraction tests

* convert_lora : fix default filename

The default filename was previously hardcoded.
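
A minimal sketch of the intent, with a hypothetical helper name (not the
actual convert_lora code):

    from pathlib import Path

    # hypothetical helper: build the default output path from the adapter
    # directory and the extracted model name instead of a fixed string
    def default_outfile(dir_lora: Path, model_name: str, ftype: str) -> Path:
        return dir_lora / f"{model_name}-{ftype}-LoRA.gguf"

    print(default_outfile(Path("loras/abliteration"), "Llama-3-8B", "F16"))
    # loras/abliteration/Llama-3-8B-F16-LoRA.gguf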

* convert_hf : Model.fname_out can no longer be None

* gguf-py : do not use title case for naming convention

Some models use acronyms in lowercase,
which can't be title-cased like other words,
so it's best to simply use the same case
as in the original model name.

Note that the size label still has an uppercased suffix
to make it distinguishable from the context size of a finetune.
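
For illustration, Python's str.title() lowercases every letter that follows
another letter, which mangles acronyms:

    >>> "Qwen1.5-MoE-A2.7B".title()   # 'MoE' becomes 'Moe'
    'Qwen1.5-Moe-A2.7B'
    >>> "gpt2".title()                # lowercase acronyms fare no better
    'Gpt2'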
Author: compilade, 2024-07-20 21:58:49 -04:00 (committed by GitHub)
commit 328884f421, parent c69c63039c
5 changed files with 112 additions and 44 deletions

gguf-py/gguf/metadata.py

@@ -54,6 +54,7 @@ class Metadata:
         model_card = Metadata.load_model_card(model_path)
         hf_params = Metadata.load_hf_parameters(model_path)
+        # TODO: load adapter_config.json when possible, it usually contains the base model of the LoRA adapter
 
         # heuristics
         metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params)
@@ -177,6 +178,12 @@ class Metadata:
         org_component = None
 
         name_parts: list[str] = model_full_name_component.split('-')
+
+        # Remove empty parts
+        for i in reversed(range(len(name_parts))):
+            if len(name_parts[i]) == 0:
+                del name_parts[i]
 
         name_types: list[
             set[Literal["basename", "size_label", "finetune", "version", "type"]]
         ] = [set() for _ in name_parts]
@@ -223,9 +230,19 @@ class Metadata:
                 name_parts[i] = part
             # Some easy to recognize finetune names
             elif i > 0 and re.fullmatch(r'chat|instruct|vision|lora', part, re.IGNORECASE):
-                name_types[i].add("finetune")
-                if part.lower() == "lora":
-                    name_parts[i] = "LoRA"
+                if total_params < 0 and part.lower() == "lora":
+                    # ignore redundant "lora" in the finetune part when the output is a lora adapter
+                    name_types[i].add("type")
+                else:
+                    name_types[i].add("finetune")
+
+        # Ignore word-based size labels when there is at least a number-based one present
+        # TODO: should word-based size labels always be removed instead?
+        if any(c.isdecimal() for n, t in zip(name_parts, name_types) if "size_label" in t for c in n):
+            for n, t in zip(name_parts, name_types):
+                if "size_label" in t:
+                    if all(c.isalpha() for c in n):
+                        t.remove("size_label")
 
         at_start = True
         # Find the basename through the annotated name
@@ -240,18 +257,18 @@ class Metadata:
         # Remove the basename annotation from trailing version
         for part, t in zip(reversed(name_parts), reversed(name_types)):
-            if "basename" in t:
-                if len(t) > 1:
-                    t.remove("basename")
+            if "basename" in t and len(t) > 1:
+                t.remove("basename")
             else:
                 break
 
         basename = "-".join(n for n, t in zip(name_parts, name_types) if "basename" in t) or None
-        size_label = "-".join(s for s, t in zip(name_parts, name_types) if "size_label" in t) or None
+        # Deduplicate size labels using order-preserving 'dict' ('set' seems to sort the keys)
+        size_label = "-".join(dict.fromkeys(s for s, t in zip(name_parts, name_types) if "size_label" in t).keys()) or None
         finetune = "-".join(f for f, t in zip(name_parts, name_types) if "finetune" in t) or None
         # TODO: should the basename version always be excluded?
-        # TODO: should multiple versions be joined together?
-        version = ([v for v, t, in zip(name_parts, name_types) if "version" in t and "basename" not in t] or [None])[-1]
+        # NOTE: multiple finetune versions are joined together
+        version = "-".join(v for v, t, in zip(name_parts, name_types) if "version" in t and "basename" not in t) or None
 
         if size_label is None and finetune is None and version is None:
             # Too ambiguous, output nothing
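
The dict.fromkeys call above is the standard order-preserving dedup: dict keys
keep insertion order (Python 3.7+), whereas set() does not guarantee it. For
the duplicated '7B' in the GreenNode test case below:

    parts = ["7B", "7B"]  # size-label parts remaining after word-based ones are filtered out
    size_label = "-".join(dict.fromkeys(parts)) or None
    print(size_label)  # 7B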

gguf-py/gguf/utility.py

@@ -50,15 +50,15 @@ def naming_convention(model_name: str | None, base_name: str | None, finetune_st
     # Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention
 
     if base_name is not None:
-        name = base_name.strip().title().replace(' ', '-').replace('/', '-')
+        name = base_name.strip().replace(' ', '-').replace('/', '-')
     elif model_name is not None:
-        name = model_name.strip().title().replace(' ', '-').replace('/', '-')
+        name = model_name.strip().replace(' ', '-').replace('/', '-')
     else:
         name = "ggml-model"
 
     parameters = f"-{size_label}" if size_label is not None else ""
 
-    finetune = f"-{finetune_string.strip().title().replace(' ', '-')}" if finetune_string is not None else ""
+    finetune = f"-{finetune_string.strip().replace(' ', '-')}" if finetune_string is not None else ""
 
     version = f"-{version_string.strip().replace(' ', '-')}" if version_string is not None else ""
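
With .title() gone, casing passes through unchanged; for example (hypothetical
input):

    base_name = "miniCPM 2B"
    name = base_name.strip().replace(' ', '-').replace('/', '-')
    print(name)  # miniCPM-2B (title-casing would have produced 'Minicpm-2B')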

gguf-py/tests/test_metadata.py

@@ -54,7 +54,7 @@ class TestMetadataMethod(unittest.TestCase):
         self.assertEqual(gguf.Metadata.get_model_id_components("NousResearch/Meta-Llama-3-8B"),
                          ('Meta-Llama-3-8B', "NousResearch", 'Meta-Llama-3', None, None, '8B'))
 
-        # Can't detect all non standard form in a heuristically safe way... best to err in caution and output nothing...
+        # Non standard naming
         self.assertEqual(gguf.Metadata.get_model_id_components("Qwen1.5-MoE-A2.7B-Chat"),
                          ('Qwen1.5-MoE-A2.7B-Chat', None, 'Qwen1.5-MoE', 'Chat', None, 'A2.7B'))
@@ -71,7 +71,7 @@ class TestMetadataMethod(unittest.TestCase):
         self.assertEqual(gguf.Metadata.get_model_id_components("delphi-suite/stories-llama2-50k", 50 * 10**3),
                          ('stories-llama2-50k', 'delphi-suite', 'stories-llama2', None, None, '50K'))
 
-        # None standard and not easy to disambiguate
+        # Non standard and not easy to disambiguate
         self.assertEqual(gguf.Metadata.get_model_id_components("DeepSeek-Coder-V2-Lite-Instruct"),
                          ('DeepSeek-Coder-V2-Lite-Instruct', None, 'DeepSeek-Coder-V2-Lite', 'Instruct', None, None))
@@ -123,6 +123,51 @@ class TestMetadataMethod(unittest.TestCase):
         self.assertEqual(gguf.Metadata.get_model_id_components("bigscience/bloom-7b1-petals"),
                          ('bloom-7b1-petals', 'bigscience', 'bloom', 'petals', None, '7.1B'))
 
+        # Ignore full-text size labels when there are number-based ones, and deduplicate size labels
+        self.assertEqual(gguf.Metadata.get_model_id_components("MaziyarPanahi/GreenNode-mini-7B-multilingual-v1olet-Mistral-7B-Instruct-v0.1"),
+                         ('GreenNode-mini-7B-multilingual-v1olet-Mistral-7B-Instruct-v0.1', 'MaziyarPanahi', 'GreenNode-mini', 'multilingual-v1olet-Mistral-Instruct', 'v0.1', '7B'))
+
+        # Instruct in a name without a size label
+        self.assertEqual(gguf.Metadata.get_model_id_components("mistralai/Mistral-Nemo-Instruct-2407"),
+                         ('Mistral-Nemo-Instruct-2407', 'mistralai', 'Mistral-Nemo', 'Instruct', '2407', None))
+
+        # Non-obvious splitting relying on 'chat' keyword
+        self.assertEqual(gguf.Metadata.get_model_id_components("deepseek-ai/DeepSeek-V2-Chat-0628"),
+                         ('DeepSeek-V2-Chat-0628', 'deepseek-ai', 'DeepSeek-V2', 'Chat', '0628', None))
+
+        # Multiple versions
+        self.assertEqual(gguf.Metadata.get_model_id_components("OpenGVLab/Mini-InternVL-Chat-2B-V1-5"),
+                         ('Mini-InternVL-Chat-2B-V1-5', 'OpenGVLab', 'Mini-InternVL', 'Chat', 'V1-5', '2B'))
+
+        # TODO: DPO in the name
+        self.assertEqual(gguf.Metadata.get_model_id_components("jondurbin/bagel-dpo-2.8b-v0.2"),
+                         ('bagel-dpo-2.8b-v0.2', 'jondurbin', 'bagel-dpo', None, 'v0.2', '2.8B'))
+
+        # DPO in name, but can't be used for the finetune to keep 'LLaMA-3' in the basename
+        self.assertEqual(gguf.Metadata.get_model_id_components("voxmenthe/SFR-Iterative-DPO-LLaMA-3-8B-R-unquantized"),
+                         ('SFR-Iterative-DPO-LLaMA-3-8B-R-unquantized', 'voxmenthe', 'SFR-Iterative-DPO-LLaMA-3', 'R-unquantized', None, '8B'))
+
+        # Too ambiguous
+        # TODO: should "base" be a 'finetune' or 'size_label'?
+        # (in this case it should be a size label, but other models use it to signal that they are not finetuned)
+        self.assertEqual(gguf.Metadata.get_model_id_components("microsoft/Florence-2-base"),
+                         ('Florence-2-base', 'microsoft', None, None, None, None))
+
+        ## Invalid cases ##
+
+        # Starts with a dash and has dashes in a row
+        self.assertEqual(gguf.Metadata.get_model_id_components("mistralai/-Mistral--Nemo-Base-2407-"),
+                         ('-Mistral--Nemo-Base-2407-', 'mistralai', 'Mistral-Nemo-Base', None, '2407', None))
+
+        ## LoRA ##
+
+        self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B"),
+                         ('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration-LoRA', None, '8B'))
+
+        # Negative size --> output is a LoRA adapter --> prune "LoRA" out of the name to avoid redundancy with the suffix
+        self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B", -1234),
+                         ('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration', None, '8B'))
+
     def test_apply_metadata_heuristic_from_model_card(self):
         model_card = {
             'tags': ['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl'],
@@ -134,7 +179,7 @@ class TestMetadataMethod(unittest.TestCase):
         }
         got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
         expect = gguf.Metadata()
-        expect.base_models=[{'name': 'Mistral 7B Merge 14 v0', 'organization': 'EmbeddedLLM', 'version': 'v0', 'repo_url': 'https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0'}, {'name': 'Trinity v1', 'organization': 'Janai Hq', 'version': 'v1', 'repo_url': 'https://huggingface.co/janai-hq/trinity-v1'}]
+        expect.base_models=[{'name': 'Mistral 7B Merge 14 v0', 'organization': 'EmbeddedLLM', 'version': '14-v0', 'repo_url': 'https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0'}, {'name': 'Trinity v1', 'organization': 'Janai Hq', 'version': 'v1', 'repo_url': 'https://huggingface.co/janai-hq/trinity-v1'}]
         expect.tags=['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl']
         expect.languages=['en']
         expect.datasets=['teknium/OpenHermes-2.5']
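
For reference, the tuple these tests compare against is
(model_full_name_component, org_component, basename, finetune, version,
size_label); e.g.:

    import gguf

    print(gguf.Metadata.get_model_id_components("mistralai/Mistral-Nemo-Instruct-2407"))
    # ('Mistral-Nemo-Instruct-2407', 'mistralai', 'Mistral-Nemo', 'Instruct', '2407', None)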