Add more information to GGUFReader and examples comments

2023-11-09 02:52:42 -07:00 · 2023-11-09 02:52:42 -07:00 · 8e250fe527
commit 8e250fe527
parent 2360aaadb4
3 changed files with 38 additions and 2 deletions
--- a/gguf-py/examples/dump_gguf.py
+++ b/gguf-py/examples/dump_gguf.py
@@ -8,6 +8,8 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
 from gguf import GGUFReader, GGUFValueType  # noqa: E402


+# For more information about what field.parts and field.data represent,
+# please see the comments in the modify_gguf.py example.
 def dump_gguf(filename: str) -> None:
    print(f'* Loading: {filename}')
    reader = GGUFReader(filename, 'r')
--- a/gguf-py/examples/modify_gguf.py
+++ b/gguf-py/examples/modify_gguf.py
@@ -8,12 +8,41 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
 from gguf import GGUFReader  # noqa: E402


+def minimal_example(filename: str) -> None:
+    """Set tokenizer.ggml.bos_token_id to 2 in a GGUF file opened in 'r+' mode."""
+    reader = GGUFReader(filename, 'r+')
+    # get_field() returns None for a missing key; fields[...] would raise KeyError.
+    field = reader.get_field('tokenizer.ggml.bos_token_id')
+    if field is None:
+        return
+    part_index = field.data[0]
+    field.parts[part_index][0] = 2  # Set tokenizer.ggml.bos_token_id to 2
+    #
+    # Why go through field.data? Because field.parts contains _every_ part of
+    # the GGUF field. For example, tokenizer.ggml.bos_token_id consists of:
+    #
+    #  Part index 0: Key length (27)
+    #  Part index 1: Key data ("tokenizer.ggml.bos_token_id")
+    #  Part index 2: Field type (4, the id for GGUFValueType.UINT32)
+    #  Part index 3: Field value
+    #
+    # Note also that each part is an NDArray slice, so even a part that is only
+    # a single value, like the key length, will be an NDArray of that value's type.
+    #
+    # The .data attribute in the Field is a list of relevant part indexes
+    # and doesn't contain internal GGUF details like the key length part.
+    # In this case, .data will be [3] - just the part index of the
+    # field value itself.
+
+
 def change_gguf(reader: GGUFReader, key: str, value: str) -> None:
    field = reader.get_field(key)
    if field is None:
        print(f'! Field {repr(key)} not found', file = sys.stderr)
        sys.exit(1)
-
+    # Note that field.types is a list of types. This is because the GGUF
+    # format supports arrays. For example, an array of UINT32 would
+    # look like [GGUFValueType.ARRAY, GGUFValueType.UINT32]
    handler = reader.gguf_scalar_to_np.get(field.types[0]) if field.types else None
    if handler is None:
        print(f'! Field {repr(key)} has unsupported type: {field.types}')
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -1,8 +1,12 @@
+#
+# GGUF file reading/modification support. For API usage information,
+# please see examples/modify_gguf.py and examples/dump_gguf.py
+#
 from __future__ import annotations

 import os
 from collections import OrderedDict
-from typing import Any, Dict, Literal, NamedTuple, TypeVar, Union
+from typing import Any, Literal, NamedTuple, TypeVar, Union

 import numpy as np
 import numpy.typing as npt
@@ -23,6 +27,7 @@ from gguf.constants import (
    GGUFValueType,
 )

+
 READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION]