Merge branch 'master' into compilade/faster-lazy-safetensors

2024-07-15 15:24:25 -04:00 · 2024-07-15 15:24:25 -04:00 · 2a49a68d70
commit 2a49a68d70
parent 7cda4dd7e9 97bdd26eee
25 changed files with 1531 additions and 720 deletions
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -19,6 +19,7 @@ GGML_QUANT_VERSION     = 2  # GGML_QNT_VERSION from ggml.h

 class Keys:
    class General:
+        TYPE                 = "general.type"
        ARCHITECTURE         = "general.architecture"
        QUANTIZATION_VERSION = "general.quantization_version"
        ALIGNMENT            = "general.alignment"
@@ -120,11 +121,20 @@ class Keys:
        MIDDLE_ID            = "tokenizer.ggml.middle_token_id"
        EOT_ID               = "tokenizer.ggml.eot_token_id"

+    class Adapter:
+        TYPE       = "adapter.type"
+        LORA_ALPHA = "adapter.lora.alpha"
+
 #
 # recommended mapping of model tensor names for storage in gguf
 #


+class GGUFType:
+    MODEL   = "model"
+    ADAPTER = "adapter"
+
+
 class MODEL_ARCH(IntEnum):
    LLAMA        = auto()
    FALCON       = auto()
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -424,6 +424,9 @@ class GGUFWriter:
                fout.close()
            self.fout = None

+    def add_type(self, type_name: str) -> None:
+        self.add_string(Keys.General.TYPE, type_name)
+
    def add_architecture(self) -> None:
        self.add_string(Keys.General.ARCHITECTURE, self.arch)

--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -43,7 +43,7 @@ def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.
        osize *= dim
    out = np.empty(shape=osize, dtype=otype)
    # compute over groups of 16 rows (arbitrary, but seems good for performance)
-    n_groups = rows.shape[0] // 16
+    n_groups = (rows.shape[0] // 16) or 1
    np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)
    return out.reshape(oshape)