From fcf2da4621dc0f7079ca318a48ca961793ab9e4c Mon Sep 17 00:00:00 2001
From: Eddie-Wang1120
Date: Wed, 19 Jun 2024 21:48:04 +0800
Subject: [PATCH] add dequantize

---
 convert-hf-to-gguf.py     | 49 +++++++++++++--------------------------
 ggml-quants.c             | 20 ++++++++++++++++
 ggml-quants.h             |  1 +
 ggml.c                    |  1 +
 gguf-py/gguf/constants.py |  3 +++
 llama.cpp                 |  2 +-
 6 files changed, 42 insertions(+), 34 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 0b19e470c..224569907 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1420,40 +1420,23 @@ class BitnetModel(Model):
         return weight.type(dtype), scale.type(torch.float32)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # transform weight into 1/0/-1 (in fp32)
-        if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
-                          "down_proj.weight", "up_proj.weight", "gate_proj.weight",
-                          "o_proj.weight")):
+        new_name = self.map_tensor_name(name)
+
+        if any(self.match_model_tensor_name(new_name, key, bid) for key in [
+            gguf.MODEL_TENSOR.ATTN_Q,
+            gguf.MODEL_TENSOR.ATTN_K,
+            gguf.MODEL_TENSOR.ATTN_V,
+            gguf.MODEL_TENSOR.ATTN_OUT,
+            gguf.MODEL_TENSOR.FFN_UP,
+            gguf.MODEL_TENSOR.FFN_DOWN,
+            gguf.MODEL_TENSOR.FFN_GATE,
+        ]):
+            # transform weight into 1/0/-1 (in fp32)
             weight_torch, scale_torch = self.weight_quant(data_torch)
-
-        tensors: list[tuple[str, Tensor]] = []
-
-        if name.endswith("q_proj.weight"):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), weight_torch))
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid, suffix=".scale"), scale_torch))
-        elif name.endswith("k_proj.weight"):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), weight_torch))
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid, suffix=".scale"), scale_torch))
-        elif name.endswith("v_proj.weight"):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), weight_torch))
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid, suffix=".scale"), scale_torch))
-        elif name.endswith("o_proj.weight"):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), weight_torch))
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid, suffix=".scale"), scale_torch))
-        elif name.endswith("up_proj.weight"):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), weight_torch))
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid, suffix=".scale"), scale_torch))
-        elif name.endswith("down_proj.weight"):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), weight_torch))
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid, suffix=".scale"), scale_torch))
-        elif name.endswith("gate_proj.weight"):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), weight_torch))
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid, suffix=".scale"), scale_torch))
-
-        if len(tensors) == 0:
-            tensors.append((self.map_tensor_name(name), data_torch))
-
-        return tensors
+            yield (new_name, weight_torch)
+            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
+        else:
+            yield (new_name, data_torch)
 
 
 @Model.register("GrokForCausalLM")
diff --git a/ggml-quants.c b/ggml-quants.c
index a3c8c6731..a3633fc53 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1545,6 +1545,26 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
 #endif
 }
 
+void dequantize_row_q2_2(const block_q2_2 * restrict x, float * restrict y, int64_t k) {
+    static const int qk = QK2_2;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+
+        for (int j = 0; j < qk/4; ++j) {
+            const int8_t * q = (const int8_t *) (q22_grid + x[i].qs[j]);
+
+            *y++ = (float) q[0];
+            *y++ = (float) q[1];
+            *y++ = (float) q[2];
+            *y++ = (float) q[3];
+        }
+    }
+}
+
 void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
     static const int qk = QK4_0;
 
diff --git a/ggml-quants.h b/ggml-quants.h
index e5ef8a8ca..e159cef5f 100644
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -55,6 +55,7 @@ void quantize_row_iq3_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
 void quantize_row_iq2_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 
 // Dequantization
+void dequantize_row_q2_2(const block_q2_2 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
diff --git a/ggml.c b/ggml.c
index d714171f7..55effd717 100644
--- a/ggml.c
+++ b/ggml.c
@@ -620,6 +620,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_name                = "q2_2",
         .blck_size                = QK2_2,
         .type_size                = sizeof(block_q2_2),
+        .to_float                 = (ggml_to_float_t) dequantize_row_q2_2,
         .is_quantized             = true,
         .from_float               = quantize_row_q2_2,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q2_2_reference,
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 1fc8fcde5..301200869 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -923,6 +923,7 @@ class GGMLQuantizationType(IntEnum):
     F64     = 28
     IQ1_M   = 29
     BF16    = 30
+    Q2_2    = 31
 
 
 # TODO: add GGMLFileType from ggml_ftype in ggml.h
@@ -964,6 +965,7 @@ class LlamaFileType(IntEnum):
     MOSTLY_IQ4_XS        = 30  # except 1d tensors
     MOSTLY_IQ1_M         = 31  # except 1d tensors
     MOSTLY_BF16          = 32  # except 1d tensors
+    MOSTLY_Q2_2          = 33  # except 1d tensors
 
     GUESSED              = 1024  # not specified in the model file
 
@@ -1010,6 +1012,7 @@ QK_K = 256
 GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
     GGMLQuantizationType.F32:     (1, 4),
     GGMLQuantizationType.F16:     (1, 2),
+    GGMLQuantizationType.Q2_2:    (32, 8),
     GGMLQuantizationType.Q4_0:    (32, 2 + 16),
     GGMLQuantizationType.Q4_1:    (32, 2 + 2 + 16),
     GGMLQuantizationType.Q5_0:    (32, 2 + 4 + 16),
diff --git a/llama.cpp b/llama.cpp
index c87dd9c3c..85182f4bb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3885,6 +3885,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:  return "F16";
         case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
+        case LLAMA_FTYPE_MOSTLY_Q2_2: return "Q2_2";
         case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
@@ -11705,7 +11706,6 @@ struct llm_build_context {
                 cb(cur, "ffn_gate", il);
-                cur = ggml_silu(ctx0, cur);
                 cb(cur, "ffn_silu", il);
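
Note (illustration only, not part of the patch): the new GGML_QUANT_SIZES entry maps Q2_2 to (32, 8), i.e. each block packs 32 ternary weights (1/0/-1) into 8 bytes, 2 bits per weight, which is what dequantize_row_q2_2 expands back to fp32 on the C side. The Python sketch below assumes the patched gguf-py is importable; the tensor size is made up for illustration.

    from gguf import GGML_QUANT_SIZES, GGMLQuantizationType

    # (elements per block, bytes per block) for Q2_2: 32 weights in 8 bytes = 2 bits/weight
    block_size, type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q2_2]

    n_elements = 4096 * 4096                # hypothetical tensor size, a multiple of block_size
    assert n_elements % block_size == 0
    n_bytes = (n_elements // block_size) * type_size
    print(block_size, type_size, n_bytes)   # 32 8 4194304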