From fcf2da4621dc0f7079ca318a48ca961793ab9e4c Mon Sep 17 00:00:00 2001
From: Eddie-Wang1120
Date: Wed, 19 Jun 2024 21:48:04 +0800
Subject: [PATCH] add dequantize

---
 convert-hf-to-gguf.py     | 49 +++++++++++++--------------------------
 ggml-quants.c             | 20 ++++++++++++++++
 ggml-quants.h             |  1 +
 ggml.c                    |  1 +
 gguf-py/gguf/constants.py |  3 +++
 llama.cpp                 |  2 +-
 6 files changed, 42 insertions(+), 34 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 0b19e470c..224569907 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1420,40 +1420,23 @@ class BitnetModel(Model):
         return weight.type(dtype), scale.type(torch.float32)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # transform weight into 1/0/-1 (in fp32)
-        if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
-                          "down_proj.weight", "up_proj.weight", "gate_proj.weight",
-                          "o_proj.weight")):
+        new_name = self.map_tensor_name(name)
+
+        if any(self.match_model_tensor_name(new_name, key, bid) for key in [
+            gguf.MODEL_TENSOR.ATTN_Q,
+            gguf.MODEL_TENSOR.ATTN_K,
+            gguf.MODEL_TENSOR.ATTN_V,
+            gguf.MODEL_TENSOR.ATTN_OUT,
+            gguf.MODEL_TENSOR.FFN_UP,
+            gguf.MODEL_TENSOR.FFN_DOWN,
+            gguf.MODEL_TENSOR.FFN_GATE,
+        ]):
+            # transform weight into 1/0/-1 (in fp32)
             weight_torch, scale_torch = self.weight_quant(data_torch)
-
-        tensors: list[tuple[str, Tensor]] = []
-
-        if name.endswith("q_proj.weight"):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), weight_torch))
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid, suffix=".scale"), scale_torch))
-        elif name.endswith("k_proj.weight"):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), weight_torch))
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid, suffix=".scale"), scale_torch))
-        elif name.endswith("v_proj.weight"):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), weight_torch))
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid, suffix=".scale"), scale_torch))
-        elif name.endswith("o_proj.weight"):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), weight_torch))
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid, suffix=".scale"), scale_torch))
-        elif name.endswith("up_proj.weight"):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), weight_torch))
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid, suffix=".scale"), scale_torch))
-        elif name.endswith("down_proj.weight"):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), weight_torch))
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid, suffix=".scale"), scale_torch))
-        elif name.endswith("gate_proj.weight"):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), weight_torch))
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid, suffix=".scale"), scale_torch))
-
-        if len(tensors) == 0:
-            tensors.append((self.map_tensor_name(name), data_torch))
-
-        return tensors
+            yield (new_name, weight_torch)
+            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
+        else:
+            yield (new_name, data_torch)
 
 
 @Model.register("GrokForCausalLM")
diff --git a/ggml-quants.c b/ggml-quants.c
index a3c8c6731..a3633fc53 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1545,6 +1545,26 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
 #endif
 }
 
+void dequantize_row_q2_2(const block_q2_2 * restrict x, float * restrict y, int64_t k) {
+    static const int qk = QK2_2;
+
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+
+        for (int j = 0; j < qk/4; ++j) {
+            const int8_t * q = (const int8_t *) (q22_grid + x[i].qs[j]);
+
+            *y++ = (float) q[0];
+            *y++ = (float) q[1];
+            *y++ = (float) q[2];
+            *y++ = (float) q[3];
+        }
+    }
+}
+
 void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
     static const int qk = QK4_0;
 
diff --git a/ggml-quants.h b/ggml-quants.h
index e5ef8a8ca..e159cef5f 100644
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -55,6 +55,7 @@ void quantize_row_iq3_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
 void quantize_row_iq2_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 
 // Dequantization
+void dequantize_row_q2_2(const block_q2_2 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
diff --git a/ggml.c b/ggml.c
index d714171f7..55effd717 100644
--- a/ggml.c
+++ b/ggml.c
@@ -620,6 +620,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_name                = "q2_2",
         .blck_size                = QK2_2,
         .type_size                = sizeof(block_q2_2),
+        .to_float                 = (ggml_to_float_t) dequantize_row_q2_2,
         .is_quantized             = true,
         .from_float               = quantize_row_q2_2,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q2_2_reference,
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 1fc8fcde5..301200869 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -923,6 +923,7 @@ class GGMLQuantizationType(IntEnum):
     F64     = 28
     IQ1_M   = 29
     BF16    = 30
+    Q2_2    = 31
 
 
 # TODO: add GGMLFileType from ggml_ftype in ggml.h
@@ -964,6 +965,7 @@ class LlamaFileType(IntEnum):
     MOSTLY_IQ4_XS        = 30  # except 1d tensors
     MOSTLY_IQ1_M         = 31  # except 1d tensors
     MOSTLY_BF16          = 32  # except 1d tensors
+    MOSTLY_Q2_2          = 33  # except 1d tensors
 
     GUESSED              = 1024  # not specified in the model file
 
@@ -1010,6 +1012,7 @@ QK_K = 256
 GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
     GGMLQuantizationType.F32:     (1, 4),
     GGMLQuantizationType.F16:     (1, 2),
+    GGMLQuantizationType.Q2_2:    (32, 8),
     GGMLQuantizationType.Q4_0:    (32, 2 + 16),
     GGMLQuantizationType.Q4_1:    (32, 2 + 2 + 16),
     GGMLQuantizationType.Q5_0:    (32, 2 + 4 + 16),
diff --git a/llama.cpp b/llama.cpp
index c87dd9c3c..85182f4bb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3885,6 +3885,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:  return "F16";
         case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
+        case LLAMA_FTYPE_MOSTLY_Q2_2: return "Q2_2";
         case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
@@ -11705,7 +11706,6 @@ struct llm_build_context {
                 cb(cur, "ffn_gate", il);
-                cur = ggml_silu(ctx0, cur);
                 cb(cur, "ffn_silu", il);
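
Note (illustration only, not part of the patch): the new GGML_QUANT_SIZES entry maps Q2_2 to (32, 8), i.e. each block packs 32 ternary weights (1/0/-1) into 8 bytes, 2 bits per weight, which is what dequantize_row_q2_2 expands back to fp32 on the C side. The Python sketch below assumes the patched gguf-py is importable; the tensor size is made up for illustration.

    from gguf import GGML_QUANT_SIZES, GGMLQuantizationType

    # (elements per block, bytes per block) for Q2_2: 32 weights in 8 bytes = 2 bits/weight
    block_size, type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q2_2]

    n_elements = 4096 * 4096                # hypothetical tensor size, a multiple of block_size
    assert n_elements % block_size == 0
    n_bytes = (n_elements // block_size) * type_size
    print(block_size, type_size, n_bytes)   # 32 8 4194304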