[ggml] add iq4_nl_4x4 format
parent 561b7f2364
commit 75e8cbb2ab
10 changed files with 190 additions and 0 deletions
@@ -51,6 +51,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q4_0_4_4",   LLAMA_FTYPE_MOSTLY_Q4_0_4_4,   " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
     { "Q4_0_4_8",   LLAMA_FTYPE_MOSTLY_Q4_0_4_8,   " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
     { "Q4_0_8_8",   LLAMA_FTYPE_MOSTLY_Q4_0_8_8,   " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
+    { "IQ4_NL_4_4", LLAMA_FTYPE_MOSTLY_IQ4_NL_4_4, " 4.50 bpw non-linear quantization", },
     { "F16",        LLAMA_FTYPE_MOSTLY_F16,        "14.00G, +0.0020 ppl @ Mistral-7B",  },
     { "BF16",       LLAMA_FTYPE_MOSTLY_BF16,       "14.00G, -0.0050 ppl @ Mistral-7B",  },
     { "F32",        LLAMA_FTYPE_ALL_F32,           "26.00G @ 7B",                       },
@@ -389,6 +389,7 @@ extern "C" {
         GGML_TYPE_Q4_0_8_8   = 33,
         GGML_TYPE_TQ1_0      = 34,
         GGML_TYPE_TQ2_0      = 35,
+        GGML_TYPE_IQ4_NL_4_4 = 36,
         GGML_TYPE_COUNT,
     };
@@ -187,6 +187,8 @@ static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y)
 }
 #endif
 
+static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
 static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
     block_q4_0x4 out;
@@ -584,6 +586,42 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
     return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0));
 }
 
+static size_t quantize_iq4_nl_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) {
+    assert(n_per_row % QK4_NL == 0);
+    const int nb = n_per_row / QK4_NL;
+
+    void * out_ptr = NULL;
+    if (nrows_interleaved == 8) {
+        out_ptr = (block_q4_0x8 *) dst;
+    }
+    else if (nrows_interleaved == 4) {
+        out_ptr = (block_q4_0x4 *) dst;
+    }
+    assert(nrows_interleaved <= 8);
+    block_q4_0 dst_tmp[8];
+
+    for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) {
+
+        for (int64_t x = 0; x < nb; x++) {
+
+            for (int i = 0; i < nrows_interleaved; i++ ) {
+                quantize_row_iq4_nl_ref(src + b + i * n_per_row + x * QK4_NL, (block_iq4_nl *) dst_tmp + i, QK4_NL);
+            }
+
+            if (nrows_interleaved == 8) {
+                *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave, 0x00);
+                out_ptr = (block_q4_0x8 *) out_ptr + 1;
+            }
+            else if (nrows_interleaved == 4) {
+                *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave, 0x00);
+                out_ptr = (block_q4_0x4 *) out_ptr + 1;
+            }
+        }
+    }
+
+    return ((nrow * n_per_row) / QK4_NL * sizeof(block_iq4_nl));
+}
+
 size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     UNUSED(quant_weights);
     return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
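For reference, the interleaved layout this repacking produces can be read back with a plain scalar loop. The sketch below is inferred from the qs indexing used by the GEMV/GEMM kernels later in this patch; the helper name unpack_iq4_nlx4_ref is illustrative and not part of the commit:

// Illustration only: split one interleaved block_iq4_nlx4 back into the
// 4 source rows it was built from. qs is stored in groups of 4 bytes that
// cycle over the 4 interleaved rows; each byte holds two 4-bit indices into
// kvalues_iq4nl (low nibble = first half of the block, high nibble = second).
static void unpack_iq4_nlx4_ref(const block_iq4_nlx4 * in, float out[4][QK4_NL]) {
    const int blocklen = 4;
    for (int k = 0; k < QK4_NL / (2 * blocklen); k++) {     // 4 groups of 16 bytes
        for (int j = 0; j < 4; j++) {                       // interleaved row
            const float d = GGML_FP16_TO_FP32(in->d[j]);
            for (int i = 0; i < blocklen; i++) {
                const uint8_t b = in->qs[k * 4 * blocklen + j * blocklen + i];
                out[j][k * blocklen + i]              = d * kvalues_iq4nl[b & 0x0F];
                out[j][k * blocklen + i + QK4_NL / 2] = d * kvalues_iq4nl[b >> 4];
            }
        }
    }
}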
@@ -599,6 +637,11 @@ size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_
     return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
 }
 
+size_t quantize_iq4_nl_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    UNUSED(quant_weights);
+    return quantize_iq4_nl_nr_bl(src, dst, nrow, n_per_row, 4, 4);
+}
+
 void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
     const int qk = QK8_0;
    const int nb = n / qk;
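A hedged usage sketch of the new entry point (the wrapper function, buffer, and dimensions below are made up for illustration; it only assumes the signature added above, nrow being a multiple of 4, and n_per_row a multiple of QK4_NL):

// Illustration only: quantize-and-repack a small row-major f32 matrix.
// The destination size mirrors the value quantize_iq4_nl_nr_bl returns:
// (nrow * n_per_row) / QK4_NL * sizeof(block_iq4_nl).
static void example_quantize_iq4_nl_4x4(const float * src /* NROW * N_PER_ROW weights */) {
    enum { NROW = 8, N_PER_ROW = 64 };
    uint8_t dst[(NROW * N_PER_ROW) / QK4_NL * sizeof(block_iq4_nl)];
    size_t written = quantize_iq4_nl_4x4(src, dst, NROW, N_PER_ROW, /*quant_weights=*/NULL);
    // written == sizeof(dst): every 4 consecutive rows are now interleaved into
    // block_iq4_nlx4 groups that ggml_gemv/gemm_iq4_nl_4x4_q8_0 can consume.
    (void) written;
}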
@@ -1075,6 +1118,50 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     }
 }
 
+void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert (n % qk == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+    float sumf[4];
+    int sumi;
+
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
+                    }
+                    sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+    }
+}
+
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
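The scalar kernel above spells out the IQ4_NL decode: each qs byte packs two 4-bit indices into kvalues_iq4nl, the low nibble for the first half of the 32-weight block and the high nibble for the second half. A small worked example (the byte value is arbitrary, purely illustrative):

// One packed byte -> two non-linear quant values (scaled by the block delta d).
const uint8_t b = 0x3A;                      // arbitrary example byte
const int v_lo  = kvalues_iq4nl[b & 0x0F];   // index 0xA = 10 -> 25
const int v_hi  = kvalues_iq4nl[b >> 4];     // index 0x3 =  3 -> -65
// Dequantized weights would be d * 25 and d * (-65); the GEMV instead keeps the
// integer products against the q8_0 activations and applies d_b * d_a once per block.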
@@ -3057,3 +3144,59 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         }
     }
 }
+
+void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert (n % qk == 0);
+    assert (nr % 4 == 0);
+    assert (nc % ncols_interleaved == 0);
+
+    UNUSED(s);
+    UNUSED(bs);
+    UNUSED(vx);
+    UNUSED(vy);
+    UNUSED(nr);
+    UNUSED(nc);
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+    {
+        float sumf[4][4];
+        int sumi;
+
+        for (int y = 0; y < nr / 4; y++) {
+            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+            for (int x = 0; x < nc / ncols_interleaved; x++) {
+                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+                }
+                for (int l = 0; l < nb; l++) {
+                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                        for (int m = 0; m < 4; m++) {
+                            for (int j = 0; j < ncols_interleaved; j++) {
+                                sumi = 0;
+                                for (int i = 0; i < blocklen; ++i) {
+                                    const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                                    const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                             (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
+                                }
+                                sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                            }
+                        }
+                    }
+                }
+                for (int m = 0; m < 4; m++) {
+                    for (int j = 0; j < ncols_interleaved; j++)
+                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                }
+            }
+        }
+    }
+}
@@ -22,16 +22,19 @@ void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
 size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq4_nl_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 
 // GEMV
 void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 // GEMM
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 #ifdef __cplusplus
 }
@@ -418,6 +418,12 @@ typedef struct {
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
 
+typedef struct {
+    ggml_half d[4];            // deltas for 4 iq4_nl blocks
+    uint8_t   qs[QK4_NL * 2];  // nibbles / quants for 4 iq4_nl blocks
+} block_iq4_nlx4;
+static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
+
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
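Assuming QK4_NL = 32, the asserted size works out to the 4.50 bpw quoted in the quantize option string; a small sanity-check helper (illustrative, not part of the commit):

// One block_iq4_nlx4 = 4 fp16 deltas (8 bytes) + QK4_NL * 2 nibble bytes (64 bytes)
// = 72 bytes, and it covers 4 rows x QK4_NL = 128 weights.
static double iq4_nl_4x4_bits_per_weight(void) {
    const double bits    = 8.0 * sizeof(block_iq4_nlx4);   // 576 bits
    const double weights = 4.0 * QK4_NL;                   // 128 weights
    return bits / weights;                                 // 576 / 128 = 4.5 bpw
}

This is the same density as plain IQ4_NL; the x4 layout only reorders bytes for the interleaved kernels.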
@@ -424,6 +424,14 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .vec_dot_type     = GGML_TYPE_Q8_K,
         .nrows            = 1,
     },
+    [GGML_TYPE_IQ4_NL_4_4] = {
+        .vec_dot          = NULL,
+        .vec_dot_type     = GGML_TYPE_Q8_0,
+        .nrows            = 1,
+        .ncols            = 4,
+        .gemv             = ggml_gemv_iq4_nl_4x4_q8_0,
+        .gemm             = ggml_gemm_iq4_nl_4x4_q8_0,
+    },
 };
 
 const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@@ -9119,6 +9127,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_Q4_0_4_4:
         case GGML_TYPE_Q4_0_4_8:
         case GGML_TYPE_Q4_0_8_8:
+        case GGML_TYPE_IQ4_NL_4_4:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
@@ -15721,6 +15721,10 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
             {
                 VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
             } break;
+        case GGML_TYPE_IQ4_NL_4_4:
+            {
+                VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_iq4_nlx4, data, nbytes / sizeof(block_iq4_nlx4), 4);
+            } break;
 
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
@@ -852,6 +852,16 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .from_float               = quantize_row_tq2_0,
         .from_float_ref           = (ggml_from_float_t) quantize_row_tq2_0_ref,
     },
+    [GGML_TYPE_IQ4_NL_4_4] = {
+        .type_name                = "iq4_nl_4x4",
+        .blck_size                = QK4_NL,
+        .blck_size_interleave     = 4,
+        .type_size                = sizeof(block_iq4_nl),
+        .is_quantized             = true,
+        .to_float                 = NULL,
+        .from_float               = NULL,
+        .from_float_ref           = NULL,
+    }
 };
 
 const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
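Because blck_size stays QK4_NL and type_size stays sizeof(block_iq4_nl), a repacked row occupies exactly as many bytes as a plain IQ4_NL row; a quick check, assuming QK4_NL = 32 and the usual 18-byte block_iq4_nl (2-byte fp16 delta + 16 nibble bytes); the helper below is a sketch, not part of the patch:

// Row size computed the way ggml sizes any quantized row:
// (elements per row / blck_size) * type_size.
static size_t iq4_nl_4x4_row_size(int64_t n_per_row) {
    // e.g. n_per_row = 4096 -> 4096 / 32 = 128 blocks -> 128 * 18 = 2304 bytes
    return (size_t) (n_per_row / QK4_NL) * sizeof(block_iq4_nl);
}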
@@ -6854,6 +6864,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ4_NL_4_4: result = quantize_iq4_nl_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(ggml_fp16_t);
@@ -176,6 +176,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_0_8_8   = 35, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_TQ1_0      = 36, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_TQ2_0      = 37, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ4_NL_4_4 = 38, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -4434,6 +4434,7 @@ struct llama_model_loader {
                 case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
                 case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
                 case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
+                case GGML_TYPE_IQ4_NL_4_4: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL_4_4; break;
                 default:
                     {
                         LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -5198,6 +5199,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL_4_4: return "IQ4_NL_4_4 - 4.5 bpw";
 
         default: return "unknown, may not work";
     }
@@ -18146,6 +18148,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                new_type == GGML_TYPE_Q4_0_8_8) {
        new_type = GGML_TYPE_Q4_0;
    }
+   else if (new_type == GGML_TYPE_IQ4_NL_4_4) {
+       new_type = GGML_TYPE_IQ4_NL;
+   }
    else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
        new_type = GGML_TYPE_Q4_K;
    }
@@ -18471,6 +18476,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL_4_4: default_type = GGML_TYPE_IQ4_NL_4_4; break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -18816,6 +18822,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
             }
 
+            if (new_type == GGML_TYPE_IQ4_NL_4_4) {
+                if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
+                else chunk_size_multiplier = 4;
+            }
+
             LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
             fflush(stdout);