WIP: make i-quants work for QK_K = 64

2024-02-27 17:30:11 +02:00 · 2024-02-27 17:30:11 +02:00 · 13ba37f1aa
commit 13ba37f1aa
parent 0becb22ac0
3 changed files with 35 additions and 5 deletions
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -4227,6 +4227,9 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
 void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
    assert(k % QK_K == 0);
 #if QK_K == 64
    dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
 #else
    const int nb = k / QK_K;
    for (int i = 0; i < nb; i++) {
@@ -4246,6 +4249,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
            qs += 16;
        }
    }
 #endif
 }
 //===================================== Q8_K ==============================================
@@ -10455,6 +10459,9 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_K == 0);
 #if QK_K == 64
    ggml_vec_dot_iq4_nl_q8_0(n, s, bs, vx, bx, vy, by, nrc);
 #else
    const block_iq4_xs * restrict x = vx;
    const block_q8_K   * restrict y = vy;
@@ -10574,6 +10581,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
    }
    *s = sumf;
 #endif
 #endif
 }
 // ================================ IQ2 quantization =============================================
@@ -10921,7 +10929,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
    const int kMaxQ = 3;
-    const int nbl = n/256;
+    const int nbl = n/QK_K;
    block_iq2_xxs * y = vy;
@@ -11094,7 +11102,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
    const int kMaxQ = 3;
-    const int nbl = n/256;
+    const int nbl = n/QK_K;
    block_iq2_xs * y = vy;
@@ -12037,7 +12045,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
    GGML_ASSERT(n%QK_K == 0);
-    const int nbl = n/256;
+    const int nbl = n/QK_K;
    block_iq1_s * y = vy;
@@ -12315,6 +12323,9 @@ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * rest
 }
 size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
 #if QK_K == 64
    return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
 #else
    (void)hist;
    GGML_ASSERT(n_per_row%QK_K == 0);
    int nblock = n_per_row/QK_K;
@@ -12333,6 +12344,7 @@ size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, i
        qrow += nblock*sizeof(block_iq4_xs);
    }
    return nrow * nblock * sizeof(block_iq4_xs);
 #endif
 }
 void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
@@ -12363,7 +12375,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
    const int kMaxQ = 3;
-    const int nbl = n/256;
+    const int nbl = n/QK_K;
    block_iq2_s * y = vy;
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -230,6 +230,10 @@ typedef struct {
 } block_iq4_nl;
 static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
 #if QK_K == 64
 #define block_iq4_xs block_iq4_nl
 //typedef struct block_iq4_nl block_iq4_xs;
 #else
 typedef struct {
    ggml_fp16_t d;
    uint16_t scales_h;
@@ -237,6 +241,7 @@ typedef struct {
    uint8_t  qs[QK_K/2];
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
 #endif
 #ifdef __cplusplus
 extern "C" {
--- a/ggml.c
+++ b/ggml.c
@@ -728,14 +728,22 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
    },
    [GGML_TYPE_IQ4_XS] = {
        .type_name                = "iq4_xs",
 #if QK_K == 64
        .blck_size                = QK4_NL,
 #else
        .blck_size                = QK_K,
 #endif
        .type_size                = sizeof(block_iq4_xs),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_iq4_xs,
        .from_float               = quantize_row_iq4_xs,
        .from_float_reference     = (ggml_from_float_t)quantize_row_iq4_xs_reference,
        .vec_dot                  = ggml_vec_dot_iq4_xs_q8_K,
 #if QK_K == 64
        .vec_dot_type             = GGML_TYPE_Q8_0,
 #else
        .vec_dot_type             = GGML_TYPE_Q8_K,
 #endif
        .nrows                    = 1,
    },
    [GGML_TYPE_Q8_K] = {
@@ -19830,6 +19838,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                GGML_ASSERT(result == row_size * nrows);
            } break;
        case GGML_TYPE_IQ4_NL:
 #if QK_K == 64
        case GGML_TYPE_IQ4_XS:
 #endif
            {
                GGML_ASSERT(start % QK4_NL == 0);
                GGML_ASSERT(start % n_per_row == 0);
@@ -19838,15 +19849,17 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
                GGML_ASSERT(result == row_size * nrows);
            } break;
 #if QK_K != 64
        case GGML_TYPE_IQ4_XS:
            {
-                GGML_ASSERT(start % QK4_NL == 0);
+                GGML_ASSERT(start % QK_K == 0);
                GGML_ASSERT(start % n_per_row == 0);
                size_t start_row = start / n_per_row;
                size_t row_size = ggml_row_size(type, n_per_row);
                result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
                GGML_ASSERT(result == row_size * nrows);
            } break;
 #endif
        case GGML_TYPE_F16:
            {
                size_t elemsize = sizeof(ggml_fp16_t);