diff --git a/ggml-quants.c b/ggml-quants.c index 607d50925..42e6c6692 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -4227,6 +4227,9 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); +#if QK_K == 64 + dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k); +#else const int nb = k / QK_K; for (int i = 0; i < nb; i++) { @@ -4246,6 +4249,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, qs += 16; } } +#endif } //===================================== Q8_K ============================================== @@ -10455,6 +10459,9 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * UNUSED(by); UNUSED(bs); assert(n % QK_K == 0); +#if QK_K == 64 + ggml_vec_dot_iq4_nl_q8_0(n, s, bs, vx, bx, vy, by, nrc); +#else const block_iq4_xs * restrict x = vx; const block_q8_K * restrict y = vy; @@ -10574,6 +10581,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * } *s = sumf; #endif +#endif } // ================================ IQ2 quantization ============================================= @@ -10921,7 +10929,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict const int kMaxQ = 3; - const int nbl = n/256; + const int nbl = n/QK_K; block_iq2_xxs * y = vy; @@ -11094,7 +11102,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v const int kMaxQ = 3; - const int nbl = n/256; + const int nbl = n/QK_K; block_iq2_xs * y = vy; @@ -12037,7 +12045,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?"); GGML_ASSERT(n%QK_K == 0); - const int nbl = n/256; + const int nbl = n/QK_K; block_iq1_s * y = vy; @@ -12315,6 +12323,9 @@ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * rest } size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { +#if QK_K == 64 + return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights); +#else (void)hist; GGML_ASSERT(n_per_row%QK_K == 0); int nblock = n_per_row/QK_K; @@ -12333,6 +12344,7 @@ size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, i qrow += nblock*sizeof(block_iq4_xs); } return nrow * nblock * sizeof(block_iq4_xs); +#endif } void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) { @@ -12363,7 +12375,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy const int kMaxQ = 3; - const int nbl = n/256; + const int nbl = n/QK_K; block_iq2_s * y = vy; diff --git a/ggml-quants.h b/ggml-quants.h index 2c61134c4..316e35687 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -230,6 +230,10 @@ typedef struct { } block_iq4_nl; static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding"); +#if QK_K == 64 +#define block_iq4_xs block_iq4_nl +//typedef struct block_iq4_nl block_iq4_xs; +#else typedef struct { ggml_fp16_t d; uint16_t scales_h; @@ -237,6 +241,7 @@ typedef struct { uint8_t qs[QK_K/2]; } block_iq4_xs; static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding"); +#endif #ifdef __cplusplus extern "C" { diff --git a/ggml.c b/ggml.c index d66db3352..4591644ad 100644 --- a/ggml.c +++ b/ggml.c @@ -728,14 +728,22 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { }, [GGML_TYPE_IQ4_XS] = { .type_name = "iq4_xs", +#if QK_K == 64 + .blck_size = QK4_NL, +#else .blck_size = QK_K, +#endif .type_size = sizeof(block_iq4_xs), .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_iq4_xs, .from_float = quantize_row_iq4_xs, .from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference, .vec_dot = ggml_vec_dot_iq4_xs_q8_K, +#if QK_K == 64 + .vec_dot_type = GGML_TYPE_Q8_0, +#else .vec_dot_type = GGML_TYPE_Q8_K, +#endif .nrows = 1, }, [GGML_TYPE_Q8_K] = { @@ -19830,6 +19838,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i GGML_ASSERT(result == row_size * nrows); } break; case GGML_TYPE_IQ4_NL: +#if QK_K == 64 + case GGML_TYPE_IQ4_XS: +#endif { GGML_ASSERT(start % QK4_NL == 0); GGML_ASSERT(start % n_per_row == 0); @@ -19838,15 +19849,17 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); GGML_ASSERT(result == row_size * nrows); } break; +#if QK_K != 64 case GGML_TYPE_IQ4_XS: { - GGML_ASSERT(start % QK4_NL == 0); + GGML_ASSERT(start % QK_K == 0); GGML_ASSERT(start % n_per_row == 0); size_t start_row = start / n_per_row; size_t row_size = ggml_row_size(type, n_per_row); result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); GGML_ASSERT(result == row_size * nrows); } break; +#endif case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t);