From cd4a7c4cb49f8b0cc45e907c6b1bac3978ef0e53 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Thu, 21 Mar 2024 10:37:38 +0200 Subject: [PATCH 1/3] Make quantize_row_iq4_nl do the same thing is quantization on CUDA --- ggml-quants.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index 109dd6660..61989d135 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -11705,9 +11705,8 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l, float * scales, float * weight, uint8_t * L, const int8_t * values, - const float * quant_weights) { - - const int ntry = 7; + const float * quant_weights, + const int ntry) { float sigma2 = 0; for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j]; @@ -11823,7 +11822,7 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow for (int ibl = 0; ibl < nblock; ++ibl) { const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL; quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l, - &scale, weight, L, kvalues_iq4nl, qw); + &scale, weight, L, kvalues_iq4nl, qw, 7); } src += n_per_row; qrow += nblock*sizeof(block_iq4_nl); @@ -11832,9 +11831,21 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow } void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) { - assert(k % QK4_NL == 0); - block_iq4_nl * restrict y = vy; - quantize_row_iq4_nl_reference(x, y, k); + GGML_ASSERT(k%QK4_NL == 0); + int nblock = k/QK4_NL; + uint8_t L[QK4_NL]; + float weight[QK4_NL]; + uint16_t unused_h; + uint8_t * unused_l = NULL; + float scale; + block_iq4_nl * iq4 = (block_iq4_nl *)vy; + for (int ibl = 0; ibl < nblock; ++ibl) { + quantize_row_iq4_nl_impl(QK4_NL, 32, x + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l, + &scale, weight, L, kvalues_iq4nl, NULL, -1); + } + //assert(k % QK4_NL == 0); + //block_iq4_nl * restrict y = vy; + //quantize_row_iq4_nl_reference(x, y, k); } void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) { @@ -11857,7 +11868,7 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow for (int ibl = 0; ibl < nblock; ++ibl) { const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL; quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l, - scales, weight, L, kvalues_iq4nl, qw); + scales, weight, L, kvalues_iq4nl, qw, 7); } src += n_per_row; qrow += nblock*sizeof(block_iq4_xs); From 30eef31b070666ebe9537fd4c28cf4b1946a9fb6 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Thu, 21 Mar 2024 12:19:16 +0200 Subject: [PATCH 2/3] Make quantize_row_iq4_nl do the same thing is quantization on CUDA This time for real. backend-ops tests pass. --- ggml-quants.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index 61989d135..29cfafa3a 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -11718,6 +11718,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block float max_scale = 0, amax_scale = 0; for (int ib = 0; ib < super_block_size/block_size; ++ib) { const float * xb = x + ib*block_size; + uint8_t * Lb = L + ib*block_size; if (quant_weights) { const float * qw = quant_weights + ib*block_size; for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); @@ -11735,12 +11736,13 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block scales[ib] = 0; continue; } - float d = -max/values[0]; + float d = ntry > 0 ? -max/values[0] : max/values[0]; float id = 1/d; float sumqx = 0, sumq2 = 0; for (int j = 0; j < block_size; ++j) { float al = id*xb[j]; int l = best_index_int8(16, values, al); + Lb[j] = l; float q = values[l]; float w = weight[j]; sumqx += w*q*xb[j]; @@ -11795,9 +11797,11 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block } } else { dh[0] = GGML_FP32_TO_FP16(scales[0]); - float id = scales[0] ? 1/scales[0] : 0; - for (int j = 0; j < super_block_size; ++j) { - L[j] = best_index_int8(16, values, id*x[j]); + if (ntry > 0) { + float id = scales[0] ? 1/scales[0] : 0; + for (int j = 0; j < super_block_size; ++j) { + L[j] = best_index_int8(16, values, id*x[j]); + } } } From 68e4fed4d93d886ae4b4d5d6d9cc6c8602f83a7d Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Thu, 21 Mar 2024 12:18:03 +0100 Subject: [PATCH 3/3] Now fix test-quantize-fns --- ggml-quants.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index 29cfafa3a..2eaca0593 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -11847,14 +11847,11 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) { quantize_row_iq4_nl_impl(QK4_NL, 32, x + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l, &scale, weight, L, kvalues_iq4nl, NULL, -1); } - //assert(k % QK4_NL == 0); - //block_iq4_nl * restrict y = vy; - //quantize_row_iq4_nl_reference(x, y, k); } void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) { assert(k % QK4_NL == 0); - quantize_iq4_nl(x, y, 1, k, NULL); + quantize_row_iq4_nl(x, y, k); } size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {