From b335f73a6020019dcbb70b000312715ce80e05e9 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sat, 13 May 2023 01:30:11 +0800
Subject: [PATCH] BACKWARDS COMPAT QUANT SHIM is ready, but upstream model
 converter is BORKED. BORK BORK.

---
 ggml.c                          | 31 +++++++++--
 ggml.h                          | 14 +++--
 ggml_v2.c                       | 93 ++++++++++++++++-----------------
 otherarch/rwkv.cpp              | 12 ++---
 otherarch/tools/common-ggml.cpp | 14 ++---
 5 files changed, 95 insertions(+), 69 deletions(-)

diff --git a/ggml.c b/ggml.c
index dd4b75960..addf61303 100644
--- a/ggml.c
+++ b/ggml.c
@@ -393,9 +393,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
     }
 }
 
-//legacy functions
-#include "ggml_v2.c"
-
 //
 // timing
 //
@@ -724,6 +721,31 @@ typedef struct {
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
 
+#define QK4_2 16
+typedef struct {
+    ggml_fp16_t d;          // delta
+    uint8_t qs[QK4_2 / 2];  // nibbles / quants
+} block_q4_2;
+static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
+
+#define QK4_3 16
+typedef struct {
+    ggml_fp16_t d;          // delta
+    ggml_fp16_t m;          // min
+    uint8_t qs[QK4_3 / 2];  // nibbles / quants
+} block_q4_3;
+static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
+
+#define QK8_1 32
+typedef struct {
+    float d;          // delta
+    float s0;         // d * sum(qs[i]) low
+    float s1;         // d * sum(qs[i]) high
+    int8_t qs[QK8_1]; // quants
+} block_q8_1_v2;
+static_assert(sizeof(block_q8_1_v2) == 3*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
+
+
 // reference implementation for deterministic creation of model files
 static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
     static const int qk = QK4_0;
@@ -12398,3 +12420,6 @@ int ggml_cpu_has_vsx(void) {
 }
 
 ////////////////////////////////////////////////////////////////////////////////
+
+//legacy functions
+#include "ggml_v2.c"
\ No newline at end of file
diff --git a/ggml.h b/ggml.h
index 19b89d887..a045bfc3e 100644
--- a/ggml.h
+++ b/ggml.h
@@ -876,15 +876,21 @@ extern "C" {
     //
 
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
 
-    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_0_v2(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_1_v2(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_2_v2(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_3_v2(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q5_0_v2(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q5_1_v2(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q8_0_v2(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+    GGML_API size_t ggml_quantize_chunk_v2(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
     //
     // system info
     //
diff --git a/ggml_v2.c b/ggml_v2.c
index 84edf7754..7a4281691 100644
--- a/ggml_v2.c
+++ b/ggml_v2.c
@@ -43,9 +43,12 @@ static inline __m256i bytes_from_nibbles_32_v2(const uint8_t * rsi)
     bytes = _mm256_or_si256( low, high );
     return bytes;
 }
+#endif
+#endif
 
 
-
+#if __ARM_NEON
+#if !defined(__aarch64__)
 
 int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
     int8x8_t res;
@@ -133,38 +136,10 @@ uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
     return res;
 }
 
-
-
-
 #endif
 #endif
 
-#define QK4_2 16
-typedef struct {
-    ggml_fp16_t d;          // delta
-    uint8_t qs[QK4_2 / 2];  // nibbles / quants
-} block_q4_2_v2;
-static_assert(sizeof(block_q4_2_v2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
-
-#define QK4_3 16
-typedef struct {
-    ggml_fp16_t d;          // delta
-    ggml_fp16_t m;          // min
-    uint8_t qs[QK4_3 / 2];  // nibbles / quants
-} block_q4_3_v2;
-static_assert(sizeof(block_q4_3_v2) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
-
-
-#define QK8_1 32
-typedef struct {
-    float d;          // delta
-    float s0;         // d * sum(qs[i]) low
-    float s1;         // d * sum(qs[i]) high
-    int8_t qs[QK8_1]; // quants
-} block_q8_1_v2;
-static_assert(sizeof(block_q8_1_v2) == 3*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
-
 
 // reference implementation for deterministic creation of model files
 static void quantize_row_q4_0_reference_v2(const float * restrict x, block_q4_0 * restrict y, int k) {
     assert(k % QK4_0 == 0);
@@ -676,7 +651,7 @@ static void quantize_row_q4_1_v2(const float * restrict x, void * restrict vy, i
 }
 
 // reference implementation for deterministic creation of model files
-static void quantize_row_q4_2_reference_v2(const float * restrict x, block_q4_2_v2 * restrict y, int k) {
+static void quantize_row_q4_2_reference_v2(const float * restrict x, block_q4_2 * restrict y, int k) {
     assert(k % QK4_2 == 0);
 
     const int nb = k / QK4_2;
@@ -717,12 +692,12 @@ static void quantize_row_q4_2_reference_v2(const float * restrict x, block_q4_2_
 static void quantize_row_q4_2_v2(const float * restrict x, void * restrict vy, int k) {
     assert(k % QK4_2 == 0);
 
-    block_q4_2_v2 * restrict y = vy;
+    block_q4_2 * restrict y = vy;
 
     quantize_row_q4_2_reference_v2(x, y, k);
 }
 
-static void quantize_row_q4_3_reference_v2(const float * restrict x, block_q4_3_v2 * restrict y, int k) {
+static void quantize_row_q4_3_reference_v2(const float * restrict x, block_q4_3 * restrict y, int k) {
     assert(k % QK4_3 == 0);
 
     const int nb = k / QK4_3;
@@ -760,7 +735,7 @@ static void quantize_row_q4_3_reference_v2(const float * restrict x, block_q4_3_
 static void quantize_row_q4_3_v2(const float * restrict x, void * restrict vy, int k) {
     assert(k % QK4_3 == 0);
 
-    block_q4_3_v2 * restrict y = vy;
+    block_q4_3 * restrict y = vy;
 
     quantize_row_q4_3_reference_v2(x, y, k);
 }
@@ -1443,7 +1418,7 @@ static void dequantize_row_q4_2_v2(const void * restrict vx, float * restrict y,
     assert(k % QK4_2 == 0);
     const int nb = k / QK4_2;
 
-    const block_q4_2_v2 * restrict x = vx;
+    const block_q4_2 * restrict x = vx;
 
     for (int i = 0; i < nb; i++) {
         const float d = GGML_FP16_TO_FP32(x[i].d);
@@ -1472,7 +1447,7 @@ static void dequantize_row_q4_3_v2(const void * restrict vx, float * restrict y,
     assert(k % QK4_3 == 0);
     const int nb = k / QK4_3;
 
-    const block_q4_3_v2 * restrict x = vx;
+    const block_q4_3 * restrict x = vx;
 
     for (int i = 0; i < nb; i++) {
         const float d = GGML_FP16_TO_FP32(x[i].d);
@@ -1975,7 +1950,7 @@ static void ggml_vec_dot_q4_2_q8_0_v2(const int n, float * restrict s, const voi
     assert(nb % 2 == 0);
     assert(QK8_0 == 2*QK4_2);
 
-    const block_q4_2_v2 * restrict x = vx;
+    const block_q4_2 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
 
 #if defined(__ARM_NEON)
@@ -1983,10 +1958,10 @@ static void ggml_vec_dot_q4_2_q8_0_v2(const int n, float * restrict s, const voi
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
 
     for (int i = 0; i < nb; i += 2) {
-        const block_q4_2_v2 * restrict x0_0 = &x[2*(i + 0) + 0];
-        const block_q4_2_v2 * restrict x0_1 = &x[2*(i + 0) + 1];
-        const block_q4_2_v2 * restrict x1_0 = &x[2*(i + 1) + 0];
-        const block_q4_2_v2 * restrict x1_1 = &x[2*(i + 1) + 1];
+        const block_q4_2 * restrict x0_0 = &x[2*(i + 0) + 0];
+        const block_q4_2 * restrict x0_1 = &x[2*(i + 0) + 1];
+        const block_q4_2 * restrict x1_0 = &x[2*(i + 1) + 0];
+        const block_q4_2 * restrict x1_1 = &x[2*(i + 1) + 1];
 
         const block_q8_0 * restrict y0 = &y[i + 0];
         const block_q8_0 * restrict y1 = &y[i + 1];
@@ -2132,7 +2107,7 @@ static void ggml_vec_dot_q4_3_q8_1_v2(const int n, float * restrict s, const voi
     assert(nb % 2 == 0);
    assert(QK8_1 == 2*QK4_3);
 
-    const block_q4_3_v2 * restrict x = vx;
+    const block_q4_3 * restrict x = vx;
     const block_q8_1_v2 * restrict y = vy;
 
 #if defined(__ARM_NEON)
@@ -2143,8 +2118,8 @@ static void ggml_vec_dot_q4_3_q8_1_v2(const int n, float * restrict s, const voi
     float summs1 = 0.0f;
 
     for (int i = 0; i < nb; ++i) {
-        const block_q4_3_v2 * restrict x0_0 = &x[2*(i + 0) + 0];
-        const block_q4_3_v2 * restrict x0_1 = &x[2*(i + 0) + 1];
+        const block_q4_3 * restrict x0_0 = &x[2*(i + 0) + 0];
+        const block_q4_3 * restrict x0_1 = &x[2*(i + 0) + 1];
 
         const block_q8_1_v2 * restrict y0 = &y[i + 0];
 
@@ -2816,7 +2791,7 @@ size_t ggml_quantize_q4_2_v2(const float * src, void * dst, int n, int k, int64_
     const int nb = k / QK4_2;
 
     for (int j = 0; j < n; j += k) {
-        block_q4_2_v2 * restrict y = (block_q4_2_v2 *)dst + j/QK4_2;
+        block_q4_2 * restrict y = (block_q4_2 *)dst + j/QK4_2;
 
         quantize_row_q4_2_reference_v2(src + j, y, k);
 
@@ -2831,7 +2806,7 @@ size_t ggml_quantize_q4_2_v2(const float * src, void * dst, int n, int k, int64_
         }
     }
 
-    return (n/QK4_2*sizeof(block_q4_2_v2));
+    return (n/QK4_2*sizeof(block_q4_2));
 }
 
 size_t ggml_quantize_q4_3_v2(const float * src, void * dst, int n, int k, int64_t * hist) {
@@ -2839,7 +2814,7 @@ size_t ggml_quantize_q4_3_v2(const float * src, void * dst, int n, int k, int64_
     const int nb = k / QK4_3;
 
     for (int j = 0; j < n; j += k) {
-        block_q4_3_v2 * restrict y = (block_q4_3_v2 *)dst + j/QK4_3;
+        block_q4_3 * restrict y = (block_q4_3 *)dst + j/QK4_3;
 
         quantize_row_q4_3_reference_v2(src + j, y, k);
 
@@ -2854,7 +2829,7 @@ size_t ggml_quantize_q4_3_v2(const float * src, void * dst, int n, int k, int64_
         }
     }
 
-    return (n/QK4_3*sizeof(block_q4_3_v2));
+    return (n/QK4_3*sizeof(block_q4_3));
 }
 
 size_t ggml_quantize_q5_0_v2(const float * src, void * dst, int n, int k, int64_t * hist) {
@@ -2917,6 +2892,26 @@ size_t ggml_quantize_q5_1_v2(const float * src, void * dst, int n, int k, int64_
     return (n/QK5_1*sizeof(block_q5_1));
 }
 
+size_t ggml_quantize_q8_0_v2(const float * src, void * dst, int n, int k, int64_t * hist) {
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    for (int j = 0; j < n; j += k) {
+        block_q8_0 * restrict y = (block_q8_0 *)dst + j/QK8_0;
+
+        quantize_row_q8_0_reference_v2(src + j, y, k);
+
+        for (int i = 0; i < nb; i++) {
+            for (int l = 0; l < QK8_0; ++l) {
+                const int8_t vi = y[i].qs[l];
+
+                hist[vi/16 + 8]++;
+            }
+        }
+    }
+
+    return (n/QK8_0*sizeof(block_q8_0));
+}
 
 //TODO: integrate
 size_t ggml_quantize_chunk_v2(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
@@ -2937,13 +2932,13 @@ size_t ggml_quantize_chunk_v2(enum ggml_type type, const float * src, void * dst
         case GGML_TYPE_Q4_2:
             {
                 GGML_ASSERT(start % QK4_2 == 0);
-                block_q4_2_v2 * block = (block_q4_2_v2*)dst + start / QK4_2;
+                block_q4_2 * block = (block_q4_2*)dst + start / QK4_2;
                 result = ggml_quantize_q4_2_v2(src + start, block, n, n, hist);
             } break;
         case GGML_TYPE_Q4_3:
            {
                 GGML_ASSERT(start % QK4_3 == 0);
-                block_q4_3_v2 * block = (block_q4_3_v2*)dst + start / QK4_3;
+                block_q4_3 * block = (block_q4_3*)dst + start / QK4_3;
                 result = ggml_quantize_q4_3_v2(src + start, block, n, n, hist);
             } break;
         case GGML_TYPE_Q5_0:
diff --git a/otherarch/rwkv.cpp b/otherarch/rwkv.cpp
index 702872020..99849c7f7 100644
--- a/otherarch/rwkv.cpp
+++ b/otherarch/rwkv.cpp
@@ -773,22 +773,22 @@ bool rwkv_quantize_model_file(const char * model_file_path_in, const char * mode
 
             switch (type) {
                 case GGML_TYPE_Q4_0:
-                    cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    cur_size = ggml_quantize_q4_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                     break;
                 case GGML_TYPE_Q4_1:
-                    cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    cur_size = ggml_quantize_q4_1_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                     break;
                 case GGML_TYPE_Q4_2:
-                    cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    cur_size = ggml_quantize_q4_2_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                     break;
                 case GGML_TYPE_Q5_0:
-                    cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    cur_size = ggml_quantize_q5_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                     break;
                 case GGML_TYPE_Q5_1:
-                    cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    cur_size = ggml_quantize_q5_1_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                     break;
                 case GGML_TYPE_Q8_0:
-                    cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    cur_size = ggml_quantize_q8_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                     break;
                 default: {
                     fprintf(stderr, "unsupported quantization type %d\n", type);
diff --git a/otherarch/tools/common-ggml.cpp b/otherarch/tools/common-ggml.cpp
index 9899247e5..350e95039 100644
--- a/otherarch/tools/common-ggml.cpp
+++ b/otherarch/tools/common-ggml.cpp
@@ -166,31 +166,31 @@ bool ggml_common_quantize_0(
                 switch ((ggml_type) ttype) {
                     case GGML_TYPE_Q4_0:
                         {
-                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                            cur_size = ggml_quantize_q4_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                         } break;
                     case GGML_TYPE_Q4_1:
                         {
-                            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                            cur_size = ggml_quantize_q4_1_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                         } break;
                     case GGML_TYPE_Q4_2:
                         {
-                            cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                            cur_size = ggml_quantize_q4_2_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                         } break;
                     case GGML_TYPE_Q4_3:
                         {
-                            cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                            cur_size = ggml_quantize_q4_3_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                         } break;
                     case GGML_TYPE_Q5_0:
                         {
-                            cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                            cur_size = ggml_quantize_q5_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                         } break;
                     case GGML_TYPE_Q5_1:
                        {
-                            cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                            cur_size = ggml_quantize_q5_1_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                         } break;
                     case GGML_TYPE_Q8_0:
                         {
-                            cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                            cur_size = ggml_quantize_q8_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                         } break;
                     case GGML_TYPE_F32:
                     case GGML_TYPE_F16: