From ab0a7d1531b3719486ac284933b71a5655f0a32a Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Thu, 1 Jun 2023 22:17:53 +0800
Subject: [PATCH] Add normalfloat4 as Q4_2

---
 examples/quantize/quantize.cpp |   1 +
 ggml.c                         | 214 +++++++++++++++++++++++++++++++++
 ggml.h                         |   2 +-
 llama.h                        |   2 +-
 4 files changed, 217 insertions(+), 2 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 769dd36a4..73b01420b 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -9,6 +9,7 @@
 static const std::map LLAMA_FTYPE_MAP = {
   {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
   {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
+  {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
   {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
   {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
   {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
diff --git a/ggml.c b/ggml.c
index 4cd0d7211..3c0e2b9aa 100644
--- a/ggml.c
+++ b/ggml.c
@@ -783,6 +783,13 @@ typedef struct {
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
 
+#define QK4_2 32 // number of values stored in one block_q4_2
+typedef struct {
+    ggml_fp16_t d;          // per-block scale; the quantizer stores the block's absolute max here
+    uint8_t qs[QK4_2 / 2];  // packed 4-bit codes: low nibble = element j, high nibble = element j + QK4_2/2
+} block_q4_2;
+static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
+
 #define QK5_0 32
 typedef struct {
     ggml_fp16_t d;         // delta
@@ -894,10 +901,101 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r
     }
 }
 
+static inline unsigned char dQuantizeNF4(float x)
+{
+  // Returns the 4-bit code (0..15) of the dhDequantizeNF4 level nearest to x; every
+  // threshold below is the midpoint of two adjacent levels. The original note credits
+  // test_normal_map_tree in tests/test_functional.py, a file this patch does not add.
+  if(x > 0.03979014977812767f)
+    if(x > 0.3893125355243683f) // 1
+      if(x > 0.6427869200706482f) // 11
+        if(x > 0.8614784181118011f) // 111
+          return 0b1111;
+        else
+          return 0b1110;
+      else
+        if(x > 0.5016634166240692f) // 110
+          return 0b1101;
+        else
+          return 0b1100;
+    else
+      if(x > 0.2035212516784668f) // 10
+        if(x > 0.2920137718319893f) // 101
+          return 0b1011;
+        else
+          return 0b1010;
+      else
+        if(x > 0.1202552504837513f) // 100
+          return 0b1001;
+        else
+          return 0b1000;
+  else
+    if(x > -0.33967943489551544f) // 0
+      if(x > -0.13791173323988914f) // 01
+        if(x > -0.045525018125772476f) // 011
+          return 0b0111;
+        else
+          return 0b0110;
+      else
+        if(x > -0.23460740596055984f) // 010
+          return 0b0101;
+        else
+          return 0b0100;
+    else
+      if(x > -0.6106329262256622f) // 00
+        if(x > -0.4599952697753906f) // 001
+          return 0b0011;
+        else
+          return 0b0010;
+      else
+        if(x > -0.8480964004993439f) // 000
+          return 0b0001;
+        else
+          return 0b0000;
+}
+
 static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
     quantize_row_q4_1_reference(x, y, k);
 }
 
+static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * restrict y, int k) {
+    static const int qk = QK4_2;
+    // Quantizes k floats from x (k must be a multiple of QK4_2) into k/QK4_2 blocks in y.
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i*qk + j];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+            }
+        }
+        // id rescales the block into [-1, 1]; an all-zero block uses 0 instead of dividing by zero
+        const float id = amax ? 1.0f/amax : 0.0f;
+        // the stored scale is the absolute max itself; dequantization multiplies each level by it
+        y[i].d = GGML_FP32_TO_FP16(amax);
+
+        for (int j = 0; j < qk/2; ++j) {
+            const float x0 = x[i*qk + 0    + j]*id;
+            const float x1 = x[i*qk + qk/2 + j]*id;
+
+            const uint8_t xi0 = dQuantizeNF4(x0);
+            const uint8_t xi1 = dQuantizeNF4(x1);
+            // first half of the block goes in the low nibble, second half in the high nibble
+            y[i].qs[j]  = xi0;
+            y[i].qs[j] |= xi1 << 4;
+        }
+    }
+}
+
+static void quantize_row_q4_2(const float * restrict x, void * restrict y, int k) {
+    quantize_row_q4_2_reference(x, y, k); // no separate optimized path: forwards to the reference version
+}
+
 static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
     static const int qk = QK5_0;
 
@@ -1439,6 +1537,80 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict
     }
 }
 
+static inline float dhDequantizeNF4(unsigned char val)
+{
+  // Maps a 4-bit code to its level in [-1, 1], testing the bits from most to least significant.
+  // Original note: values generated by test_normal_map_tree in the file tests/test_functional.py.
+  if((val & 0b1000) == 8)
+    if((val & 0b0100) == 4) // 1
+      if((val & 0b0010) == 2) // 11
+        if((val & 0b0001) == 1) // 111
+          return 1.0f;
+        else
+          return 0.7229568362236023f;
+      else
+        if((val & 0b0001) == 1) // 110
+          return 0.5626170039176941f;
+        else
+          return 0.44070982933044434f;
+    else
+      if((val & 0b0010) == 2) //10
+        if((val & 0b0001) == 1) // 101
+          return 0.33791524171829224f;
+        else
+          return 0.24611230194568634f;
+      else
+        if((val & 0b0001) == 1) // 100
+          return 0.16093020141124725f;
+        else
+          return 0.07958029955625534f;
+
+  else
+    if((val & 0b0100) == 4) // 0
+      if((val & 0b0010) == 2) //01
+        if((val & 0b0001) == 1) // 011
+          return 0.0f;
+        else
+          return -0.09105003625154495f;
+      else
+        if((val & 0b0001) == 1) // 010
+          return -0.18477343022823334f;
+        else
+          return -0.28444138169288635f;
+    else
+      if((val & 0b0010) == 2) //00
+        if((val & 0b0001) == 1) // 001
+          return -0.39491748809814453f;
+        else
+          return -0.5250730514526367f;
+      else
+        if((val & 0b0001) == 1) // 000
+          return -0.6961928009986877f;
+        else
+          return -1.0f;
+
+}
+
+static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict y, int k) {
+    static const int qk = QK4_2;
+    // Expands k/QK4_2 blocks from x back into k floats in y (k must be a multiple of QK4_2).
+    assert(k % qk == 0);
+
+    const int nb = k / qk;
+
+    for (int i = 0; i < nb; i++) {
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        for (int j = 0; j < qk/2; ++j) {
+            const int x0 = (x[i].qs[j] & 0x0F);
+            const int x1 = (x[i].qs[j] >>   4);
+            // low nibble -> first half of the block, high nibble -> second half; level times block scale
+            y[i*qk + j + 0   ] = dhDequantizeNF4(x0) * d;
+            y[i*qk + j + qk/2] = dhDequantizeNF4(x1) * d;
+        }
+    }
+}
+
 static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) {
     static const int qk = QK5_0;
 
@@ -1512,6 +1684,7 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
 
 static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -1533,6 +1706,14 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
         .vec_dot_q                = ggml_vec_dot_q4_1_q8_1,
         .vec_dot_type             = GGML_TYPE_Q8_1,
     },
+    [GGML_TYPE_Q4_2] = {
+        .dequantize_row_q         = (dequantize_row_q_t)dequantize_row_q4_2,
+        .quantize_row_q           = quantize_row_q4_2,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_reference,
+        .quantize_row_q_dot       = quantize_row_q8_0,
+        .vec_dot_q                = ggml_vec_dot_q4_2_q8_0, // reads its second operand as block_q8_0
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+    },
     [GGML_TYPE_Q5_0] = {
         .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q5_0,
         .quantize_row_q           = quantize_row_q5_0,
@@ -2564,6 +2745,35 @@ static void ggml_vec_dot_q4_1_q8_1(const int n,
float * restrict s, const void *
 #endif
 }
 
+static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    // Dot product over n values: vx holds Q4_2 (NF4) blocks, vy holds Q8_0 blocks; the result is written to *s.
+    assert(n % qk == 0);
+    assert(nb % 2 == 0); // NOTE(review): the scalar loop below does not itself need an even block count
+
+    const block_q4_2 * restrict x = vx;
+    const block_q8_0 * restrict y = vy;
+
+    // scalar path: each 4-bit code is mapped to a float level, so the per-block sum is a float as well
+    float sumf = 0.0f;
+
+    for (int i = 0; i < nb; i++) {
+        float sumi = 0.0f; // float accumulator: an int here would truncate the fractional products on every +=
+
+        for (int j = 0; j < qk/2; ++j) {
+            const int v0 = (x[i].qs[j] & 0x0F);
+            const int v1 = (x[i].qs[j] >>   4);
+            // low nibble pairs with y element j, high nibble with y element j + qk/2
+            sumi += (dhDequantizeNF4(v0) * y[i].qs[j]) + (dhDequantizeNF4(v1) * y[i].qs[j + qk/2]);
+        }
+        // apply both per-block scales once per block rather than once per element
+        sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
+    }
+
+    *s = sumf;
+}
+
 static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
     const int qk = QK8_0;
     const int nb = n / qk;
@@ -3440,6 +3650,7 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F16]  = 1,
     [GGML_TYPE_Q4_0] = QK4_0,
     [GGML_TYPE_Q4_1] = QK4_1,
+    [GGML_TYPE_Q4_2] = QK4_2,
     [GGML_TYPE_Q5_0] = QK5_0,
     [GGML_TYPE_Q5_1] = QK5_1,
     [GGML_TYPE_Q8_0] = QK8_0,
@@ -3455,6 +3666,7 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F16]  = sizeof(ggml_fp16_t),
     [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
     [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
+    [GGML_TYPE_Q4_2] = sizeof(block_q4_2),
     [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
     [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
     [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
@@ -3471,6 +3683,7 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F16]  = "f16",
     [GGML_TYPE_Q4_0] = "q4_0",
     [GGML_TYPE_Q4_1] = "q4_1",
+    [GGML_TYPE_Q4_2] = "q4_2",
     [GGML_TYPE_Q5_0] = "q5_0",
     [GGML_TYPE_Q5_1] = "q5_1",
     [GGML_TYPE_Q8_0] = "q8_0",
@@ -3486,6 +3699,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F16]  = false,
     [GGML_TYPE_Q4_0] = true,
     [GGML_TYPE_Q4_1] = true,
+    [GGML_TYPE_Q4_2] = true,
     [GGML_TYPE_Q5_0] = true,
     [GGML_TYPE_Q5_1] = true,
[GGML_TYPE_Q8_0] = true, diff --git a/ggml.h b/ggml.h index 60c0ad8bf..983077735 100644 --- a/ggml.h +++ b/ggml.h @@ -235,7 +235,7 @@ extern "C" { GGML_TYPE_F16 = 1, GGML_TYPE_Q4_0 = 2, GGML_TYPE_Q4_1 = 3, - // GGML_TYPE_Q4_2 = 4, support has been removed + GGML_TYPE_Q4_2 = 4, // NormalFloat4 // GGML_TYPE_Q4_3 (5) support has been removed GGML_TYPE_Q5_0 = 6, GGML_TYPE_Q5_1 = 7, diff --git a/llama.h b/llama.h index c6b0a2889..f601c1bf8 100644 --- a/llama.h +++ b/llama.h @@ -89,7 +89,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed + LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // normal float // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors