Add normalfloat4 as Q4_2

Howard Su 2023-06-01 22:17:53 +08:00
parent ffb06a345e
commit ab0a7d1531
4 changed files with 217 additions and 2 deletions


@@ -9,6 +9,7 @@
static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
    {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
    {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
    {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
    {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
    {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
    {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},

ggml.c (214 changes)

@@ -783,6 +783,13 @@ typedef struct {
} block_q4_1;
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
#define QK4_2 32
typedef struct {
    ggml_fp16_t d;          // delta
    uint8_t qs[QK4_2 / 2];  // nibbles / quants
} block_q4_2;
static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
#define QK5_0 32
typedef struct {
    ggml_fp16_t d; // delta
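A quick storage sanity check on the block_q4_2 layout added above (a standalone sketch, not part of the patch): each block packs QK4_2 = 32 weights into 16 nibble bytes plus one fp16 scale, i.e. 18 bytes per 32 weights, or 4.5 bits per weight.

    #include <stdio.h>

    int main(void) {
        const int    qk          = 32;          // QK4_2
        const size_t block_bytes = 2 + qk / 2;  // sizeof(ggml_fp16_t) + nibble bytes = 18
        printf("q4_2: %.2f bits per weight\n", 8.0 * (double) block_bytes / qk);  // prints 4.50
        return 0;
    }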
@@ -894,10 +901,101 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r
    }
}
static inline unsigned char dQuantizeNF4(float x)
{
    // the values for this tree were generated by test_normal_map_tree
    // in the file tests/test_functional.py
    if(x > 0.03979014977812767f)
        if(x > 0.3893125355243683f) // 1
            if(x > 0.6427869200706482f) // 11
                if(x > 0.8614784181118011f) // 111
                    return 0b1111;
                else
                    return 0b1110;
            else
                if(x > 0.5016634166240692f) // 110
                    return 0b1101;
                else
                    return 0b1100;
        else
            if(x > 0.2035212516784668f) // 10
                if(x > 0.2920137718319893f) // 101
                    return 0b1011;
                else
                    return 0b1010;
            else
                if(x > 0.1202552504837513f) // 100
                    return 0b1001;
                else
                    return 0b1000;
    else
        if(x > -0.33967943489551544f) // 0
            if(x > -0.13791173323988914f) // 01
                if(x > -0.045525018125772476f) // 011
                    return 0b0111;
                else
                    return 0b0110;
            else
                if(x > -0.23460740596055984f) // 010
                    return 0b0101;
                else
                    return 0b0100;
        else
            if(x > -0.6106329262256622f) // 00
                if(x > -0.4599952697753906f) // 001
                    return 0b0011;
                else
                    return 0b0010;
            else
                if(x > -0.8480964004993439f) // 000
                    return 0b0001;
                else
                    return 0b0000;
}
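dQuantizeNF4 encodes a value that has already been scaled into [-1, 1] by walking a binary decision tree whose thresholds are the midpoints between adjacent NF4 codebook entries (the entries are exactly the values dhDequantizeNF4 returns further down). A hypothetical, equivalent formulation as a nearest-entry search over an explicit codebook table, shown only to make that mapping visible (nf4_codebook and quantize_nf4_lookup are illustrative names, not part of the patch):

    #include <math.h>

    // the 16 NF4 codebook values in nibble order 0b0000..0b1111,
    // copied from dhDequantizeNF4() later in this patch
    static const float nf4_codebook[16] = {
        -1.0f,                 -0.6961928009986877f,  -0.5250730514526367f,  -0.39491748809814453f,
        -0.28444138169288635f, -0.18477343022823334f, -0.09105003625154495f,  0.0f,
         0.07958029955625534f,  0.16093020141124725f,  0.24611230194568634f,  0.33791524171829224f,
         0.44070982933044434f,  0.5626170039176941f,   0.7229568362236023f,   1.0f,
    };

    // nearest-codebook-entry search; equivalent to the branch tree because
    // the tree's thresholds sit at the midpoints between adjacent entries
    static unsigned char quantize_nf4_lookup(float x) {
        unsigned char best = 0;
        float best_err = fabsf(x - nf4_codebook[0]);
        for (unsigned char i = 1; i < 16; ++i) {
            const float err = fabsf(x - nf4_codebook[i]);
            if (err < best_err) {
                best_err = err;
                best = i;
            }
        }
        return best;
    }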
static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
    quantize_row_q4_1_reference(x, y, k);
}
static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * restrict y, int k) {
    static const int qk = QK4_2;

    assert(k % qk == 0);

    const int nb = k / qk;

    for (int i = 0; i < nb; i++) {
        float amax = 0.0f; // absolute max

        for (int j = 0; j < qk; j++) {
            const float v = x[i*qk + j];
            if (amax < fabsf(v)) {
                amax = fabsf(v);
            }
        }

        const float id = amax ? 1.0f/amax : 0.0f;

        y[i].d = GGML_FP32_TO_FP16(amax);

        for (int j = 0; j < qk/2; ++j) {
            const float x0 = x[i*qk + 0    + j]*id;
            const float x1 = x[i*qk + qk/2 + j]*id;

            const uint8_t xi0 = dQuantizeNF4(x0);
            const uint8_t xi1 = dQuantizeNF4(x1);

            y[i].qs[j]  = xi0;
            y[i].qs[j] |= xi1 << 4;
        }
    }
}

static void quantize_row_q4_2(const float * restrict x, void * restrict y, int k) {
    quantize_row_q4_2_reference(x, y, k);
}
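Since the block stores amax as its scale and the nibbles only index the fixed codebook, a round trip through quantize and dequantize is an easy way to eyeball the error this introduces. A minimal sketch, assuming it is pasted into ggml.c below both q4_2 row helpers so the file-local block_q4_2 type and functions are visible; the helper name is made up:

    static float q4_2_roundtrip_max_err(const float * src, int k) {
        // k must be a multiple of QK4_2 and small enough for the stack buffers below
        block_q4_2 q[64];
        float out[64 * QK4_2];

        quantize_row_q4_2_reference(src, q, k);
        dequantize_row_q4_2(q, out, k);

        float max_err = 0.0f;
        for (int i = 0; i < k; i++) {
            const float e = fabsf(src[i] - out[i]);
            if (e > max_err) {
                max_err = e;
            }
        }
        return max_err;
    }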
static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
    static const int qk = QK5_0;
@@ -1439,6 +1537,80 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict
    }
}
static inline float dhDequantizeNF4(unsigned char val)
{
    // the values for this tree were generated by test_normal_map_tree
    // in the file tests/test_functional.py
    if((val & 0b1000) == 8)
        if((val & 0b0100) == 4) // 1
            if((val & 0b0010) == 2) // 11
                if((val & 0b0001) == 1) // 111
                    return 1.0f;
                else
                    return 0.7229568362236023f;
            else
                if((val & 0b0001) == 1) // 110
                    return 0.5626170039176941f;
                else
                    return 0.44070982933044434f;
        else
            if((val & 0b0010) == 2) // 10
                if((val & 0b0001) == 1) // 101
                    return 0.33791524171829224f;
                else
                    return 0.24611230194568634f;
            else
                if((val & 0b0001) == 1) // 100
                    return 0.16093020141124725f;
                else
                    return 0.07958029955625534f;
    else
        if((val & 0b0100) == 4) // 0
            if((val & 0b0010) == 2) // 01
                if((val & 0b0001) == 1) // 011
                    return 0.0f;
                else
                    return -0.09105003625154495f;
            else
                if((val & 0b0001) == 1) // 010
                    return -0.18477343022823334f;
                else
                    return -0.28444138169288635f;
        else
            if((val & 0b0010) == 2) // 00
                if((val & 0b0001) == 1) // 001
                    return -0.39491748809814453f;
                else
                    return -0.5250730514526367f;
            else
                if((val & 0b0001) == 1) // 000
                    return -0.6961928009986877f;
                else
                    return -1.0f;
}
static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict y, int k) {
    static const int qk = QK4_2;

    assert(k % qk == 0);

    const int nb = k / qk;

    for (int i = 0; i < nb; i++) {
        const float d = GGML_FP16_TO_FP32(x[i].d);

        for (int j = 0; j < qk/2; ++j) {
            const int x0 = (x[i].qs[j] & 0x0F);
            const int x1 = (x[i].qs[j] >> 4);

            y[i*qk + j + 0   ] = dhDequantizeNF4(x0) * d;
            y[i*qk + j + qk/2] = dhDequantizeNF4(x1) * d;
        }
    }
}
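Because the four-bit code is simply an index into the fixed 16-entry codebook, the branch tree in dhDequantizeNF4 could equally be a direct table lookup. A sketch reusing the hypothetical nf4_codebook[] array from the earlier example:

    static inline float dequantize_nf4_lookup(unsigned char val) {
        return nf4_codebook[val & 0x0F];  // same result as dhDequantizeNF4 for val in 0..15
    }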
static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) {
    static const int qk = QK5_0;
@@ -1512,6 +1684,7 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -1533,6 +1706,14 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
        .vec_dot_q = ggml_vec_dot_q4_1_q8_1,
        .vec_dot_type = GGML_TYPE_Q8_1,
    },
    [GGML_TYPE_Q4_2] = {
        .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_2,
        .quantize_row_q = quantize_row_q4_2,
        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_reference,
        .quantize_row_q_dot = quantize_row_q8_0,
        .vec_dot_q = ggml_vec_dot_q4_2_q8_0,
        .vec_dot_type = GGML_TYPE_Q8_0,
    },
    [GGML_TYPE_Q5_0] = {
        .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_0,
        .quantize_row_q = quantize_row_q5_0,
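Registering Q4_2 in quantize_fns with vec_dot_type = GGML_TYPE_Q8_0 means the mat-mul path quantizes activation rows with quantize_row_q8_0 and then calls ggml_vec_dot_q4_2_q8_0 on the pair, the same pattern Q4_0 and Q5_0 use. Roughly how calling code drives the table (illustrative only; activations, q8_buf, q4_2_row and n are placeholder variables):

    const quantize_fns_t * fns = &quantize_fns[GGML_TYPE_Q4_2];

    float result;
    fns->quantize_row_q_dot(activations, q8_buf, n);  // f32 activations -> block_q8_0
    fns->vec_dot_q(n, &result, q4_2_row, q8_buf);     // q4_2 row dotted with q8_0 row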
@@ -2564,6 +2745,35 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
#endif
}
static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nb % 2 == 0);

    const block_q4_2 * restrict x = vx;
    const block_q8_0 * restrict y = vy;

    // scalar
    float sumf = 0.0;

    for (int i = 0; i < nb; i++) {
        // NF4 codebook values are fractional, so accumulate the block sum in float
        float sumi = 0.0f;

        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[i].qs[j] & 0x0F);
            const int v1 = (x[i].qs[j] >> 4);

            sumi += (dhDequantizeNF4(v0) * y[i].qs[j]) + (dhDequantizeNF4(v1) * y[i].qs[j + qk/2]);
        }

        sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
    }

    *s = sumf;
}
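Per block, this kernel computes d_x * d_y * sum_j nf4(x_j) * y_j: each nibble is decoded to its codebook value, multiplied by the corresponding signed 8-bit activation, and the block sum is scaled by both fp16 deltas. The same accumulation written against the hypothetical lookup table, as a compact reference (q4_2_q8_0_block_dot is an invented name; assumes it sits in ggml.c next to the block types):

    static float q4_2_q8_0_block_dot(const block_q4_2 * x, const block_q8_0 * y) {
        float sum = 0.0f;
        for (int j = 0; j < QK8_0/2; ++j) {
            sum += nf4_codebook[x->qs[j] & 0x0F] * y->qs[j]            // low nibbles pair with the first half
                 + nf4_codebook[x->qs[j] >>   4] * y->qs[j + QK8_0/2]; // high nibbles with the second half
        }
        return sum * GGML_FP16_TO_FP32(x->d) * GGML_FP16_TO_FP32(y->d);
    }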
static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
    const int qk = QK8_0;
    const int nb = n / qk;
@@ -3440,6 +3650,7 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
    [GGML_TYPE_F16] = 1,
    [GGML_TYPE_Q4_0] = QK4_0,
    [GGML_TYPE_Q4_1] = QK4_1,
    [GGML_TYPE_Q4_2] = QK4_2,
    [GGML_TYPE_Q5_0] = QK5_0,
    [GGML_TYPE_Q5_1] = QK5_1,
    [GGML_TYPE_Q8_0] = QK8_0,
@@ -3455,6 +3666,7 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
    [GGML_TYPE_F16] = sizeof(ggml_fp16_t),
    [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
    [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
    [GGML_TYPE_Q4_2] = sizeof(block_q4_2),
    [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
    [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
    [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
@@ -3471,6 +3683,7 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
    [GGML_TYPE_F16] = "f16",
    [GGML_TYPE_Q4_0] = "q4_0",
    [GGML_TYPE_Q4_1] = "q4_1",
    [GGML_TYPE_Q4_2] = "q4_2",
    [GGML_TYPE_Q5_0] = "q5_0",
    [GGML_TYPE_Q5_1] = "q5_1",
    [GGML_TYPE_Q8_0] = "q8_0",
@@ -3486,6 +3699,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
    [GGML_TYPE_F16] = false,
    [GGML_TYPE_Q4_0] = true,
    [GGML_TYPE_Q4_1] = true,
    [GGML_TYPE_Q4_2] = true,
    [GGML_TYPE_Q5_0] = true,
    [GGML_TYPE_Q5_1] = true,
    [GGML_TYPE_Q8_0] = true,

ggml.h (2 changes)

@@ -235,7 +235,7 @@ extern "C" {
    GGML_TYPE_F16 = 1,
    GGML_TYPE_Q4_0 = 2,
    GGML_TYPE_Q4_1 = 3,
    // GGML_TYPE_Q4_2 = 4, support has been removed
    GGML_TYPE_Q4_2 = 4, // NormalFloat4
    // GGML_TYPE_Q4_3 (5) support has been removed
    GGML_TYPE_Q5_0 = 6,
    GGML_TYPE_Q5_1 = 7,


@@ -89,7 +89,7 @@ extern "C" {
    LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
    // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
    LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // normal float
    // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
    LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors