Add normalfloat4 as Q4_2

Howard Su 2023-06-01 22:17:53 +08:00
parent ffb06a345e
commit ab0a7d1531
4 changed files with 217 additions and 2 deletions


@@ -9,6 +9,7 @@
static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
    {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
    {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
    {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
    {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
    {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
    {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},

ggml.c (214 changes)

@@ -783,6 +783,13 @@ typedef struct {
} block_q4_1;
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
#define QK4_2 32
typedef struct {
    ggml_fp16_t d;          // delta
    uint8_t qs[QK4_2 / 2];  // nibbles / quants
} block_q4_2;
static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
#define QK5_0 32
typedef struct {
    ggml_fp16_t d; // delta
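A quick storage sanity check on the block_q4_2 layout added above (a standalone sketch, not part of the patch): each block packs QK4_2 = 32 weights into 16 nibble bytes plus one fp16 scale, i.e. 18 bytes per 32 weights, or 4.5 bits per weight.

    #include <stdio.h>

    int main(void) {
        const int    qk          = 32;          // QK4_2
        const size_t block_bytes = 2 + qk / 2;  // sizeof(ggml_fp16_t) + nibble bytes = 18
        printf("q4_2: %.2f bits per weight\n", 8.0 * (double) block_bytes / qk);  // prints 4.50
        return 0;
    }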
@@ -894,10 +901,101 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r
    }
}
static inline unsigned char dQuantizeNF4(float x)
{
    // the values for this tree were generated by test_normal_map_tree
    // in the file tests/test_functional.py
    if(x > 0.03979014977812767f)
        if(x > 0.3893125355243683f) // 1
            if(x > 0.6427869200706482f) // 11
                if(x > 0.8614784181118011f) // 111
                    return 0b1111;
                else
                    return 0b1110;
            else
                if(x > 0.5016634166240692f) // 110
                    return 0b1101;
                else
                    return 0b1100;
        else
            if(x > 0.2035212516784668f) // 10
                if(x > 0.2920137718319893f) // 101
                    return 0b1011;
                else
                    return 0b1010;
            else
                if(x > 0.1202552504837513f) // 100
                    return 0b1001;
                else
                    return 0b1000;
    else
        if(x > -0.33967943489551544f) // 0
            if(x > -0.13791173323988914f) // 01
                if(x > -0.045525018125772476f) // 011
                    return 0b0111;
                else
                    return 0b0110;
            else
                if(x > -0.23460740596055984f) // 010
                    return 0b0101;
                else
                    return 0b0100;
        else
            if(x > -0.6106329262256622f) // 00
                if(x > -0.4599952697753906f) // 001
                    return 0b0011;
                else
                    return 0b0010;
            else
                if(x > -0.8480964004993439f) // 000
                    return 0b0001;
                else
                    return 0b0000;
}
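dQuantizeNF4 encodes a value that has already been scaled into [-1, 1] by walking a binary decision tree whose thresholds are the midpoints between adjacent NF4 codebook entries (the entries are exactly the values dhDequantizeNF4 returns further down). A hypothetical, equivalent formulation as a nearest-entry search over an explicit codebook table, shown only to make that mapping visible (nf4_codebook and quantize_nf4_lookup are illustrative names, not part of the patch):

    #include <math.h>

    // the 16 NF4 codebook values in nibble order 0b0000..0b1111,
    // copied from dhDequantizeNF4() later in this patch
    static const float nf4_codebook[16] = {
        -1.0f,                 -0.6961928009986877f,  -0.5250730514526367f,  -0.39491748809814453f,
        -0.28444138169288635f, -0.18477343022823334f, -0.09105003625154495f,  0.0f,
         0.07958029955625534f,  0.16093020141124725f,  0.24611230194568634f,  0.33791524171829224f,
         0.44070982933044434f,  0.5626170039176941f,   0.7229568362236023f,   1.0f,
    };

    // nearest-codebook-entry search; equivalent to the branch tree because
    // the tree's thresholds sit at the midpoints between adjacent entries
    static unsigned char quantize_nf4_lookup(float x) {
        unsigned char best = 0;
        float best_err = fabsf(x - nf4_codebook[0]);
        for (unsigned char i = 1; i < 16; ++i) {
            const float err = fabsf(x - nf4_codebook[i]);
            if (err < best_err) {
                best_err = err;
                best = i;
            }
        }
        return best;
    }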
static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
    quantize_row_q4_1_reference(x, y, k);
}
static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * restrict y, int k) {
    static const int qk = QK4_2;

    assert(k % qk == 0);

    const int nb = k / qk;

    for (int i = 0; i < nb; i++) {
        float amax = 0.0f; // absolute max

        for (int j = 0; j < qk; j++) {
            const float v = x[i*qk + j];
            if (amax < fabsf(v)) {
                amax = fabsf(v);
            }
        }

        const float id = amax ? 1.0f/amax : 0.0f;

        y[i].d = GGML_FP32_TO_FP16(amax);

        for (int j = 0; j < qk/2; ++j) {
            const float x0 = x[i*qk + 0    + j]*id;
            const float x1 = x[i*qk + qk/2 + j]*id;

            const uint8_t xi0 = dQuantizeNF4(x0);
            const uint8_t xi1 = dQuantizeNF4(x1);

            y[i].qs[j]  = xi0;
            y[i].qs[j] |= xi1 << 4;
        }
    }
}

static void quantize_row_q4_2(const float * restrict x, void * restrict y, int k) {
    quantize_row_q4_2_reference(x, y, k);
}
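Since the block stores amax as its scale and the nibbles only index the fixed codebook, a round trip through quantize and dequantize is an easy way to eyeball the error this introduces. A minimal sketch, assuming it is pasted into ggml.c below both q4_2 row helpers so the file-local block_q4_2 type and functions are visible; the helper name is made up:

    static float q4_2_roundtrip_max_err(const float * src, int k) {
        // k must be a multiple of QK4_2 and small enough for the stack buffers below
        block_q4_2 q[64];
        float out[64 * QK4_2];

        quantize_row_q4_2_reference(src, q, k);
        dequantize_row_q4_2(q, out, k);

        float max_err = 0.0f;
        for (int i = 0; i < k; i++) {
            const float e = fabsf(src[i] - out[i]);
            if (e > max_err) {
                max_err = e;
            }
        }
        return max_err;
    }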
static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
    static const int qk = QK5_0;
@@ -1439,6 +1537,80 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict
    }
}
static inline float dhDequantizeNF4(unsigned char val)
{
    // the values for this tree were generated by test_normal_map_tree
    // in the file tests/test_functional.py
    if((val & 0b1000) == 8)
        if((val & 0b0100) == 4) // 1
            if((val & 0b0010) == 2) // 11
                if((val & 0b0001) == 1) // 111
                    return 1.0f;
                else
                    return 0.7229568362236023f;
            else
                if((val & 0b0001) == 1) // 110
                    return 0.5626170039176941f;
                else
                    return 0.44070982933044434f;
        else
            if((val & 0b0010) == 2) // 10
                if((val & 0b0001) == 1) // 101
                    return 0.33791524171829224f;
                else
                    return 0.24611230194568634f;
            else
                if((val & 0b0001) == 1) // 100
                    return 0.16093020141124725f;
                else
                    return 0.07958029955625534f;
    else
        if((val & 0b0100) == 4) // 0
            if((val & 0b0010) == 2) // 01
                if((val & 0b0001) == 1) // 011
                    return 0.0f;
                else
                    return -0.09105003625154495f;
            else
                if((val & 0b0001) == 1) // 010
                    return -0.18477343022823334f;
                else
                    return -0.28444138169288635f;
        else
            if((val & 0b0010) == 2) // 00
                if((val & 0b0001) == 1) // 001
                    return -0.39491748809814453f;
                else
                    return -0.5250730514526367f;
            else
                if((val & 0b0001) == 1) // 000
                    return -0.6961928009986877f;
                else
                    return -1.0f;
}
static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict y, int k) {
    static const int qk = QK4_2;

    assert(k % qk == 0);

    const int nb = k / qk;

    for (int i = 0; i < nb; i++) {
        const float d = GGML_FP16_TO_FP32(x[i].d);

        for (int j = 0; j < qk/2; ++j) {
            const int x0 = (x[i].qs[j] & 0x0F);
            const int x1 = (x[i].qs[j] >> 4);

            y[i*qk + j + 0   ] = dhDequantizeNF4(x0) * d;
            y[i*qk + j + qk/2] = dhDequantizeNF4(x1) * d;
        }
    }
}
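Because the four-bit code is simply an index into the fixed 16-entry codebook, the branch tree in dhDequantizeNF4 could equally be a direct table lookup. A sketch reusing the hypothetical nf4_codebook[] array from the earlier example:

    static inline float dequantize_nf4_lookup(unsigned char val) {
        return nf4_codebook[val & 0x0F];  // same result as dhDequantizeNF4 for val in 0..15
    }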
static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) {
    static const int qk = QK5_0;
@@ -1512,6 +1684,7 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -1533,6 +1706,14 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
        .vec_dot_q = ggml_vec_dot_q4_1_q8_1,
        .vec_dot_type = GGML_TYPE_Q8_1,
    },
    [GGML_TYPE_Q4_2] = {
        .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_2,
        .quantize_row_q = quantize_row_q4_2,
        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_reference,
        .quantize_row_q_dot = quantize_row_q8_0,
        .vec_dot_q = ggml_vec_dot_q4_2_q8_0,
        .vec_dot_type = GGML_TYPE_Q8_0,
    },
    [GGML_TYPE_Q5_0] = {
        .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_0,
        .quantize_row_q = quantize_row_q5_0,
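Registering Q4_2 in quantize_fns with vec_dot_type = GGML_TYPE_Q8_0 means the mat-mul path quantizes activation rows with quantize_row_q8_0 and then calls ggml_vec_dot_q4_2_q8_0 on the pair, the same pattern Q4_0 and Q5_0 use. Roughly how calling code drives the table (illustrative only; activations, q8_buf, q4_2_row and n are placeholder variables):

    const quantize_fns_t * fns = &quantize_fns[GGML_TYPE_Q4_2];

    float result;
    fns->quantize_row_q_dot(activations, q8_buf, n);  // f32 activations -> block_q8_0
    fns->vec_dot_q(n, &result, q4_2_row, q8_buf);     // q4_2 row dotted with q8_0 row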
@@ -2564,6 +2745,35 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
#endif
}
static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nb % 2 == 0);

    const block_q4_2 * restrict x = vx;
    const block_q8_0 * restrict y = vy;

    // scalar
    float sumf = 0.0;

    for (int i = 0; i < nb; i++) {
        // NF4 codebook values are fractional, so accumulate the block sum in float
        float sumi = 0.0f;

        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[i].qs[j] & 0x0F);
            const int v1 = (x[i].qs[j] >> 4);

            sumi += (dhDequantizeNF4(v0) * y[i].qs[j]) + (dhDequantizeNF4(v1) * y[i].qs[j + qk/2]);
        }

        sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
    }

    *s = sumf;
}
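Per block, this kernel computes d_x * d_y * sum_j nf4(x_j) * y_j: each nibble is decoded to its codebook value, multiplied by the corresponding signed 8-bit activation, and the block sum is scaled by both fp16 deltas. The same accumulation written against the hypothetical lookup table, as a compact reference (q4_2_q8_0_block_dot is an invented name; assumes it sits in ggml.c next to the block types):

    static float q4_2_q8_0_block_dot(const block_q4_2 * x, const block_q8_0 * y) {
        float sum = 0.0f;
        for (int j = 0; j < QK8_0/2; ++j) {
            sum += nf4_codebook[x->qs[j] & 0x0F] * y->qs[j]            // low nibbles pair with the first half
                 + nf4_codebook[x->qs[j] >>   4] * y->qs[j + QK8_0/2]; // high nibbles with the second half
        }
        return sum * GGML_FP16_TO_FP32(x->d) * GGML_FP16_TO_FP32(y->d);
    }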
static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
    const int qk = QK8_0;
    const int nb = n / qk;
@@ -3440,6 +3650,7 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
    [GGML_TYPE_F16] = 1,
    [GGML_TYPE_Q4_0] = QK4_0,
    [GGML_TYPE_Q4_1] = QK4_1,
    [GGML_TYPE_Q4_2] = QK4_2,
    [GGML_TYPE_Q5_0] = QK5_0,
    [GGML_TYPE_Q5_1] = QK5_1,
    [GGML_TYPE_Q8_0] = QK8_0,
@@ -3455,6 +3666,7 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
    [GGML_TYPE_F16] = sizeof(ggml_fp16_t),
    [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
    [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
    [GGML_TYPE_Q4_2] = sizeof(block_q4_2),
    [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
    [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
    [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
@@ -3471,6 +3683,7 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
    [GGML_TYPE_F16] = "f16",
    [GGML_TYPE_Q4_0] = "q4_0",
    [GGML_TYPE_Q4_1] = "q4_1",
    [GGML_TYPE_Q4_2] = "q4_2",
    [GGML_TYPE_Q5_0] = "q5_0",
    [GGML_TYPE_Q5_1] = "q5_1",
    [GGML_TYPE_Q8_0] = "q8_0",
@@ -3486,6 +3699,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
    [GGML_TYPE_F16] = false,
    [GGML_TYPE_Q4_0] = true,
    [GGML_TYPE_Q4_1] = true,
    [GGML_TYPE_Q4_2] = true,
    [GGML_TYPE_Q5_0] = true,
    [GGML_TYPE_Q5_1] = true,
    [GGML_TYPE_Q8_0] = true,

ggml.h (2 changes)

@@ -235,7 +235,7 @@ extern "C" {
    GGML_TYPE_F16 = 1,
    GGML_TYPE_Q4_0 = 2,
    GGML_TYPE_Q4_1 = 3,
    // GGML_TYPE_Q4_2 = 4, support has been removed
    GGML_TYPE_Q4_2 = 4, // NormalFloat4
    // GGML_TYPE_Q4_3 (5) support has been removed
    GGML_TYPE_Q5_0 = 6,
    GGML_TYPE_Q5_1 = 7,


@@ -89,7 +89,7 @@ extern "C" {
    LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
    // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
    LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // normal float
    // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
    LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
    LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors