q4_0c continuous row layout

Introduce alternative quantized formats q4_0c and q8_0c, corresponding
exactly to q4_0 and q8_0, except that quantized values and scales are
laid out continuously in memory (all of a row's quantized values first,
then all of its scales), and the nibbles in q4_0c are rearranged
relative to q4_0.

This should simplify SIMD implementations, at the expense of slightly
more complex scalar implementations.
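
For illustration only (this snippet is not part of the commit), the sketch below mirrors the row layout and block pairing implemented in ggml.c: a q4_0c row stores all packed nibbles first, followed by one float scale per q4_0 block, and pairs of blocks share a byte group via their low and high nibbles. The constants and the src0/src1 indexing are taken from the diff; the row length k = 256 is just an example value.

// layout_sketch.c -- standalone illustration, not part of this commit
#include <stdio.h>

#define QK4_0  32
#define QK4_0C (4*32)

int main(void) {
    const int k   = 256;          // example row length, a multiple of QK4_0C
    const int nb  = k / QK4_0;    // number of q4_0-sized blocks
    const int nsb = k / QK4_0C;   // number of q4_0c super-blocks

    // Section offsets within one q4_0c row: packed nibbles first, then scales.
    printf("qs: bytes [0, %d)\n", QK4_0C/2 * nsb);                      // k/2 bytes of nibbles
    printf("ds: %d floats starting at byte offset %d\n", nb, QK4_0C/2 * nsb);

    // Which source q4_0 block lands in the low/high nibbles of each 32-byte
    // group, matching the indexing in quantize_row_q4_0c_reference below.
    for (int i = 0; i < nb/2; i++) {
        const int src0 = i + i/2*2;      // low nibbles : 0, 1, 4, 5, 8, 9, ...
        const int src1 = i + i/2*2 + 2;  // high nibbles: 2, 3, 6, 7, 10, 11, ...
        printf("qs[%3d..%3d]: low <- block %2d, high <- block %2d\n",
               i*QK4_0, i*QK4_0 + QK4_0 - 1, src0, src1);
    }
    return 0;
}

q8_0c is analogous but simpler: k int8 quants followed by k/QK8_0C float scales, with no interleaving.
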
Håkon H. Hitland 2023-04-17 23:36:29 +02:00
parent 221946777c
commit a1e6fb9281
2 changed files with 207 additions and 4 deletions

ggml.c (209 changed lines)

@@ -772,6 +772,14 @@ typedef struct {
} block_q8_1;
static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
#define QK4_0C (4*32)
#define QK4_0C_MUL (QK4_0C / QK4_0)
// TODO: nicer description - pseudostruct?
// q4_0c : (uint8_t[QK4_0C/2]) qs[nb] || float d[n]
#define QK8_0C 32
// q8_0c : uint8_t qs[n] || float d[n]
// reference implementation for deterministic creation of model files
static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
    assert(k % QK4_0 == 0);
@@ -1117,6 +1125,57 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
#endif
}
static void quantize_row_q4_0c_reference(const float * restrict x, uint8_t * restrict y, int k) {
    assert(k % QK4_0C == 0);
    const int nb = k / QK4_0;
    const int nsb = k / QK4_0C;

    // Split y into nibbles section and scales section
    uint8_t * restrict qs = y;
    float * restrict ds = (float *) (y + QK4_0C/2 * nsb);

    for (int i = 0; i < nb/2; i++) {
        // Interleave two output blocks in low and high nibbles
        const int src0 = i + i/2*2;     // 0, 1, 4, 5, 8, 9, ...
        const int src1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11, ...

        const float * xb[2] = {
            x + QK4_0 * src0, // block in low nibbles
            x + QK4_0 * src1, // block in high nibbles
        };

        // Find multiplier for each block
        float d[2];
        float id[2];
        for (int j = 0; j < 2; j++) {
            float amax = 0.0f; // absolute max
            for (int l = 0; l < QK4_0; l++) {
                const float v = xb[j][l];
                amax = MAX(amax, fabsf(v));
            }
            d[j] = amax / ((1 << 3) - 1);
            id[j] = d[j] ? 1.0f/d[j] : 0.0f;
        }

        ds[src0] = d[0];
        ds[src1] = d[1];

        for (int l = 0; l < QK4_0; l++) {
            const float v0 = xb[0][l]*id[0];
            const uint8_t vi0 = (int8_t)roundf(v0) + 8;

            const float v1 = xb[1][l]*id[1];
            const uint8_t vi1 = (int8_t)roundf(v1) + 8;

            assert(vi0 < 16);
            assert(vi1 < 16);

            qs[i*QK4_0 + l] = vi0 | (vi1 << 4);
        }
    }
}
static void quantize_row_q4_1_reference(const float * restrict x, void * restrict vy, int k) {
    assert(k % QK4_1 == 0);
    const int nb = k / QK4_1;
@@ -1658,6 +1717,40 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
#endif
}
// reference implementation for deterministic creation of model files
static void quantize_row_q8_0c_reference(const float * restrict x, void * restrict y, int k) {
    assert(k % QK8_0 == 0);
    const int nb = k / QK8_0;

    uint8_t * restrict qs = y;
    float * restrict ds = (float *) ((uint8_t *) y + QK8_0C * nb);

    for (int i = 0; i < nb; i++) {
        float amax = 0.0f; // absolute max

        for (int l = 0; l < QK8_0; l++) {
            const float v = x[i*QK8_0 + l];
            amax = MAX(amax, fabsf(v));
        }

        const float d = amax / ((1 << 7) - 1);
        const float id = d ? 1.0f/d : 0.0f;

        ds[i] = d;

        for (int l = 0; l < QK8_0; ++l) {
            const float v = x[i*QK8_0 + l]*id;
            qs[i*QK8_0 + l] = roundf(v);
        }
    }
}

static void quantize_row_q8_0c(const float * restrict x, void * restrict vy, int k) {
    assert(k % QK8_0 == 0);

    quantize_row_q8_0c_reference(x, vy, k);
}
static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) {
    assert(k % QK4_0 == 0);
    const int nb = k / QK4_0;
@@ -1776,6 +1869,41 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in
#endif
}
static void dequantize_row_q4_0c(const void * restrict vx, float * restrict y, int k) {
    assert(k % QK4_0C == 0);
    const int nb = k / QK4_0;
    const int nsb = k / QK4_0C;

    // Split vx into nibbles section and scales section
    const uint8_t * restrict qs = vx;
    const float * restrict ds = (const float *) ((const uint8_t *) vx + QK4_0C/2 * nsb);

    // scalar
    for (int i = 0; i < nb/2; i++) {
        const int dst0 = i + i/2*2;     // 0, 1, 4, 5, 8, 9, ...
        const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11, ...

        const float d0 = ds[dst0];
        const float d1 = ds[dst1];

        for (int l = 0; l < QK4_0; l++) {
            const uint8_t vi = qs[i * QK4_0 + l];

            const int8_t vi0 = vi & 0xf;
            const int8_t vi1 = vi >> 4;

            const float v0 = (vi0 - 8)*d0;
            const float v1 = (vi1 - 8)*d1;

            y[dst0*QK4_0 + l] = v0;
            y[dst1*QK4_0 + l] = v1;

            assert(!isnan(y[dst0*QK4_0 + l]));
            assert(!isnan(y[dst1*QK4_0 + l]));
        }
    }
}
static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, int k) {
    assert(k % QK4_1 == 0);
    const int nb = k / QK4_1;
@@ -2002,6 +2130,7 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
}
static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -2017,6 +2146,14 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
        .vec_dot_q = ggml_vec_dot_q4_0_q8_0,
        .vec_dot_type = GGML_TYPE_Q8_0,
    },
    [GGML_TYPE_Q4_0C] = {
        .dequantize_row_q = dequantize_row_q4_0c,
        //.quantize_row_q = quantize_row_q4_0c,
        .quantize_row_q = (quantize_row_q_t) quantize_row_q4_0c_reference,
        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0c_reference,
        .quantize_row_q_dot = quantize_row_q8_0c,
        .vec_dot_q = ggml_vec_dot_q4_0c_q8_0c,
    },
    [GGML_TYPE_Q4_1] = {
        .dequantize_row_q = dequantize_row_q4_1,
        .quantize_row_q = quantize_row_q4_1,
@@ -2065,6 +2202,13 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
        .vec_dot_q = NULL, // TODO
        .vec_dot_type = GGML_TYPE_Q8_1,
    },
    [GGML_TYPE_Q8_0C] = {
        .dequantize_row_q = NULL,
        .quantize_row_q = quantize_row_q8_0c,
        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0c_reference,
        .quantize_row_q_dot = quantize_row_q8_0c,
        .vec_dot_q = NULL,
    },
};
// For internal test use
@@ -2835,6 +2979,51 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
#endif
}
static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
    const int nb = n / QK4_0;
    const int nsb = n / QK4_0C;

    assert(n % QK4_0C == 0);

    // Split into nibbles and scales sections
    const uint8_t * restrict xqs = vx;
    const float * restrict xds = (const float *) ((const uint8_t *) vx + nsb*QK4_0C/2);
    const int8_t * restrict yqs = vy;
    const float * restrict yds = (const float *) ((const uint8_t *) vy + nb*QK8_0C);

    float sumf = 0.0;

    // scalar
    for (int i = 0; i < nb/2; i++) {
        const int dst0 = i + i/2*2;     // 0, 1, 4, 5, 8, 9, ...
        const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11, ...

        const float dx0 = xds[dst0];
        const float dx1 = xds[dst1];
        const float dy0 = yds[dst0];
        const float dy1 = yds[dst1];

        int sumi0 = 0;
        int sumi1 = 0;

        for (int l = 0; l < QK4_0; l++) {
            const uint8_t v0 = xqs[i*QK4_0 + l];

            const int i0 = (int8_t) (v0 & 0xf) - 8;
            const int i1 = (int8_t) (v0 >> 4) - 8;

            const int i2 = yqs[dst0*QK4_0 + l];
            const int i3 = yqs[dst1*QK4_0 + l];

            sumi0 += i0*i2;
            sumi1 += i1*i3;
        }

        sumf += dx0*dy0*sumi0 + dx1*dy1*sumi1;
    }

    *s = sumf;
}
static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
    const int nb = n / QK8_1;
@@ -3885,66 +4074,74 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = 1,
[GGML_TYPE_F16] = 1,
[GGML_TYPE_Q4_0] = QK4_0,
[GGML_TYPE_Q4_0C] = QK4_0C,
[GGML_TYPE_Q4_1] = QK4_1,
[GGML_TYPE_Q4_2] = QK4_2,
[GGML_TYPE_Q5_0] = QK5_0,
[GGML_TYPE_Q5_1] = QK5_1,
[GGML_TYPE_Q8_0] = QK8_0,
[GGML_TYPE_Q8_0C] = QK8_0C,
[GGML_TYPE_Q8_1] = QK8_1,
[GGML_TYPE_I8] = 1,
[GGML_TYPE_I16] = 1,
[GGML_TYPE_I32] = 1,
};
static_assert(GGML_TYPE_COUNT == 13, "GGML_BLCK_SIZE is outdated");
static_assert(GGML_TYPE_COUNT == 15, "GGML_BLCK_SIZE is outdated");
static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = sizeof(float),
[GGML_TYPE_F16] = sizeof(ggml_fp16_t),
[GGML_TYPE_Q4_0] = sizeof(block_q4_0),
[GGML_TYPE_Q4_0C] = 4*sizeof(block_q4_0),
[GGML_TYPE_Q4_1] = sizeof(block_q4_1),
[GGML_TYPE_Q4_2] = sizeof(block_q4_2),
[GGML_TYPE_Q5_0] = sizeof(block_q5_0),
[GGML_TYPE_Q5_1] = sizeof(block_q5_1),
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
[GGML_TYPE_Q8_0C] = sizeof(block_q8_0),
[GGML_TYPE_Q8_1] = sizeof(block_q8_1),
[GGML_TYPE_I8] = sizeof(int8_t),
[GGML_TYPE_I16] = sizeof(int16_t),
[GGML_TYPE_I32] = sizeof(int32_t),
};
static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_SIZE is outdated");
static_assert(GGML_TYPE_COUNT == 15, "GGML_TYPE_SIZE is outdated");
static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = "f32",
[GGML_TYPE_F16] = "f16",
[GGML_TYPE_Q4_0] = "q4_0",
[GGML_TYPE_Q4_0C] = "q4_0c",
[GGML_TYPE_Q4_1] = "q4_1",
[GGML_TYPE_Q4_2] = "q4_2",
[GGML_TYPE_Q5_0] = "q5_0",
[GGML_TYPE_Q5_1] = "q5_1",
[GGML_TYPE_Q8_0] = "q8_0",
[GGML_TYPE_Q8_0C] = "q8_0c",
[GGML_TYPE_Q8_1] = "q8_1",
[GGML_TYPE_I8] = "i8",
[GGML_TYPE_I16] = "i16",
[GGML_TYPE_I32] = "i32",
};
static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_NAME is outdated");
static_assert(GGML_TYPE_COUNT == 15, "GGML_TYPE_NAME is outdated");
static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = false,
[GGML_TYPE_F16] = false,
[GGML_TYPE_Q4_0] = true,
[GGML_TYPE_Q4_0C] = true,
[GGML_TYPE_Q4_1] = true,
[GGML_TYPE_Q4_2] = true,
[GGML_TYPE_Q5_0] = true,
[GGML_TYPE_Q5_1] = true,
[GGML_TYPE_Q8_0] = true,
[GGML_TYPE_Q8_0C] = true,
[GGML_TYPE_Q8_1] = true,
[GGML_TYPE_I8] = false,
[GGML_TYPE_I16] = false,
[GGML_TYPE_I32] = false,
};
static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
static_assert(GGML_TYPE_COUNT == 15, "GGML_IS_QUANTIZED is outdated");
static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
"NONE",
@@ -8763,11 +8960,13 @@ static void ggml_compute_forward_mul_mat(
        struct ggml_tensor * dst) {
    switch (src0->type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_0C:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q4_2:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q8_0C:
        case GGML_TYPE_Q8_1:
            {
                ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
@@ -8994,11 +9193,13 @@ static void ggml_compute_forward_get_rows(
        struct ggml_tensor * dst) {
    switch (src0->type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_0C:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q4_2:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q8_0C:
        case GGML_TYPE_Q8_1:
            {
                ggml_compute_forward_get_rows_q(params, src0, src1, dst);

ggml.h (2 changed lines)

@@ -237,6 +237,8 @@ extern "C" {
GGML_TYPE_Q5_1 = 7,
GGML_TYPE_Q8_0 = 8,
GGML_TYPE_Q8_1 = 9,
GGML_TYPE_Q4_0C = 10,
GGML_TYPE_Q8_0C = 11,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,