Adding Q3_K and Q8_K (de)-quantization

This commit is contained in:
Iwan Kawrakow 2023-05-27 20:26:36 +03:00
parent 8673a41385
commit b4f71347ff
4 changed files with 335 additions and 10 deletions

30
ggml.c
View file

@ -2,6 +2,7 @@
#define _GNU_SOURCE
#include "ggml.h"
#include "k_quants.h"
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
@ -1565,6 +1566,14 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
.vec_dot_q = NULL, // TODO
.vec_dot_type = GGML_TYPE_Q8_1,
},
[GGML_TYPE_Q3_K] = {
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K,
.quantize_row_q = quantize_row_q3_K,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference,
.quantize_row_q_dot = NULL, //quantize_row_q8_K,
.vec_dot_q = NULL, //ggml_vec_dot_q3_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
};
// For internal test use
@ -3444,11 +3453,13 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
[GGML_TYPE_Q5_1] = QK5_1,
[GGML_TYPE_Q8_0] = QK8_0,
[GGML_TYPE_Q8_1] = QK8_1,
[GGML_TYPE_Q3_K] = QK_K,
[GGML_TYPE_Q8_K] = QK_K,
[GGML_TYPE_I8] = 1,
[GGML_TYPE_I16] = 1,
[GGML_TYPE_I32] = 1,
};
static_assert(GGML_TYPE_COUNT == 13, "GGML_BLCK_SIZE is outdated");
static_assert(GGML_TYPE_COUNT == 15, "GGML_BLCK_SIZE is outdated");
static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = sizeof(float),
@ -3459,11 +3470,13 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
[GGML_TYPE_Q5_1] = sizeof(block_q5_1),
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
[GGML_TYPE_Q8_1] = sizeof(block_q8_1),
[GGML_TYPE_Q3_K] = sizeof(block_q3_K),
[GGML_TYPE_Q8_K] = sizeof(block_q8_K),
[GGML_TYPE_I8] = sizeof(int8_t),
[GGML_TYPE_I16] = sizeof(int16_t),
[GGML_TYPE_I32] = sizeof(int32_t),
};
static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_SIZE is outdated");
static_assert(GGML_TYPE_COUNT == 15, "GGML_TYPE_SIZE is outdated");
static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
@ -3475,11 +3488,13 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
[GGML_TYPE_Q5_1] = "q5_1",
[GGML_TYPE_Q8_0] = "q8_0",
[GGML_TYPE_Q8_1] = "q8_1",
[GGML_TYPE_Q3_K] = "q3_K",
[GGML_TYPE_Q8_K] = "q8_K",
[GGML_TYPE_I8] = "i8",
[GGML_TYPE_I16] = "i16",
[GGML_TYPE_I32] = "i32",
};
static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_NAME is outdated");
static_assert(GGML_TYPE_COUNT == 15, "GGML_TYPE_NAME is outdated");
static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = false,
@ -3490,11 +3505,13 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
[GGML_TYPE_Q5_1] = true,
[GGML_TYPE_Q8_0] = true,
[GGML_TYPE_Q8_1] = true,
[GGML_TYPE_Q3_K] = true,
[GGML_TYPE_Q8_K] = true,
[GGML_TYPE_I8] = false,
[GGML_TYPE_I16] = false,
[GGML_TYPE_I32] = false,
};
static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
static_assert(GGML_TYPE_COUNT == 15, "GGML_IS_QUANTIZED is outdated");
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"NONE",
@ -3801,6 +3818,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
}
@ -10996,6 +11014,8 @@ static void ggml_compute_forward_alibi(
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q8_K:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
@ -11067,6 +11087,8 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q8_K:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:

7
ggml.h
View file

@ -241,6 +241,12 @@ extern "C" {
GGML_TYPE_Q5_1 = 7,
GGML_TYPE_Q8_0 = 8,
GGML_TYPE_Q8_1 = 9,
// k-quantizations
GGML_TYPE_Q3_K = 10,
//GGML_TYPE_Q4_K = 11,
//GGML_TYPE_Q5_K = 12,
//GGML_TYPE_Q6_K = 13,
GGML_TYPE_Q8_K = 11,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
@ -264,6 +270,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
GGML_FTYPE_MOSTLY_Q3_K = 10, // except 1d tensors
};
// available tensor operations:

View file

@ -1,13 +1,20 @@
#include "k_quants.h"
#include "ggml.h"
#include <math.h>
#include <string.h>
#include <assert.h>
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
//
// 3-6 bit quantization in super-blocks
//
//
// ===================== Helper functions
//
@ -124,8 +131,214 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
return scale;
}
static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
float max = 0;
float amax = 0;
for (int i = 0; i < n; ++i) {
float ax = fabsf(x[i]);
if (ax > amax) { amax = ax; max = x[i]; }
}
if (!amax) { // all zero
for (int i = 0; i < n; ++i) { L[i] = 0; }
return 0.f;
}
float iscale = -nmax / max;
if (do_rmse) {
float sumlx = 0;
float suml2 = 0;
for (int i = 0; i < n; ++i) {
int l = nearest_int(iscale * x[i]);
l = MAX(-nmax, MIN(nmax-1, l));
L[i] = l;
float w = x[i]*x[i];
sumlx += w*x[i]*l;
suml2 += w*l*l;
}
for (int itry = 0; itry < 5; ++itry) {
int n_changed = 0;
for (int i = 0; i < n; ++i) {
float w = x[i]*x[i];
float slx = sumlx - w*x[i]*L[i];
if (slx > 0) {
float sl2 = suml2 - w*L[i]*L[i];
int new_l = nearest_int(x[i] * sl2 / slx);
new_l = MAX(-nmax, MIN(nmax-1, new_l));
if (new_l != L[i]) {
slx += w*x[i]*new_l;
sl2 += w*new_l*new_l;
if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) {
L[i] = new_l; sumlx = slx; suml2 = sl2;
++n_changed;
}
}
}
}
if (!n_changed) {
break;
}
}
for (int i = 0; i < n; ++i) {
L[i] += nmax;
}
return sumlx / suml2;
}
for (int i = 0; i < n; ++i) {
int l = nearest_int(iscale * x[i]);
l = MAX(-nmax, MIN(nmax-1, l));
L[i] = l + nmax;
}
return 1/iscale;
}
static void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
//========================= 3-bit (de)-quantization
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
int8_t L[QK_K];
float scales[QK_K / 16];
for (int i = 0; i < nb; i++) {
float max_scale = 0;
float amax = 0;
for (int j = 0; j < QK_K/16; ++j) {
scales[j] = make_q3_quants(16, 4, x + 16*j, L + 16*j, true);
float scale = fabsf(scales[j]);
if (scale > amax) {
amax = scale; max_scale = scales[j];
}
}
memset(y[i].scales, 0, 12);
if (max_scale) {
float iscale = -32.f/max_scale;
for (int j = 0; j < QK_K/16; ++j) {
int8_t l = nearest_int(iscale*scales[j]);
l = MAX(-32, MIN(31, l)) + 32;
if (j < 8) {
y[i].scales[j] = l & 0xF;
} else {
y[i].scales[j-8] |= ((l & 0xF) << 4);
}
l >>= 4;
y[i].scales[j%4 + 8] |= (l << (2*(j/4)));
}
y[i].d = ggml_fp32_to_fp16(1/iscale);
} else {
y[i].d = ggml_fp32_to_fp16(0.f);
}
int8_t sc;
for (int j = 0; j < QK_K/16; ++j) {
sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
float d = ggml_fp16_to_fp32(y[i].d) * sc;
if (!d) {
continue;
}
for (int ii = 0; ii < 16; ++ii) {
int l = nearest_int(x[16*j + ii]/d);
l = MAX(-4, MIN(3, l));
L[16*j + ii] = l + 4;
}
}
memset(y[i].hmask, 0, QK_K/8);
// We put the high-bit for the 1st 32 quants into bit 0, the next 32 into bit 1, etc.
int m = 0;
uint8_t hm = 1;
for (int j = 0; j < QK_K; ++j) {
if (L[j] > 3) {
y[i].hmask[m] |= hm;
L[j] -= 4;
}
if (++m == QK_K/8) {
m = 0; hm <<= 1;
}
}
for (int j = 0; j < QK_K; j += 128) {
for (int l = 0; l < 32; ++l) {
y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
}
}
x += QK_K;
}
}
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
assert(k % QK_K == 0);
assert(QK_K == 256);
const int nb = k / QK_K;
const uint32_t kmask1 = 0x03030303;
const uint32_t kmask2 = 0x0f0f0f0f;
uint32_t aux[4];
const int8_t * scales = (const int8_t*)aux;
for (int i = 0; i < nb; i++) {
const float d_all = ggml_fp16_to_fp32(x[i].d);
const uint8_t * restrict q = x[i].qs;
const uint8_t * restrict hm = x[i].hmask;
uint8_t m = 1;
memcpy(aux, x[i].scales, 12);
uint32_t tmp = aux[2];
aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
int is = 0;
float dl;
for (int n = 0; n < QK_K; n += 128) {
int shift = 0;
for (int j = 0; j < 4; ++j) {
dl = d_all * (scales[is++] - 32);
for (int l = 0; l < 16; ++l) {
*y++ = dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4));
}
dl = d_all * (scales[is++] - 32);
for (int l = 0; l < 16; ++l) {
*y++ = dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4));
}
shift += 2;
m <<= 1;
}
q += 32;
}
}
}
void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
quantize_row_q3_K_reference(x, vy, k);
}
size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
const int nb = k / QK_K;
// TODO - collect histograms - although, at a second thought, I don't really care about them
(void)hist;
for (int j = 0; j < nb; j += k) {
block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
quantize_row_q3_K_reference(src + j, y, k);
}
return (n/QK_K*sizeof(block_q3_K));
}
// ====================== 6-bit (de)-quantization
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
@ -189,3 +402,57 @@ static void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * r
}
}
//===================================== Q8_K ==============================================
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
for (int i = 0; i < nb; i++) {
float max = 0;
float amax = 0;
for (int j = 0; j < QK_K; ++j) {
float ax = fabsf(x[j]);
if (ax > amax) {
amax = ax; max = x[j];
}
}
if (!amax) {
y[i].d = 0;
memset(y[i].qs, 0, QK_K);
x += QK_K;
continue;
}
const float iscale = -128.f/max;
for (int j = 0; j < QK_K; ++j) {
int v = nearest_int(iscale*x[j]);
y[i].qs[j] = MIN(127, v);
}
for (int j = 0; j < QK_K/16; ++j) {
int sum = 0;
for (int ii = 0; ii < 16; ++ii) {
sum += y[i].qs[j*16 + ii];
}
y[i].bsums[j] = sum;
}
y[i].d = 1/iscale;
x += QK_K;
}
}
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) {
assert(k % QK_K == 0);
const int nb = k / QK_K;
for (int i = 0; i < nb; i++) {
for (int j = 0; j < QK_K; ++j) {
*y++ = x[i].d * x[i].qs[j];
}
}
}
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) {
quantize_row_q8_K_reference(x, y, k);
}

View file

@ -2,15 +2,17 @@
#include "ggml.h"
#include <stdint.h>
#include <assert.h>
//
// 3-6 bit quantization in super-blocks
//
#include <stddef.h>
// Super-block size
#define QK_K 256
//
// Super-block quantization structures
//
// 3-bit quantization
// weight is represented as x = a * q
// 16 blocks of 16 elemenets each
@ -70,7 +72,34 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
// Quantization
static void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
// Dequantization
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
// Dot product
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
// Quantization with histogram collection
size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);