Adding Q3_K and Q8_K (de)-quantization
This commit is contained in:
parent
8673a41385
commit
b4f71347ff
4 changed files with 335 additions and 10 deletions
30
ggml.c
30
ggml.c
|
@ -2,6 +2,7 @@
|
||||||
#define _GNU_SOURCE
|
#define _GNU_SOURCE
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "k_quants.h"
|
||||||
|
|
||||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||||
#include <malloc.h> // using malloc.h with MSC/MINGW
|
#include <malloc.h> // using malloc.h with MSC/MINGW
|
||||||
|
@ -1565,6 +1566,14 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
|
||||||
.vec_dot_q = NULL, // TODO
|
.vec_dot_q = NULL, // TODO
|
||||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||||
},
|
},
|
||||||
|
[GGML_TYPE_Q3_K] = {
|
||||||
|
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K,
|
||||||
|
.quantize_row_q = quantize_row_q3_K,
|
||||||
|
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference,
|
||||||
|
.quantize_row_q_dot = NULL, //quantize_row_q8_K,
|
||||||
|
.vec_dot_q = NULL, //ggml_vec_dot_q3_K_q8_K,
|
||||||
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
// For internal test use
|
// For internal test use
|
||||||
|
@ -3444,11 +3453,13 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_Q5_1] = QK5_1,
|
[GGML_TYPE_Q5_1] = QK5_1,
|
||||||
[GGML_TYPE_Q8_0] = QK8_0,
|
[GGML_TYPE_Q8_0] = QK8_0,
|
||||||
[GGML_TYPE_Q8_1] = QK8_1,
|
[GGML_TYPE_Q8_1] = QK8_1,
|
||||||
|
[GGML_TYPE_Q3_K] = QK_K,
|
||||||
|
[GGML_TYPE_Q8_K] = QK_K,
|
||||||
[GGML_TYPE_I8] = 1,
|
[GGML_TYPE_I8] = 1,
|
||||||
[GGML_TYPE_I16] = 1,
|
[GGML_TYPE_I16] = 1,
|
||||||
[GGML_TYPE_I32] = 1,
|
[GGML_TYPE_I32] = 1,
|
||||||
};
|
};
|
||||||
static_assert(GGML_TYPE_COUNT == 13, "GGML_BLCK_SIZE is outdated");
|
static_assert(GGML_TYPE_COUNT == 15, "GGML_BLCK_SIZE is outdated");
|
||||||
|
|
||||||
static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
|
static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_F32] = sizeof(float),
|
[GGML_TYPE_F32] = sizeof(float),
|
||||||
|
@ -3459,11 +3470,13 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_Q5_1] = sizeof(block_q5_1),
|
[GGML_TYPE_Q5_1] = sizeof(block_q5_1),
|
||||||
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
|
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
|
||||||
[GGML_TYPE_Q8_1] = sizeof(block_q8_1),
|
[GGML_TYPE_Q8_1] = sizeof(block_q8_1),
|
||||||
|
[GGML_TYPE_Q3_K] = sizeof(block_q3_K),
|
||||||
|
[GGML_TYPE_Q8_K] = sizeof(block_q8_K),
|
||||||
[GGML_TYPE_I8] = sizeof(int8_t),
|
[GGML_TYPE_I8] = sizeof(int8_t),
|
||||||
[GGML_TYPE_I16] = sizeof(int16_t),
|
[GGML_TYPE_I16] = sizeof(int16_t),
|
||||||
[GGML_TYPE_I32] = sizeof(int32_t),
|
[GGML_TYPE_I32] = sizeof(int32_t),
|
||||||
};
|
};
|
||||||
static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_SIZE is outdated");
|
static_assert(GGML_TYPE_COUNT == 15, "GGML_TYPE_SIZE is outdated");
|
||||||
|
|
||||||
|
|
||||||
static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
|
static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
|
||||||
|
@ -3475,11 +3488,13 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_Q5_1] = "q5_1",
|
[GGML_TYPE_Q5_1] = "q5_1",
|
||||||
[GGML_TYPE_Q8_0] = "q8_0",
|
[GGML_TYPE_Q8_0] = "q8_0",
|
||||||
[GGML_TYPE_Q8_1] = "q8_1",
|
[GGML_TYPE_Q8_1] = "q8_1",
|
||||||
|
[GGML_TYPE_Q3_K] = "q3_K",
|
||||||
|
[GGML_TYPE_Q8_K] = "q8_K",
|
||||||
[GGML_TYPE_I8] = "i8",
|
[GGML_TYPE_I8] = "i8",
|
||||||
[GGML_TYPE_I16] = "i16",
|
[GGML_TYPE_I16] = "i16",
|
||||||
[GGML_TYPE_I32] = "i32",
|
[GGML_TYPE_I32] = "i32",
|
||||||
};
|
};
|
||||||
static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_NAME is outdated");
|
static_assert(GGML_TYPE_COUNT == 15, "GGML_TYPE_NAME is outdated");
|
||||||
|
|
||||||
static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
|
static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_F32] = false,
|
[GGML_TYPE_F32] = false,
|
||||||
|
@ -3490,11 +3505,13 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
|
||||||
[GGML_TYPE_Q5_1] = true,
|
[GGML_TYPE_Q5_1] = true,
|
||||||
[GGML_TYPE_Q8_0] = true,
|
[GGML_TYPE_Q8_0] = true,
|
||||||
[GGML_TYPE_Q8_1] = true,
|
[GGML_TYPE_Q8_1] = true,
|
||||||
|
[GGML_TYPE_Q3_K] = true,
|
||||||
|
[GGML_TYPE_Q8_K] = true,
|
||||||
[GGML_TYPE_I8] = false,
|
[GGML_TYPE_I8] = false,
|
||||||
[GGML_TYPE_I16] = false,
|
[GGML_TYPE_I16] = false,
|
||||||
[GGML_TYPE_I32] = false,
|
[GGML_TYPE_I32] = false,
|
||||||
};
|
};
|
||||||
static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
|
static_assert(GGML_TYPE_COUNT == 15, "GGML_IS_QUANTIZED is outdated");
|
||||||
|
|
||||||
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||||
"NONE",
|
"NONE",
|
||||||
|
@ -3801,6 +3818,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
||||||
case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
|
case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
|
||||||
case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
|
case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
|
||||||
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
|
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
|
||||||
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
||||||
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
||||||
}
|
}
|
||||||
|
@ -10996,6 +11014,8 @@ static void ggml_compute_forward_alibi(
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
|
case GGML_TYPE_Q3_K:
|
||||||
|
case GGML_TYPE_Q8_K:
|
||||||
case GGML_TYPE_I8:
|
case GGML_TYPE_I8:
|
||||||
case GGML_TYPE_I16:
|
case GGML_TYPE_I16:
|
||||||
case GGML_TYPE_I32:
|
case GGML_TYPE_I32:
|
||||||
|
@ -11067,6 +11087,8 @@ static void ggml_compute_forward_clamp(
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
|
case GGML_TYPE_Q3_K:
|
||||||
|
case GGML_TYPE_Q8_K:
|
||||||
case GGML_TYPE_I8:
|
case GGML_TYPE_I8:
|
||||||
case GGML_TYPE_I16:
|
case GGML_TYPE_I16:
|
||||||
case GGML_TYPE_I32:
|
case GGML_TYPE_I32:
|
||||||
|
|
7
ggml.h
7
ggml.h
|
@ -241,6 +241,12 @@ extern "C" {
|
||||||
GGML_TYPE_Q5_1 = 7,
|
GGML_TYPE_Q5_1 = 7,
|
||||||
GGML_TYPE_Q8_0 = 8,
|
GGML_TYPE_Q8_0 = 8,
|
||||||
GGML_TYPE_Q8_1 = 9,
|
GGML_TYPE_Q8_1 = 9,
|
||||||
|
// k-quantizations
|
||||||
|
GGML_TYPE_Q3_K = 10,
|
||||||
|
//GGML_TYPE_Q4_K = 11,
|
||||||
|
//GGML_TYPE_Q5_K = 12,
|
||||||
|
//GGML_TYPE_Q6_K = 13,
|
||||||
|
GGML_TYPE_Q8_K = 11,
|
||||||
GGML_TYPE_I8,
|
GGML_TYPE_I8,
|
||||||
GGML_TYPE_I16,
|
GGML_TYPE_I16,
|
||||||
GGML_TYPE_I32,
|
GGML_TYPE_I32,
|
||||||
|
@ -264,6 +270,7 @@ extern "C" {
|
||||||
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
||||||
|
GGML_FTYPE_MOSTLY_Q3_K = 10, // except 1d tensors
|
||||||
};
|
};
|
||||||
|
|
||||||
// available tensor operations:
|
// available tensor operations:
|
||||||
|
|
269
k_quants.c
269
k_quants.c
|
@ -1,13 +1,20 @@
|
||||||
#include "k_quants.h"
|
#include "k_quants.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <assert.h>
|
||||||
|
|
||||||
#undef MIN
|
#undef MIN
|
||||||
#undef MAX
|
#undef MAX
|
||||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
|
|
||||||
|
//
|
||||||
|
// 3-6 bit quantization in super-blocks
|
||||||
|
//
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// ===================== Helper functions
|
// ===================== Helper functions
|
||||||
//
|
//
|
||||||
|
@ -124,8 +131,214 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
||||||
return scale;
|
return scale;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Quantize n floats from x into integer levels written to L, sharing one scale.
//
// Initial levels are nearest_int(iscale * x[i]) with iscale = -nmax / (value of largest
// magnitude), clamped to [-nmax, nmax-1]. On return the levels are stored shifted by +nmax,
// except for an all-zero input, where every L[i] is 0 and the function returns 0.
//
// do_rmse == false: returns 1/iscale.
// do_rmse == true : runs up to 5 passes that re-pick individual levels (weights x[i]^2),
//                   keeping a change only when it increases sumlx^2/suml2, and returns
//                   sumlx / suml2.
static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
    // find the value with the largest magnitude, keeping its sign
    float peak = 0;
    float peak_abs = 0;
    for (int i = 0; i < n; ++i) {
        const float mag = fabsf(x[i]);
        if (mag > peak_abs) {
            peak_abs = mag;
            peak = x[i];
        }
    }
    if (!peak_abs) { // all zero
        for (int i = 0; i < n; ++i) {
            L[i] = 0;
        }
        return 0.f;
    }

    const float iscale = -nmax / peak;

    if (!do_rmse) {
        // plain rounding against the max-derived scale
        for (int i = 0; i < n; ++i) {
            int q = nearest_int(iscale * x[i]);
            q = MAX(-nmax, MIN(nmax-1, q));
            L[i] = q + nmax;
        }
        return 1/iscale;
    }

    // weighted sums over the initial assignment: sumlx = sum(w*x*l), suml2 = sum(w*l*l)
    float sumlx = 0;
    float suml2 = 0;
    for (int i = 0; i < n; ++i) {
        int q = nearest_int(iscale * x[i]);
        q = MAX(-nmax, MIN(nmax-1, q));
        L[i] = q;
        const float w = x[i]*x[i];
        sumlx += w*x[i]*q;
        suml2 += w*q*q;
    }

    for (int pass = 0; pass < 5; ++pass) {
        int n_changed = 0;
        for (int i = 0; i < n; ++i) {
            const float w = x[i]*x[i];
            // sums with element i taken out
            float slx = sumlx - w*x[i]*L[i];
            if (!(slx > 0)) {
                continue;
            }
            float sl2 = suml2 - w*L[i]*L[i];
            int candidate = nearest_int(x[i] * sl2 / slx);
            candidate = MAX(-nmax, MIN(nmax-1, candidate));
            if (candidate == L[i]) {
                continue;
            }
            // sums with element i put back at the candidate level
            slx += w*x[i]*candidate;
            sl2 += w*candidate*candidate;
            if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) {
                L[i] = candidate;
                sumlx = slx;
                suml2 = sl2;
                ++n_changed;
            }
        }
        if (!n_changed) {
            break;
        }
    }

    // shift levels so they are stored as non-negative values
    for (int i = 0; i < n; ++i) {
        L[i] += nmax;
    }
    return sumlx / suml2;
}
|
||||||
|
|
||||||
static void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
|
//========================= 3-bit (de)-quantization
|
||||||
|
|
||||||
|
// Reference Q3_K quantizer: converts k floats from x into k/QK_K block_q3_K entries in y.
// k must be a multiple of QK_K.
//
// For every super-block of QK_K values:
//   1. each group of 16 values is quantized by make_q3_quants (nmax = 4, RMSE refinement on),
//      which yields one float scale per group;
//   2. the group scales are quantized to 6 bits relative to the largest-magnitude scale and
//      packed into the first 12 bytes of y[i].scales (low 4 bits in bytes 0..7, high 2 bits
//      in bytes 8..11); the common factor is stored in y[i].d as fp16;
//   3. the levels are recomputed against the scales as they will be reconstructed;
//   4. the high bit of each level is stored in y[i].hmask and the low 2 bits are packed
//      four per byte into y[i].qs.
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

    int8_t L[QK_K];
    float scales[QK_K / 16];

    for (int i = 0; i < nb; i++) {
        const float * xb = x + i*QK_K; // input values of this super-block

        // 1. per-group quantization; remember the scale with the largest magnitude
        float max_scale = 0;
        float amax = 0;
        for (int j = 0; j < QK_K/16; ++j) {
            scales[j] = make_q3_quants(16, 4, xb + 16*j, L + 16*j, true);
            const float mag = fabsf(scales[j]);
            if (mag > amax) {
                amax = mag;
                max_scale = scales[j];
            }
        }

        // 2. quantize and pack the group scales
        memset(y[i].scales, 0, 12);
        if (max_scale) {
            float iscale = -32.f/max_scale;
            for (int j = 0; j < QK_K/16; ++j) {
                int8_t q6 = nearest_int(iscale*scales[j]);
                q6 = MAX(-32, MIN(31, q6)) + 32;
                const int lo4 = q6 & 0xF;
                const int hi2 = q6 >> 4;
                if (j < 8) {
                    y[i].scales[j] = lo4;
                } else {
                    y[i].scales[j-8] |= (lo4 << 4);
                }
                y[i].scales[j%4 + 8] |= (hi2 << (2*(j/4)));
            }
            y[i].d = ggml_fp32_to_fp16(1/iscale);
        } else {
            y[i].d = ggml_fp32_to_fp16(0.f);
        }

        // 3. re-quantize the values using the scales unpacked from the block
        for (int j = 0; j < QK_K/16; ++j) {
            int8_t sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
            sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
            float d = ggml_fp16_to_fp32(y[i].d) * sc;
            if (d) {
                for (int ii = 0; ii < 16; ++ii) {
                    int l = nearest_int(xb[16*j + ii]/d);
                    l = MAX(-4, MIN(3, l));
                    L[16*j + ii] = l + 4;
                }
            }
            // groups whose effective scale is zero keep the levels from step 1
        }

        // 4a. high bits: the 1st 32 quants use bit 0 of hmask[0..31], the next 32 bit 1, etc.
        memset(y[i].hmask, 0, QK_K/8);
        for (int j = 0; j < QK_K; ++j) {
            if (L[j] > 3) {
                y[i].hmask[j % (QK_K/8)] |= (uint8_t)(1u << (j / (QK_K/8)));
                L[j] -= 4;
            }
        }

        // 4b. low 2 bits: every 128 levels become 32 bytes, 4 levels per byte
        for (int j = 0; j < QK_K; j += 128) {
            const int8_t * lv = L + j;
            uint8_t * out = y[i].qs + j/4;
            for (int l = 0; l < 32; ++l) {
                out[l] = lv[l] | (lv[l + 32] << 2) | (lv[l + 64] << 4) | (lv[l + 96] << 6);
            }
        }
    }
}
|
||||||
|
|
||||||
|
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
|
||||||
|
assert(k % QK_K == 0);
|
||||||
|
assert(QK_K == 256);
|
||||||
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
|
const uint32_t kmask1 = 0x03030303;
|
||||||
|
const uint32_t kmask2 = 0x0f0f0f0f;
|
||||||
|
|
||||||
|
uint32_t aux[4];
|
||||||
|
const int8_t * scales = (const int8_t*)aux;
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i++) {
|
||||||
|
|
||||||
|
const float d_all = ggml_fp16_to_fp32(x[i].d);
|
||||||
|
|
||||||
|
const uint8_t * restrict q = x[i].qs;
|
||||||
|
const uint8_t * restrict hm = x[i].hmask;
|
||||||
|
uint8_t m = 1;
|
||||||
|
|
||||||
|
memcpy(aux, x[i].scales, 12);
|
||||||
|
uint32_t tmp = aux[2];
|
||||||
|
aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
||||||
|
aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
||||||
|
aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
|
||||||
|
aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
||||||
|
|
||||||
|
int is = 0;
|
||||||
|
float dl;
|
||||||
|
for (int n = 0; n < QK_K; n += 128) {
|
||||||
|
int shift = 0;
|
||||||
|
for (int j = 0; j < 4; ++j) {
|
||||||
|
|
||||||
|
dl = d_all * (scales[is++] - 32);
|
||||||
|
for (int l = 0; l < 16; ++l) {
|
||||||
|
*y++ = dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4));
|
||||||
|
}
|
||||||
|
|
||||||
|
dl = d_all * (scales[is++] - 32);
|
||||||
|
for (int l = 0; l < 16; ++l) {
|
||||||
|
*y++ = dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4));
|
||||||
|
}
|
||||||
|
|
||||||
|
shift += 2;
|
||||||
|
m <<= 1;
|
||||||
|
}
|
||||||
|
q += 32;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
|
||||||
|
quantize_row_q3_K_reference(x, vy, k);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
||||||
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
|
// TODO - collect histograms - although, at a second thought, I don't really care about them
|
||||||
|
(void)hist;
|
||||||
|
|
||||||
|
for (int j = 0; j < nb; j += k) {
|
||||||
|
block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
|
||||||
|
quantize_row_q3_K_reference(src + j, y, k);
|
||||||
|
}
|
||||||
|
return (n/QK_K*sizeof(block_q3_K));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ====================== 6-bit (de)-quantization
|
||||||
|
|
||||||
|
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
|
@ -189,3 +402,57 @@ static void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * r
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//===================================== Q8_K ==============================================
|
||||||
|
|
||||||
|
// Reference Q8_K quantizer: converts k floats from x into k/QK_K block_q8_K entries in y.
// k must be a multiple of QK_K.
//
// Per block, the value with the largest magnitude is mapped to -128 (iscale = -128/max),
// every value is rounded with nearest_int and capped at 127, and the block scale d is
// 1/iscale. bsums[j] holds the sum of the 16 quantized values of group j.
// An all-zero block is written as d = 0 with qs and bsums fully zeroed.
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

    for (int i = 0; i < nb; i++) {

        // find the value with the largest magnitude, keeping its sign
        float max = 0;
        float amax = 0;
        for (int j = 0; j < QK_K; ++j) {
            float ax = fabsf(x[j]);
            if (ax > amax) {
                amax = ax; max = x[j];
            }
        }
        if (!amax) {
            y[i].d = 0;
            memset(y[i].qs, 0, QK_K);
            // also clear the group sums so no field of the block is left unwritten
            for (int j = 0; j < QK_K/16; ++j) {
                y[i].bsums[j] = 0;
            }
            x += QK_K;
            continue;
        }
        const float iscale = -128.f/max;
        for (int j = 0; j < QK_K; ++j) {
            int v = nearest_int(iscale*x[j]);
            // iscale*x[j] can reach +128 (for a value equal to -max), so cap at 127
            y[i].qs[j] = MIN(127, v);
        }
        // per-group sums of the quantized values
        for (int j = 0; j < QK_K/16; ++j) {
            int sum = 0;
            for (int ii = 0; ii < 16; ++ii) {
                sum += y[i].qs[j*16 + ii];
            }
            y[i].bsums[j] = sum;
        }
        y[i].d = 1/iscale;
        x += QK_K;
    }
}
|
||||||
|
|
||||||
|
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) {
|
||||||
|
assert(k % QK_K == 0);
|
||||||
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i++) {
|
||||||
|
for (int j = 0; j < QK_K; ++j) {
|
||||||
|
*y++ = x[i].d * x[i].qs[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) {
|
||||||
|
quantize_row_q8_K_reference(x, y, k);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
39
k_quants.h
39
k_quants.h
|
@ -2,15 +2,17 @@
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
#include <stddef.h>
|
||||||
//
|
|
||||||
// 3-6 bit quantization in super-blocks
|
|
||||||
//
|
|
||||||
|
|
||||||
// Super-block size
|
// Super-block size
|
||||||
#define QK_K 256
|
#define QK_K 256
|
||||||
|
|
||||||
|
//
|
||||||
|
// Super-block quantization structures
|
||||||
|
//
|
||||||
|
|
||||||
// 3-bit quantization
|
// 3-bit quantization
|
||||||
// weight is represented as x = a * q
|
// weight is represented as x = a * q
|
||||||
// 16 blocks of 16 elements each
|
// 16 blocks of 16 elements each
|
||||||
|
@ -70,7 +72,34 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
|
||||||
|
|
||||||
|
|
||||||
// Quantization
|
// Quantization
|
||||||
static void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
|
||||||
|
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
|
||||||
|
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
|
||||||
|
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
||||||
|
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
|
||||||
|
|
||||||
|
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
|
||||||
|
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
|
||||||
|
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
|
||||||
|
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
|
||||||
|
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
|
||||||
|
|
||||||
|
// Dequantization
|
||||||
|
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
|
||||||
|
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
|
||||||
|
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
|
||||||
|
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
|
||||||
|
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
|
||||||
|
|
||||||
|
// Dot product
|
||||||
|
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
|
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
|
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
|
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
|
|
||||||
|
// Quantization with histogram collection
|
||||||
|
size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
|
size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
|
size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
|
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue