WIP: make i-quants work for QK_K = 64
This commit is contained in:
parent
0becb22ac0
commit
13ba37f1aa
3 changed files with 35 additions and 5 deletions
|
@ -4227,6 +4227,9 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
|
||||||
|
|
||||||
void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
|
void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
|
#if QK_K == 64
|
||||||
|
dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k);
|
||||||
|
#else
|
||||||
const int nb = k / QK_K;
|
const int nb = k / QK_K;
|
||||||
|
|
||||||
for (int i = 0; i < nb; i++) {
|
for (int i = 0; i < nb; i++) {
|
||||||
|
@ -4246,6 +4249,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
|
||||||
qs += 16;
|
qs += 16;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
//===================================== Q8_K ==============================================
|
//===================================== Q8_K ==============================================
|
||||||
|
@ -10455,6 +10459,9 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
||||||
UNUSED(by);
|
UNUSED(by);
|
||||||
UNUSED(bs);
|
UNUSED(bs);
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
#if QK_K == 64
|
||||||
|
ggml_vec_dot_iq4_nl_q8_0(n, s, bs, vx, bx, vy, by, nrc);
|
||||||
|
#else
|
||||||
|
|
||||||
const block_iq4_xs * restrict x = vx;
|
const block_iq4_xs * restrict x = vx;
|
||||||
const block_q8_K * restrict y = vy;
|
const block_q8_K * restrict y = vy;
|
||||||
|
@ -10574,6 +10581,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
||||||
}
|
}
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// ================================ IQ2 quantization =============================================
|
// ================================ IQ2 quantization =============================================
|
||||||
|
@ -10921,7 +10929,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
||||||
|
|
||||||
const int kMaxQ = 3;
|
const int kMaxQ = 3;
|
||||||
|
|
||||||
const int nbl = n/256;
|
const int nbl = n/QK_K;
|
||||||
|
|
||||||
block_iq2_xxs * y = vy;
|
block_iq2_xxs * y = vy;
|
||||||
|
|
||||||
|
@ -11094,7 +11102,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
||||||
|
|
||||||
const int kMaxQ = 3;
|
const int kMaxQ = 3;
|
||||||
|
|
||||||
const int nbl = n/256;
|
const int nbl = n/QK_K;
|
||||||
|
|
||||||
block_iq2_xs * y = vy;
|
block_iq2_xs * y = vy;
|
||||||
|
|
||||||
|
@ -12037,7 +12045,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
||||||
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
||||||
GGML_ASSERT(n%QK_K == 0);
|
GGML_ASSERT(n%QK_K == 0);
|
||||||
|
|
||||||
const int nbl = n/256;
|
const int nbl = n/QK_K;
|
||||||
|
|
||||||
block_iq1_s * y = vy;
|
block_iq1_s * y = vy;
|
||||||
|
|
||||||
|
@ -12315,6 +12323,9 @@ void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * rest
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||||
|
#if QK_K == 64
|
||||||
|
return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
|
||||||
|
#else
|
||||||
(void)hist;
|
(void)hist;
|
||||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||||
int nblock = n_per_row/QK_K;
|
int nblock = n_per_row/QK_K;
|
||||||
|
@ -12333,6 +12344,7 @@ size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, i
|
||||||
qrow += nblock*sizeof(block_iq4_xs);
|
qrow += nblock*sizeof(block_iq4_xs);
|
||||||
}
|
}
|
||||||
return nrow * nblock * sizeof(block_iq4_xs);
|
return nrow * nblock * sizeof(block_iq4_xs);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
|
void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
|
||||||
|
@ -12363,7 +12375,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
||||||
|
|
||||||
const int kMaxQ = 3;
|
const int kMaxQ = 3;
|
||||||
|
|
||||||
const int nbl = n/256;
|
const int nbl = n/QK_K;
|
||||||
|
|
||||||
block_iq2_s * y = vy;
|
block_iq2_s * y = vy;
|
||||||
|
|
||||||
|
|
|
@ -230,6 +230,10 @@ typedef struct {
|
||||||
} block_iq4_nl;
|
} block_iq4_nl;
|
||||||
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
||||||
|
|
||||||
|
#if QK_K == 64
|
||||||
|
#define block_iq4_xs block_iq4_nl
|
||||||
|
//typedef struct block_iq4_nl block_iq4_xs;
|
||||||
|
#else
|
||||||
typedef struct {
|
typedef struct {
|
||||||
ggml_fp16_t d;
|
ggml_fp16_t d;
|
||||||
uint16_t scales_h;
|
uint16_t scales_h;
|
||||||
|
@ -237,6 +241,7 @@ typedef struct {
|
||||||
uint8_t qs[QK_K/2];
|
uint8_t qs[QK_K/2];
|
||||||
} block_iq4_xs;
|
} block_iq4_xs;
|
||||||
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
15
ggml.c
15
ggml.c
|
@ -728,14 +728,22 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||||
},
|
},
|
||||||
[GGML_TYPE_IQ4_XS] = {
|
[GGML_TYPE_IQ4_XS] = {
|
||||||
.type_name = "iq4_xs",
|
.type_name = "iq4_xs",
|
||||||
|
#if QK_K == 64
|
||||||
|
.blck_size = QK4_NL,
|
||||||
|
#else
|
||||||
.blck_size = QK_K,
|
.blck_size = QK_K,
|
||||||
|
#endif
|
||||||
.type_size = sizeof(block_iq4_xs),
|
.type_size = sizeof(block_iq4_xs),
|
||||||
.is_quantized = true,
|
.is_quantized = true,
|
||||||
.to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
|
.to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
|
||||||
.from_float = quantize_row_iq4_xs,
|
.from_float = quantize_row_iq4_xs,
|
||||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference,
|
.from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference,
|
||||||
.vec_dot = ggml_vec_dot_iq4_xs_q8_K,
|
.vec_dot = ggml_vec_dot_iq4_xs_q8_K,
|
||||||
|
#if QK_K == 64
|
||||||
|
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||||
|
#else
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
#endif
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
[GGML_TYPE_Q8_K] = {
|
[GGML_TYPE_Q8_K] = {
|
||||||
|
@ -19830,6 +19838,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_IQ4_NL:
|
case GGML_TYPE_IQ4_NL:
|
||||||
|
#if QK_K == 64
|
||||||
|
case GGML_TYPE_IQ4_XS:
|
||||||
|
#endif
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK4_NL == 0);
|
GGML_ASSERT(start % QK4_NL == 0);
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
|
@ -19838,15 +19849,17 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
|
#if QK_K != 64
|
||||||
case GGML_TYPE_IQ4_XS:
|
case GGML_TYPE_IQ4_XS:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK4_NL == 0);
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
|
#endif
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
{
|
{
|
||||||
size_t elemsize = sizeof(ggml_fp16_t);
|
size_t elemsize = sizeof(ggml_fp16_t);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue