2-bit quantizations (#4897)
* imatrix: load * imatrix: WIP * imatrix: Add Q2_K quantization * imatrix: also guard against Q2_K_S quantization without importance matrix * imatrix: guard even more against low-bit quantization misuse --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
parent
807179ec58
commit
147b17ac94
9 changed files with 1149 additions and 82 deletions
36
ggml.c
36
ggml.c
|
@ -585,8 +585,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.type_size = sizeof(block_iq2_xxs),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
|
||||
.from_float = quantize_row_iq2_xxs,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_iq2_xxs_reference,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
},
|
||||
|
@ -596,8 +596,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.type_size = sizeof(block_iq2_xs),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
|
||||
.from_float = quantize_row_iq2_xs,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_iq2_xs_reference,
|
||||
.from_float = NULL,
|
||||
.from_float_reference = NULL,
|
||||
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
},
|
||||
|
@ -18665,8 +18665,11 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
|
|||
return (n/QK8_0*sizeof(block_q8_0));
|
||||
}
|
||||
|
||||
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
|
||||
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
|
||||
int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
|
||||
(void)imatrix;
|
||||
size_t result = 0;
|
||||
int n = nrows * n_per_row;
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
{
|
||||
|
@ -18701,8 +18704,11 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|||
case GGML_TYPE_Q2_K:
|
||||
{
|
||||
GGML_ASSERT(start % QK_K == 0);
|
||||
block_q2_K * block = (block_q2_K*)dst + start / QK_K;
|
||||
result = ggml_quantize_q2_K(src + start, block, n, n, hist);
|
||||
GGML_ASSERT(start % n_per_row == 0);
|
||||
size_t start_row = start / n_per_row;
|
||||
size_t row_size = ggml_row_size(type, n_per_row);
|
||||
result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||
GGML_ASSERT(result == row_size * nrows);
|
||||
} break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
{
|
||||
|
@ -18731,14 +18737,22 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|||
case GGML_TYPE_IQ2_XXS:
|
||||
{
|
||||
GGML_ASSERT(start % QK_K == 0);
|
||||
block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K;
|
||||
result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist);
|
||||
GGML_ASSERT(start % n_per_row == 0);
|
||||
GGML_ASSERT(imatrix);
|
||||
size_t start_row = start / n_per_row;
|
||||
size_t row_size = ggml_row_size(type, n_per_row);
|
||||
result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||
GGML_ASSERT(result == row_size * nrows);
|
||||
} break;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
{
|
||||
GGML_ASSERT(start % QK_K == 0);
|
||||
block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K;
|
||||
result = ggml_quantize_iq2_xs(src + start, block, n, n, hist);
|
||||
GGML_ASSERT(start % n_per_row == 0);
|
||||
GGML_ASSERT(imatrix);
|
||||
size_t start_row = start / n_per_row;
|
||||
size_t row_size = ggml_row_size(type, n_per_row);
|
||||
result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||
GGML_ASSERT(result == row_size * nrows);
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue