2-bit quantizations (#4897)

* imatrix: load

* imatrix: WIP

* imatrix: Add Q2_K quantization

* imatrix: also guard against Q2_K_S quantization without importance matrix

* imatrix: guard even more against low-bit quantization misuse

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow 2024-01-14 09:45:56 +02:00 committed by GitHub
parent 807179ec58
commit 147b17ac94
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 1149 additions and 82 deletions

36
ggml.c
View file

@ -585,8 +585,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.type_size = sizeof(block_iq2_xxs),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
.from_float = quantize_row_iq2_xxs,
.from_float_reference = (ggml_from_float_t) quantize_row_iq2_xxs_reference,
.from_float = NULL,
.from_float_reference = NULL,
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
@ -596,8 +596,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.type_size = sizeof(block_iq2_xs),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
.from_float = quantize_row_iq2_xs,
.from_float_reference = (ggml_from_float_t) quantize_row_iq2_xs_reference,
.from_float = NULL,
.from_float_reference = NULL,
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
},
@ -18665,8 +18665,11 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
return (n/QK8_0*sizeof(block_q8_0));
}
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
(void)imatrix;
size_t result = 0;
int n = nrows * n_per_row;
switch (type) {
case GGML_TYPE_Q4_0:
{
@ -18701,8 +18704,11 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
case GGML_TYPE_Q2_K:
{
GGML_ASSERT(start % QK_K == 0);
block_q2_K * block = (block_q2_K*)dst + start / QK_K;
result = ggml_quantize_q2_K(src + start, block, n, n, hist);
GGML_ASSERT(start % n_per_row == 0);
size_t start_row = start / n_per_row;
size_t row_size = ggml_row_size(type, n_per_row);
result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
GGML_ASSERT(result == row_size * nrows);
} break;
case GGML_TYPE_Q3_K:
{
@ -18731,14 +18737,22 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
case GGML_TYPE_IQ2_XXS:
{
GGML_ASSERT(start % QK_K == 0);
block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K;
result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist);
GGML_ASSERT(start % n_per_row == 0);
GGML_ASSERT(imatrix);
size_t start_row = start / n_per_row;
size_t row_size = ggml_row_size(type, n_per_row);
result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
GGML_ASSERT(result == row_size * nrows);
} break;
case GGML_TYPE_IQ2_XS:
{
GGML_ASSERT(start % QK_K == 0);
block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K;
result = ggml_quantize_iq2_xs(src + start, block, n, n, hist);
GGML_ASSERT(start % n_per_row == 0);
GGML_ASSERT(imatrix);
size_t start_row = start / n_per_row;
size_t row_size = ggml_row_size(type, n_per_row);
result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
GGML_ASSERT(result == row_size * nrows);
} break;
case GGML_TYPE_F16:
{