ggml : remove old quantization functions
ggml-ci
This commit is contained in:
parent
fb215c3832
commit
13c1cc6a9f
6 changed files with 100 additions and 345 deletions
|
@ -1919,42 +1919,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||||
|
|
||||||
std::vector<int64_t> hist_cur(1 << 4, 0);
|
std::vector<int64_t> hist_cur(1 << 4, 0);
|
||||||
|
|
||||||
switch (new_type) {
|
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], hist_cur.data(), nullptr);
|
||||||
case GGML_TYPE_Q4_0: {
|
|
||||||
new_size = ggml_quantize_q4_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
|
|
||||||
} break;
|
|
||||||
case GGML_TYPE_Q4_1: {
|
|
||||||
new_size = ggml_quantize_q4_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
|
|
||||||
} break;
|
|
||||||
case GGML_TYPE_Q5_0: {
|
|
||||||
new_size = ggml_quantize_q5_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
|
|
||||||
} break;
|
|
||||||
case GGML_TYPE_Q5_1: {
|
|
||||||
new_size = ggml_quantize_q5_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
|
|
||||||
} break;
|
|
||||||
case GGML_TYPE_Q8_0: {
|
|
||||||
new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
|
|
||||||
} break;
|
|
||||||
case GGML_TYPE_Q2_K: {
|
|
||||||
new_size = ggml_quantize_q2_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
|
|
||||||
} break;
|
|
||||||
case GGML_TYPE_Q3_K: {
|
|
||||||
new_size = ggml_quantize_q3_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
|
|
||||||
} break;
|
|
||||||
case GGML_TYPE_Q4_K: {
|
|
||||||
new_size = ggml_quantize_q4_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
|
|
||||||
} break;
|
|
||||||
case GGML_TYPE_Q5_K: {
|
|
||||||
new_size = ggml_quantize_q5_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
|
|
||||||
} break;
|
|
||||||
case GGML_TYPE_Q6_K: {
|
|
||||||
new_size = ggml_quantize_q6_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
|
|
||||||
} break;
|
|
||||||
default: {
|
|
||||||
fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (size_t j = 0; j < hist_cur.size(); ++j) {
|
for (size_t j = 0; j < hist_cur.size(); ++j) {
|
||||||
hist_all[j] += hist_cur[j];
|
hist_all[j] += hist_cur[j];
|
||||||
|
|
129
ggml-quants.c
129
ggml-quants.c
|
@ -1704,16 +1704,6 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
|
||||||
quantize_row_q2_K_reference(x, vy, k);
|
quantize_row_q2_K_reference(x, vy, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Chunk quantizer for Q2_K.
// Walks the n input floats in steps of k; each step is converted with
// quantize_row_q2_K_reference into dst, starting at block index j/QK_K.
// hist is accepted but ignored (no histogram is collected).
// Returns n/QK_K * sizeof(block_q2_K), the byte size of the blocks covering n values.
size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
|
||||||
(void)hist; // TODO: collect histograms
|
|
||||||
|
|
||||||
for (int j = 0; j < n; j += k) {
|
|
||||||
block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
|
|
||||||
quantize_row_q2_K_reference(src + j, y, k);
|
|
||||||
}
|
|
||||||
return (n/QK_K*sizeof(block_q2_K));
|
|
||||||
}
|
|
||||||
|
|
||||||
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
|
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
|
||||||
uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
|
uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
|
||||||
float rmin, float rdelta, int nstep, bool use_mad) {
|
float rmin, float rdelta, int nstep, bool use_mad) {
|
||||||
|
@ -1966,8 +1956,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, const float * quant_weights) {
|
||||||
(void)hist;
|
|
||||||
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
||||||
if (!quant_weights) {
|
if (!quant_weights) {
|
||||||
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
|
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
|
||||||
|
@ -2186,16 +2175,6 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
|
||||||
quantize_row_q3_K_reference(x, vy, k);
|
quantize_row_q3_K_reference(x, vy, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Chunk quantizer for Q3_K.
// Walks the n input floats in steps of k; each step is converted with
// quantize_row_q3_K_reference into dst, starting at block index j/QK_K.
// hist is accepted but ignored (no histogram is collected).
// Returns n/QK_K * sizeof(block_q3_K), the byte size of the blocks covering n values.
size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
|
||||||
(void)hist; // TODO: collect histograms
|
|
||||||
|
|
||||||
for (int j = 0; j < n; j += k) {
|
|
||||||
block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
|
|
||||||
quantize_row_q3_K_reference(src + j, y, k);
|
|
||||||
}
|
|
||||||
return (n/QK_K*sizeof(block_q3_K));
|
|
||||||
}
|
|
||||||
|
|
||||||
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
|
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
|
||||||
#if QK_K != 256
|
#if QK_K != 256
|
||||||
(void)quant_weights;
|
(void)quant_weights;
|
||||||
|
@ -2285,8 +2264,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, const float * quant_weights) {
|
||||||
(void)hist;
|
|
||||||
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
||||||
if (!quant_weights) {
|
if (!quant_weights) {
|
||||||
quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
|
quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
|
||||||
|
@ -2456,17 +2434,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
|
||||||
quantize_row_q4_K_reference(x, y, k);
|
quantize_row_q4_K_reference(x, y, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Chunk quantizer for Q4_K.
// Asserts that k is a multiple of QK_K, then walks the n input floats in steps
// of k; each step is converted with quantize_row_q4_K_reference into dst,
// starting at block index j/QK_K.
// hist is accepted but ignored (no histogram is collected).
// Returns n/QK_K * sizeof(block_q4_K), the byte size of the blocks covering n values.
size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
|
||||||
assert(k % QK_K == 0);
|
|
||||||
(void)hist; // TODO: collect histograms
|
|
||||||
|
|
||||||
for (int j = 0; j < n; j += k) {
|
|
||||||
block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
|
|
||||||
quantize_row_q4_K_reference(src + j, y, k);
|
|
||||||
}
|
|
||||||
return (n/QK_K*sizeof(block_q4_K));
|
|
||||||
}
|
|
||||||
|
|
||||||
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
|
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
|
||||||
#if QK_K != 256
|
#if QK_K != 256
|
||||||
(void)quant_weights;
|
(void)quant_weights;
|
||||||
|
@ -2545,8 +2512,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, const float * quant_weights) {
|
||||||
(void)hist;
|
|
||||||
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
||||||
if (!quant_weights) {
|
if (!quant_weights) {
|
||||||
quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
|
quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
|
||||||
|
@ -2757,17 +2723,6 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
|
||||||
quantize_row_q5_K_reference(x, y, k);
|
quantize_row_q5_K_reference(x, y, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Chunk quantizer for Q5_K.
// Asserts that k is a multiple of QK_K, then walks the n input floats in steps
// of k; each step is converted with quantize_row_q5_K_reference into dst,
// starting at block index j/QK_K.
// hist is accepted but ignored (no histogram is collected).
// Returns n/QK_K * sizeof(block_q5_K), the byte size of the blocks covering n values.
size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
|
||||||
assert(k % QK_K == 0);
|
|
||||||
(void)hist; // TODO: collect histograms
|
|
||||||
|
|
||||||
for (int j = 0; j < n; j += k) {
|
|
||||||
block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
|
|
||||||
quantize_row_q5_K_reference(src + j, y, k);
|
|
||||||
}
|
|
||||||
return (n/QK_K*sizeof(block_q5_K));
|
|
||||||
}
|
|
||||||
|
|
||||||
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
|
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
|
||||||
#if QK_K != 256
|
#if QK_K != 256
|
||||||
(void)quant_weights;
|
(void)quant_weights;
|
||||||
|
@ -2866,8 +2821,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, const float * quant_weights) {
|
||||||
(void)hist;
|
|
||||||
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
||||||
if (!quant_weights) {
|
if (!quant_weights) {
|
||||||
quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
|
quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
|
||||||
|
@ -3020,17 +2974,6 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
|
||||||
quantize_row_q6_K_reference(x, y, k);
|
quantize_row_q6_K_reference(x, y, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Chunk quantizer for Q6_K.
// Asserts that k is a multiple of QK_K, then walks the n input floats in steps
// of k; each step is converted with quantize_row_q6_K_reference into dst,
// starting at block index j/QK_K.
// hist is accepted but ignored (no histogram is collected).
// Returns n/QK_K * sizeof(block_q6_K), the byte size of the blocks covering n values.
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
|
|
||||||
assert(k % QK_K == 0);
|
|
||||||
(void)hist; // TODO: collect histograms
|
|
||||||
|
|
||||||
for (int j = 0; j < n; j += k) {
|
|
||||||
block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
|
|
||||||
quantize_row_q6_K_reference(src + j, y, k);
|
|
||||||
}
|
|
||||||
return (n/QK_K*sizeof(block_q6_K));
|
|
||||||
}
|
|
||||||
|
|
||||||
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
|
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
|
||||||
#if QK_K != 256
|
#if QK_K != 256
|
||||||
(void)quant_weights;
|
(void)quant_weights;
|
||||||
|
@ -3120,8 +3063,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, const float * quant_weights) {
|
||||||
(void)hist;
|
|
||||||
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
||||||
if (!quant_weights) {
|
if (!quant_weights) {
|
||||||
quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
|
quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
|
||||||
|
@ -3165,9 +3107,10 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, const float * quant_weights) {
|
||||||
if (!quant_weights) {
|
if (!quant_weights) {
|
||||||
return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
|
quantize_row_q4_0_reference(src, dst, nrow*n_per_row);
|
||||||
|
return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
||||||
}
|
}
|
||||||
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
||||||
char * qrow = (char *)dst;
|
char * qrow = (char *)dst;
|
||||||
|
@ -3209,9 +3152,10 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, const float * quant_weights) {
|
||||||
if (!quant_weights) {
|
if (!quant_weights) {
|
||||||
return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
|
quantize_row_q4_1_reference(src, dst, nrow*n_per_row);
|
||||||
|
return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
||||||
}
|
}
|
||||||
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
||||||
char * qrow = (char *)dst;
|
char * qrow = (char *)dst;
|
||||||
|
@ -3262,9 +3206,10 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, const float * quant_weights) {
|
||||||
if (!quant_weights) {
|
if (!quant_weights) {
|
||||||
return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
|
quantize_row_q5_0_reference(src, dst, nrow*n_per_row);
|
||||||
|
return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
||||||
}
|
}
|
||||||
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
||||||
char * qrow = (char *)dst;
|
char * qrow = (char *)dst;
|
||||||
|
@ -3314,9 +3259,10 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, const float * quant_weights) {
|
||||||
if (!quant_weights) {
|
if (!quant_weights) {
|
||||||
return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
|
quantize_row_q5_1_reference(src, dst, nrow*n_per_row);
|
||||||
|
return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
||||||
}
|
}
|
||||||
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
||||||
char * qrow = (char *)dst;
|
char * qrow = (char *)dst;
|
||||||
|
@ -3328,6 +3274,13 @@ size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int
|
||||||
return nrow * row_size;
|
return nrow * row_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Quantizes nrow rows of n_per_row floats from src into Q8_0 data in dst.
// All nrow*n_per_row values are converted in one call to
// quantize_row_q8_0_reference. quant_weights is accepted so the signature
// matches the other quantize_* functions, but it is not used.
// Returns nrow * ggml_row_size(GGML_TYPE_Q8_0, n_per_row).
size_t quantize_q8_0(const float * src, void * dst, int nrow, int n_per_row, const float * quant_weights) {
|
||||||
|
(void)quant_weights; // not used
|
||||||
|
const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
|
||||||
|
quantize_row_q8_0_reference(src, dst, nrow*n_per_row);
|
||||||
|
return nrow * row_size;
|
||||||
|
}
|
||||||
|
|
||||||
// ====================== "True" 2-bit (de)-quantization
|
// ====================== "True" 2-bit (de)-quantization
|
||||||
|
|
||||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
|
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
|
||||||
|
@ -10698,8 +10651,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, const float * quant_weights) {
|
||||||
(void)hist;
|
|
||||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||||
int nblock = n_per_row/QK_K;
|
int nblock = n_per_row/QK_K;
|
||||||
char * qrow = (char *)dst;
|
char * qrow = (char *)dst;
|
||||||
|
@ -10711,8 +10663,7 @@ size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row,
|
||||||
return nrow * nblock * sizeof(block_iq2_xxs);
|
return nrow * nblock * sizeof(block_iq2_xxs);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, const float * quant_weights) {
|
||||||
(void)hist;
|
|
||||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||||
int nblock = n_per_row/QK_K;
|
int nblock = n_per_row/QK_K;
|
||||||
char * qrow = (char *)dst;
|
char * qrow = (char *)dst;
|
||||||
|
@ -11154,8 +11105,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, const float * quant_weights) {
|
||||||
(void)hist;
|
|
||||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||||
int nblock = n_per_row/QK_K;
|
int nblock = n_per_row/QK_K;
|
||||||
char * qrow = (char *)dst;
|
char * qrow = (char *)dst;
|
||||||
|
@ -11361,8 +11311,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
||||||
}
|
}
|
||||||
|
|
||||||
#define IQ3S_BLOCK_SIZE 32
|
#define IQ3S_BLOCK_SIZE 32
|
||||||
size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, const float * quant_weights) {
|
||||||
(void)hist;
|
|
||||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||||
int nblock = n_per_row/QK_K;
|
int nblock = n_per_row/QK_K;
|
||||||
float scales[QK_K/IQ3S_BLOCK_SIZE];
|
float scales[QK_K/IQ3S_BLOCK_SIZE];
|
||||||
|
@ -11392,7 +11341,7 @@ void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
|
||||||
|
|
||||||
// Reference row quantizer for IQ3_S: asserts that k is a multiple of QK_K and
// forwards x as a single row of k values to quantize_iq3_s, passing NULL for
// the trailing pointer argument(s). Two call forms appear below; they differ
// only in the number of trailing NULL arguments.
void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
|
void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
quantize_iq3_s(x, y, 1, k, NULL, NULL);
|
quantize_iq3_s(x, y, 1, k, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -11587,8 +11536,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, const float * quant_weights) {
|
||||||
(void)hist;
|
|
||||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||||
int nblock = n_per_row/QK_K;
|
int nblock = n_per_row/QK_K;
|
||||||
char * qrow = (char *)dst;
|
char * qrow = (char *)dst;
|
||||||
|
@ -11721,8 +11669,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, const float * quant_weights) {
|
||||||
(void)hist;
|
|
||||||
GGML_ASSERT(n_per_row%QK4_NL == 0);
|
GGML_ASSERT(n_per_row%QK4_NL == 0);
|
||||||
int nblock = n_per_row/QK4_NL;
|
int nblock = n_per_row/QK4_NL;
|
||||||
char * qrow = (char *)dst;
|
char * qrow = (char *)dst;
|
||||||
|
@ -11752,14 +11699,13 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
|
||||||
|
|
||||||
// Reference row quantizer for IQ4_NL: asserts that k is a multiple of QK4_NL
// and forwards x as a single row of k values to quantize_iq4_nl, passing NULL
// for the trailing pointer argument(s). Two call forms appear below; they
// differ only in the number of trailing NULL arguments.
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
|
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
|
||||||
assert(k % QK4_NL == 0);
|
assert(k % QK4_NL == 0);
|
||||||
quantize_iq4_nl(x, y, 1, k, NULL, NULL);
|
quantize_iq4_nl(x, y, 1, k, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, const float * quant_weights) {
|
||||||
#if QK_K == 64
|
#if QK_K == 64
|
||||||
return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
|
return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights);
|
||||||
#else
|
#else
|
||||||
(void)hist;
|
|
||||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||||
int nblock = n_per_row/QK_K;
|
int nblock = n_per_row/QK_K;
|
||||||
char * qrow = (char *)dst;
|
char * qrow = (char *)dst;
|
||||||
|
@ -11788,7 +11734,7 @@ void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
|
||||||
|
|
||||||
// Reference row quantizer for IQ4_XS: asserts that k is a multiple of QK_K and
// forwards x as a single row of k values to quantize_iq4_xs, passing NULL for
// the trailing pointer argument(s). Two call forms appear below; they differ
// only in the number of trailing NULL arguments.
void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
|
void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
quantize_iq4_xs(x, y, 1, k, NULL, NULL);
|
quantize_iq4_xs(x, y, 1, k, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
// =============================== 2.5625 bpw
|
// =============================== 2.5625 bpw
|
||||||
|
@ -11961,8 +11907,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, const float * quant_weights) {
|
||||||
(void)hist;
|
|
||||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||||
int nblock = n_per_row/QK_K;
|
int nblock = n_per_row/QK_K;
|
||||||
char * qrow = (char *)dst;
|
char * qrow = (char *)dst;
|
||||||
|
@ -11976,7 +11921,7 @@ size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, in
|
||||||
|
|
||||||
// Reference row quantizer for IQ2_S: asserts that k is a multiple of QK_K and
// forwards x as a single row of k values to quantize_iq2_s, passing NULL for
// the trailing pointer argument(s). Two call forms appear below; they differ
// only in the number of trailing NULL arguments.
void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
|
void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
quantize_iq2_s(x, y, 1, k, NULL, NULL);
|
quantize_iq2_s(x, y, 1, k, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
|
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
|
||||||
|
|
|
@ -333,23 +333,24 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
||||||
//
|
//
|
||||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||||
//
|
//
|
||||||
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq2_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_iq2_s (const float * src, void * dst, int nrows, int n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq4_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_iq4_xs (const float * src, void * dst, int nrows, int n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq3_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_iq3_s (const float * src, void * dst, int nrows, int n_per_row, const float * imatrix);
|
||||||
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, const float * imatrix);
|
||||||
size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, const float * imatrix);
|
||||||
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, const float * imatrix);
|
||||||
size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, const float * imatrix);
|
||||||
size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, const float * imatrix);
|
||||||
size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row, const float * imatrix);
|
||||||
size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, const float * imatrix);
|
||||||
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, const float * imatrix);
|
||||||
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, const float * imatrix);
|
||||||
|
size_t quantize_q8_0 (const float * src, void * dst, int nrows, int n_per_row, const float * imatrix);
|
||||||
|
|
||||||
void iq2xs_init_impl(enum ggml_type type);
|
void iq2xs_init_impl(enum ggml_type type);
|
||||||
void iq2xs_free_impl(enum ggml_type type);
|
void iq2xs_free_impl(enum ggml_type type);
|
||||||
|
|
|
@ -4104,43 +4104,7 @@ static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool
|
||||||
static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
|
static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
|
||||||
std::vector<int64_t> hist_cur(1 << 4, 0);
|
std::vector<int64_t> hist_cur(1 << 4, 0);
|
||||||
|
|
||||||
switch(quant) {
|
// Quantize the whole buffer through the generic ggml_quantize_chunk entry point.
// NOTE(review): arguments mirror the clip_model_quantize call (start = 0, then 1 and ne
// in the row-count / row-length positions) - confirm against the declaration in ggml.h.
ggml_quantize_chunk(quant, from, to, 0, 1, ne, hist_cur.data(), nullptr);
|
||||||
case GGML_TYPE_F32:
|
|
||||||
memcpy(to, from, sizeof(float) * ne);
|
|
||||||
break;
|
|
||||||
case GGML_TYPE_Q4_0:
|
|
||||||
ggml_quantize_q4_0(from, to, ne, ne, hist_cur.data());
|
|
||||||
break;
|
|
||||||
case GGML_TYPE_Q4_1:
|
|
||||||
ggml_quantize_q4_1(from, to, ne, ne, hist_cur.data());
|
|
||||||
break;
|
|
||||||
case GGML_TYPE_Q5_0:
|
|
||||||
ggml_quantize_q5_0(from, to, ne, ne, hist_cur.data());
|
|
||||||
break;
|
|
||||||
case GGML_TYPE_Q5_1:
|
|
||||||
ggml_quantize_q5_1(from, to, ne, ne, hist_cur.data());
|
|
||||||
break;
|
|
||||||
case GGML_TYPE_Q8_0:
|
|
||||||
ggml_quantize_q8_0(from, to, ne, ne, hist_cur.data());
|
|
||||||
break;
|
|
||||||
case GGML_TYPE_Q2_K:
|
|
||||||
ggml_quantize_q2_K(from, to, ne, ne, hist_cur.data());
|
|
||||||
break;
|
|
||||||
case GGML_TYPE_Q3_K:
|
|
||||||
ggml_quantize_q3_K(from, to, ne, ne, hist_cur.data());
|
|
||||||
break;
|
|
||||||
case GGML_TYPE_Q4_K:
|
|
||||||
ggml_quantize_q4_K(from, to, ne, ne, hist_cur.data());
|
|
||||||
break;
|
|
||||||
case GGML_TYPE_Q5_K:
|
|
||||||
ggml_quantize_q5_K(from, to, ne, ne, hist_cur.data());
|
|
||||||
break;
|
|
||||||
case GGML_TYPE_Q6_K:
|
|
||||||
ggml_quantize_q6_K(from, to, ne, ne, hist_cur.data());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
GGML_ASSERT(false);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
|
static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
|
||||||
|
|
182
ggml.c
182
ggml.c
|
@ -20159,133 +20159,6 @@ void ggml_quantize_free(void) {
|
||||||
ggml_critical_section_end();
|
ggml_critical_section_end();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Chunk quantizer for Q4_0 that also fills a 16-bin histogram.
// Asserts that k is a multiple of QK4_0. The n input floats are processed in
// steps of k: each step is quantized with quantize_row_q4_0_reference into dst
// (starting at block index b/QK4_0), then every 4-bit value stored in the
// k/QK4_0 blocks just written increments hist[value].
// hist is dereferenced unconditionally, so it must not be NULL.
// Returns n/QK4_0 * sizeof(block_q4_0).
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
|
||||||
assert(k % QK4_0 == 0);
|
|
||||||
const int nb = k / QK4_0;
|
|
||||||
|
|
||||||
for (int b = 0; b < n; b += k) {
|
|
||||||
block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0;
|
|
||||||
|
|
||||||
quantize_row_q4_0_reference(src + b, y, k);
|
|
||||||
|
|
||||||
for (int i = 0; i < nb; i++) {
|
|
||||||
for (int j = 0; j < QK4_0; j += 2) {
|
|
||||||
const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
|
|
||||||
const uint8_t vi1 = y[i].qs[j/2] >> 4;
|
|
||||||
|
|
||||||
hist[vi0]++;
|
|
||||||
hist[vi1]++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return (n/QK4_0*sizeof(block_q4_0));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Chunk quantizer for Q4_1 that also fills a 16-bin histogram.
// Asserts that k is a multiple of QK4_1. The n input floats are processed in
// steps of k: each step is quantized with quantize_row_q4_1_reference into dst
// (starting at block index b/QK4_1), then every 4-bit value stored in the
// k/QK4_1 blocks just written increments hist[value].
// hist is dereferenced unconditionally, so it must not be NULL.
// Returns n/QK4_1 * sizeof(block_q4_1).
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
|
|
||||||
assert(k % QK4_1 == 0);
|
|
||||||
const int nb = k / QK4_1;
|
|
||||||
|
|
||||||
for (int b = 0; b < n; b += k) {
|
|
||||||
block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1;
|
|
||||||
|
|
||||||
quantize_row_q4_1_reference(src + b, y, k);
|
|
||||||
|
|
||||||
for (int i = 0; i < nb; i++) {
|
|
||||||
for (int j = 0; j < QK4_1; j += 2) {
|
|
||||||
const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
|
|
||||||
const uint8_t vi1 = y[i].qs[j/2] >> 4;
|
|
||||||
|
|
||||||
hist[vi0]++;
|
|
||||||
hist[vi1]++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return (n/QK4_1*sizeof(block_q4_1));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Chunk quantizer for Q5_0 that also fills a 16-bin histogram.
// Asserts that k is a multiple of QK5_0. The n input floats are processed in
// steps of k: each step is quantized with quantize_row_q5_0_reference into dst
// (starting at block index b/QK5_0). For every block just written, each 5-bit
// value is rebuilt from its low nibble in qs plus one bit taken from qh, then
// halved so it lands in one of 16 bins of hist.
// hist is dereferenced unconditionally, so it must not be NULL.
// Returns n/QK5_0 * sizeof(block_q5_0).
size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
|
||||||
assert(k % QK5_0 == 0);
|
|
||||||
const int nb = k / QK5_0;
|
|
||||||
|
|
||||||
for (int b = 0; b < n; b += k) {
|
|
||||||
block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0;
|
|
||||||
|
|
||||||
quantize_row_q5_0_reference(src + b, y, k);
|
|
||||||
|
|
||||||
for (int i = 0; i < nb; i++) {
|
|
||||||
uint32_t qh;
|
|
||||||
// copy the block's qh field into a local 32-bit word before testing bits
|
memcpy(&qh, &y[i].qh, sizeof(qh));
|
|
||||||
|
|
||||||
for (int j = 0; j < QK5_0; j += 2) {
|
|
||||||
// vh0: bit j/2 of qh moved to bit 4; vh1: bit j/2+16 of qh moved to bit 4
|
const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
|
|
||||||
const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
|
|
||||||
|
|
||||||
// cast to 16 bins
|
|
||||||
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
|
|
||||||
const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
|
|
||||||
|
|
||||||
hist[vi0]++;
|
|
||||||
hist[vi1]++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return (n/QK5_0*sizeof(block_q5_0));
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
|
|
||||||
assert(k % QK5_1 == 0);
|
|
||||||
const int nb = k / QK5_1;
|
|
||||||
|
|
||||||
for (int b = 0; b < n; b += k) {
|
|
||||||
block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1;
|
|
||||||
|
|
||||||
quantize_row_q5_1_reference(src + b, y, k);
|
|
||||||
|
|
||||||
for (int i = 0; i < nb; i++) {
|
|
||||||
uint32_t qh;
|
|
||||||
memcpy(&qh, &y[i].qh, sizeof(qh));
|
|
||||||
|
|
||||||
for (int j = 0; j < QK5_1; j += 2) {
|
|
||||||
const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
|
|
||||||
const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
|
|
||||||
|
|
||||||
// cast to 16 bins
|
|
||||||
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
|
|
||||||
const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
|
|
||||||
|
|
||||||
hist[vi0]++;
|
|
||||||
hist[vi1]++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return (n/QK5_1*sizeof(block_q5_1));
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
|
||||||
assert(k % QK8_0 == 0);
|
|
||||||
const int nb = k / QK8_0;
|
|
||||||
|
|
||||||
for (int b = 0; b < n; b += k) {
|
|
||||||
block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0;
|
|
||||||
|
|
||||||
quantize_row_q8_0_reference(src + b, y, k);
|
|
||||||
|
|
||||||
for (int i = 0; i < nb; i++) {
|
|
||||||
for (int j = 0; j < QK8_0; ++j) {
|
|
||||||
const int8_t vi = y[i].qs[j];
|
|
||||||
|
|
||||||
hist[vi/16 + 8]++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return (n/QK8_0*sizeof(block_q8_0));
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ggml_quantize_requires_imatrix(enum ggml_type type) {
|
bool ggml_quantize_requires_imatrix(enum ggml_type type) {
|
||||||
return
|
return
|
||||||
type == GGML_TYPE_IQ2_XXS ||
|
type == GGML_TYPE_IQ2_XXS ||
|
||||||
|
@ -20293,8 +20166,15 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
|
||||||
type == GGML_TYPE_IQ1_S;
|
type == GGML_TYPE_IQ1_S;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
|
size_t ggml_quantize_chunk(
|
||||||
int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
|
enum ggml_type type,
|
||||||
|
const float * src,
|
||||||
|
void * dst,
|
||||||
|
int start,
|
||||||
|
int nrows,
|
||||||
|
int n_per_row,
|
||||||
|
int64_t * hist,
|
||||||
|
const float * imatrix) {
|
||||||
ggml_quantize_init(type); // this is noop if already initialized
|
ggml_quantize_init(type); // this is noop if already initialized
|
||||||
size_t result = 0;
|
size_t result = 0;
|
||||||
int n = nrows * n_per_row;
|
int n = nrows * n_per_row;
|
||||||
|
@ -20305,7 +20185,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q4_1:
|
case GGML_TYPE_Q4_1:
|
||||||
|
@ -20314,7 +20194,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q5_0:
|
case GGML_TYPE_Q5_0:
|
||||||
|
@ -20323,7 +20203,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
|
@ -20332,14 +20212,17 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK8_0 == 0);
|
GGML_ASSERT(start % QK8_0 == 0);
|
||||||
block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
result = ggml_quantize_q8_0(src + start, block, n, n, hist);
|
size_t start_row = start / n_per_row;
|
||||||
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
|
result = quantize_q8_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, imatrix);
|
||||||
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q2_K:
|
case GGML_TYPE_Q2_K:
|
||||||
{
|
{
|
||||||
|
@ -20347,7 +20230,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
|
@ -20356,7 +20239,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
|
@ -20365,7 +20248,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
|
@ -20374,7 +20257,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
|
@ -20383,7 +20266,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_IQ2_XXS:
|
case GGML_TYPE_IQ2_XXS:
|
||||||
|
@ -20393,7 +20276,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(imatrix);
|
GGML_ASSERT(imatrix);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_IQ2_XS:
|
case GGML_TYPE_IQ2_XS:
|
||||||
|
@ -20403,7 +20286,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(imatrix);
|
GGML_ASSERT(imatrix);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_IQ3_XXS:
|
case GGML_TYPE_IQ3_XXS:
|
||||||
|
@ -20412,7 +20295,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_IQ3_S:
|
case GGML_TYPE_IQ3_S:
|
||||||
|
@ -20421,7 +20304,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_IQ2_S:
|
case GGML_TYPE_IQ2_S:
|
||||||
|
@ -20430,7 +20313,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_IQ1_S:
|
case GGML_TYPE_IQ1_S:
|
||||||
|
@ -20439,7 +20322,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_IQ4_NL:
|
case GGML_TYPE_IQ4_NL:
|
||||||
|
@ -20451,7 +20334,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
#if QK_K != 64
|
#if QK_K != 64
|
||||||
|
@ -20461,7 +20344,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(start % n_per_row == 0);
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
#endif
|
#endif
|
||||||
|
@ -20480,6 +20363,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
default:
|
default:
|
||||||
assert(false);
|
assert(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
GGML_UNUSED(hist); // TODO: populate
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
24
ggml.h
24
ggml.h
|
@ -2194,25 +2194,19 @@ extern "C" {
|
||||||
GGML_API void ggml_quantize_init(enum ggml_type type);
|
GGML_API void ggml_quantize_init(enum ggml_type type);
|
||||||
GGML_API void ggml_quantize_free(void);
|
GGML_API void ggml_quantize_free(void);
|
||||||
|
|
||||||
// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
|
|
||||||
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
||||||
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
||||||
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
||||||
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
||||||
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
||||||
|
|
||||||
GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
||||||
GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
||||||
GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
||||||
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
||||||
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
||||||
|
|
||||||
// some quantization types cannot be used without an importance matrix
|
// some quantization types cannot be used without an importance matrix
|
||||||
GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
|
GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
|
||||||
|
|
||||||
// calls ggml_quantize_init internally (i.e. can allocate memory)
|
// calls ggml_quantize_init internally (i.e. can allocate memory)
|
||||||
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
|
GGML_API size_t ggml_quantize_chunk(
|
||||||
int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
enum ggml_type type,
|
||||||
|
const float * src,
|
||||||
|
void * dst,
|
||||||
|
int start,
|
||||||
|
int nrows,
|
||||||
|
int n_per_row,
|
||||||
|
int64_t * hist,
|
||||||
|
const float * imatrix);
|
||||||
|
|
||||||
//
|
//
|
||||||
// gguf
|
// gguf
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue