iq1_m: checking pure iq1_m quantization
It is pretty bad: PPL(LLaMA-v2-7B) = 34 if we quantize output.weight with Q4_K.
parent abc1d4f951
commit dff85a804b

3 changed files with 16 additions and 6 deletions
ggml-quants.c

@@ -11968,7 +11968,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
     const int * kmap_q2xs = iq2_data[gindex].map;
     const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
 
-    GGML_ASSERT(quant_weights && "missing quantization weights");
+    //GGML_ASSERT(quant_weights && "missing quantization weights");
     GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");

@@ -12006,8 +12006,12 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
 
         for (int ib = 0; ib < QK_K/block_size; ++ib) {
             const float * xb = xbl + block_size*ib;
-            const float * qw = quant_weights + QK_K*ibl + block_size*ib;
-            for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+                for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            } else {
+                for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
+            }
             float max = fabsf(xb[0]);
             for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
             if (!max) {
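Note on the second hunk: this is the behavioral core of the commit. quantize_row_iq1_m_impl no longer assumes an importance matrix; without one it falls back to weighting each value by its own square. A minimal standalone sketch of the two weighting modes (the helper name compute_block_weights is hypothetical; xb, qw, sigma2, and block_size mirror the variables in the hunk):

#include <math.h>

/* Hypothetical helper distilled from the hunk above.
 * With an imatrix (qw != NULL): weight[i] = qw[i] * sqrt(sigma2 + xb[i]^2).
 * Without one (qw == NULL):     weight[i] = xb[i]^2, i.e. importance is
 * approximated by the magnitude of the value itself. */
static void compute_block_weights(const float * xb, const float * qw,
                                  float sigma2, int block_size, float * weight) {
    if (qw) {
        for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
    } else {
        for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
    }
}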
ggml.c (4 changes)
@@ -20353,8 +20353,8 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
     return
         type == GGML_TYPE_IQ2_XXS ||
         type == GGML_TYPE_IQ2_XS  ||
-        type == GGML_TYPE_IQ1_S   ||
-        type == GGML_TYPE_IQ1_M;
+        type == GGML_TYPE_IQ1_S;//   ||
+        //type == GGML_TYPE_IQ1_M;
 }
 
 size_t ggml_quantize_chunk(
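With GGML_TYPE_IQ1_M commented out of this list, ggml_quantize_chunk no longer rejects a NULL imatrix for IQ1_M tensors; the xb[i]*xb[i] fallback above kicks in instead. A caller-side guard could look like this (a sketch; check_imatrix is a hypothetical helper, ggml_quantize_requires_imatrix is the public API shown in the hunk):

#include <assert.h>
#include "ggml.h"

/* Hypothetical guard: only allow a NULL importance matrix when the target
 * type tolerates it. After this commit the assertion passes for IQ1_M. */
static void check_imatrix(enum ggml_type type, const float * imatrix) {
    if (imatrix == NULL) {
        assert(!ggml_quantize_requires_imatrix(type));
    }
}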
llama.cpp

@@ -12919,6 +12919,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (!params->pure && ggml_is_quantized(default_type)) {
             new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
         }
+        else if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
+            new_type = params->token_embedding_type;
+        }
+        else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
+            new_type = params->output_tensor_type;
+        }
 
         // If we've decided to quantize to the same type the tensor is already
         // in then there's nothing to do.
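This hunk is what enables the experiment in the commit message: with params->pure set, every tensor keeps the default type, but token_embd.weight and output.weight can still be overridden. A hedged sketch of how the quoted PPL(LLaMA-v2-7B) = 34 run could be reproduced (assuming LLAMA_FTYPE_MOSTLY_IQ1_M is defined at this commit; the file paths are placeholders):

#include "llama.h"

int main(void) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype              = LLAMA_FTYPE_MOSTLY_IQ1_M; // pure iq1_m everywhere...
    params.pure               = true;                     // ...no k-quant mixtures
    params.output_tensor_type = GGML_TYPE_Q4_K;           // except output.weight, as in the PPL = 34 run
    // placeholder input/output model paths
    return (int) llama_model_quantize("model-f16.gguf", "model-iq1_m.gguf", &params);
}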
@@ -12951,7 +12957,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 new_type == GGML_TYPE_IQ2_XS ||
                 new_type == GGML_TYPE_IQ2_S  ||
                 new_type == GGML_TYPE_IQ1_S  ||
-                new_type == GGML_TYPE_IQ1_M  ||
+                (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
                 (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
                 LLAMA_LOG_ERROR("\n\n============================================================\n");
                 LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
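The new IQ1_M clause uses bare strcmp results as booleans, which is easy to misread: strcmp(a, b) is nonzero when the strings differ, so the clause fires only for tensors that are neither token_embd.weight nor output.weight; those two can now be overridden to a higher-bit type and therefore no longer demand an imatrix. An equivalent, more explicit restatement (iq1_m_needs_imatrix is a hypothetical helper):

#include <stdbool.h>
#include <string.h>

/* Hypothetical restatement of the new condition: under IQ1_M, only tensors
 * other than token_embd.weight and output.weight still require an imatrix. */
static bool iq1_m_needs_imatrix(const char * name) {
    bool is_token_embd = strcmp(name, "token_embd.weight") == 0;
    bool is_output     = strcmp(name, "output.weight")     == 0;
    return !is_token_embd && !is_output;
}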