diff --git a/ggml-quants.c b/ggml-quants.c
index e801e0c28..6b88c140a 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -11968,7 +11968,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
     const int * kmap_q2xs = iq2_data[gindex].map;
     const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
 
-    GGML_ASSERT(quant_weights   && "missing quantization weights");
+    //GGML_ASSERT(quant_weights   && "missing quantization weights");
     GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
@@ -12006,8 +12006,12 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
 
         for (int ib = 0; ib < QK_K/block_size; ++ib) {
             const float * xb = xbl + block_size*ib;
-            const float * qw = quant_weights + QK_K*ibl + block_size*ib;
-            for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*ibl + block_size*ib;
+                for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            } else {
+                for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
+            }
             float max = fabsf(xb[0]);
             for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i]));
             if (!max) {
diff --git a/ggml.c b/ggml.c
index be8691349..e4f33ec33 100644
--- a/ggml.c
+++ b/ggml.c
@@ -20353,8 +20353,8 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
     return
         type == GGML_TYPE_IQ2_XXS ||
         type == GGML_TYPE_IQ2_XS  ||
-        type == GGML_TYPE_IQ1_S   ||
-        type == GGML_TYPE_IQ1_M;
+        type == GGML_TYPE_IQ1_S;//   ||
+        //type == GGML_TYPE_IQ1_M;
 }
 
 size_t ggml_quantize_chunk(
diff --git a/llama.cpp b/llama.cpp
index 0c1c5645a..edeec193e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12919,6 +12919,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (!params->pure && ggml_is_quantized(default_type)) {
             new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
         }
+        else if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
+            new_type = params->token_embedding_type;
+        }
+        else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
+            new_type = params->output_tensor_type;
+        }
 
         // If we've decided to quantize to the same type the tensor is already
         // in then there's nothing to do.
@@ -12951,7 +12957,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_type == GGML_TYPE_IQ2_XS  ||
             new_type == GGML_TYPE_IQ2_S   ||
             new_type == GGML_TYPE_IQ1_S   ||
-            new_type == GGML_TYPE_IQ1_M   ||
+           (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
            (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
             LLAMA_LOG_ERROR("\n\n============================================================\n");
             LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
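
For context, here is a small illustrative sketch (not part of the patch) of the two behaviors the diff touches. The first function mirrors the per-block weight selection added to `quantize_row_iq1_m_impl`: with an imatrix the weights are `qw[i] * sqrtf(sigma2 + xb[i]*xb[i])`, without one they fall back to plain squared activations. The second shows a hypothetical caller using the `token_embedding_type` / `output_tensor_type` overrides that the llama.cpp hunk reads from `llama_model_quantize_params`. The helper names (`compute_block_weights`, `quantize_example`), the chosen override types, and the file names are made up for illustration; only the field and API names that appear in the diff or in llama.h are real.

```c
// Illustrative sketch only; names not present in the diff are hypothetical.
#include <math.h>
#include "llama.h"   // llama_model_quantize(), llama_model_quantize_params

// Mirrors the branch added in quantize_row_iq1_m_impl: qw == NULL means
// no importance matrix was supplied for this block.
static void compute_block_weights(const float * xb, const float * qw,
                                  float sigma2, int block_size, float * weight) {
    if (qw) {
        // imatrix present: scale importance weights by the local magnitude
        for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
    } else {
        // no imatrix: fall back to plain squared activations
        for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i];
    }
}

// Hypothetical usage of the per-tensor type overrides: quantize mostly to
// IQ1_M while keeping token_embd.weight and output.weight at higher precision.
// The default for both override fields is GGML_TYPE_COUNT, i.e. "no override",
// which is why the patch checks `< GGML_TYPE_COUNT`.
static int quantize_example(void) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype                = LLAMA_FTYPE_MOSTLY_IQ1_M;
    params.token_embedding_type = GGML_TYPE_Q4_K;  // illustrative choice
    params.output_tensor_type   = GGML_TYPE_Q5_K;  // illustrative choice
    return (int) llama_model_quantize("model-f16.gguf", "model-iq1_m.gguf", &params);
}
```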