diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index c2acd0f25..1376b13c1 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -621,8 +621,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
-    // sanity checks
-    if (!llama_model_is_recurrent(&model))
+    // sanity checks for models that have attention layers
+    if (qs.n_attention_wv != 0)
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
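
For context, a minimal standalone sketch of the check this hunk guards. Only part of the surrounding code is visible in the context lines, so everything outside the diff here (the sample layer counts, the array contents, and the use of a plain assert in place of llama.cpp's own assertion macro) is illustrative, not a copy of the real implementation. The idea: an attention layer is one with a non-zero KV-head count, so the number of attn_v tensors collected into qs.n_attention_wv during the tensor pass should equal the number of non-zero entries in n_head_kv_arr. Gating on qs.n_attention_wv != 0 instead of !llama_model_is_recurrent(&model) presumably keeps the check active for hybrid models (recurrent models that still contain attention layers) and skips it for models with no attention layers at all.

    // Hedged sketch: names mirror the diff, scaffolding is hypothetical.
    #include <algorithm>
    #include <array>
    #include <cassert>
    #include <cstdint>

    int main() {
        // Hypothetical hybrid model: 6 layers, 2 of them recurrent (0 kv heads).
        const int32_t n_layer = 6;
        const std::array<uint32_t, 6> n_head_kv_arr = {8, 8, 0, 8, 0, 8};

        // What qs.n_attention_wv would hold after scanning the model's tensors:
        // one attn_v weight per attention layer.
        const int32_t n_attention_wv = 4;

        // sanity checks for models that have attention layers
        if (n_attention_wv != 0) {
            // attention layers have a non-zero number of kv heads
            const auto n_head_kv_iter = n_head_kv_arr.begin();
            const int32_t n_attn_layer = n_layer
                - (int32_t) std::count(n_head_kv_iter, n_head_kv_iter + n_layer, 0);
            assert(n_attention_wv == n_attn_layer && "n_attention_wv is unexpected");
        }
        return 0;
    }

Deriving the expected count from n_head_kv_arr rather than from a model-level flag lets attention-free and hybrid architectures fall out naturally: when no attention tensors were seen, the check simply becomes a no-op.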