diff --git a/ggml.c b/ggml.c
index 8ad19f23e..8bb5144d6 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3497,6 +3497,7 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
     [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
     [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
+    [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
     [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
     [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
     [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
@@ -3537,6 +3538,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q5_1] = true,
     [GGML_TYPE_Q8_0] = true,
     [GGML_TYPE_Q8_1] = true,
+    [GGML_TYPE_Q3_K] = true,
     [GGML_TYPE_Q4_K] = true,
     [GGML_TYPE_Q5_K] = true,
     [GGML_TYPE_Q6_K] = true,
diff --git a/llama.cpp b/llama.cpp
index 7ca643286..ba0f4ad7d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2131,6 +2131,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
+            if (tensor.name == "output.weight") new_type = GGML_TYPE_Q6_K;
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
             llama_buffer f32_conv_buf;