From 5c5191ab68f28bc24ae26303d73a7ad08015880a Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Mon, 29 May 2023 19:32:43 +0300
Subject: [PATCH] Per convention, all QX_K quantizations use Q5_K for output.weight

---
 ggml.c    | 2 ++
 llama.cpp | 1 +
 2 files changed, 3 insertions(+)

diff --git a/ggml.c b/ggml.c
index 8ad19f23e..8bb5144d6 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3497,6 +3497,7 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
     [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
     [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
+    [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
     [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
     [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
     [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
@@ -3537,6 +3538,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q5_1] = true,
     [GGML_TYPE_Q8_0] = true,
     [GGML_TYPE_Q8_1] = true,
+    [GGML_TYPE_Q3_K] = true,
     [GGML_TYPE_Q4_K] = true,
     [GGML_TYPE_Q5_K] = true,
     [GGML_TYPE_Q6_K] = true,
diff --git a/llama.cpp b/llama.cpp
index 7ca643286..ba0f4ad7d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2131,6 +2131,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
+            if (tensor.name == "output.weight") new_type = GGML_TYPE_Q6_K;
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
             llama_buffer f32_conv_buf;
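
The ggml.c hunks only register the Q3_K block size and mark it as a quantized type; the behavioural change is the llama.cpp hunk, which keeps the requested k-quant type for every tensor but forces output.weight to GGML_TYPE_Q6_K. The snippet below restates that selection logic as a standalone, compilable sketch. The trimmed ggml_type enum and the pick_type helper are illustrative stand-ins, not the actual ggml/llama.cpp definitions.

#include <cstdio>
#include <string>

// Illustrative stand-in for the relevant ggml type ids; the real enum lives
// in ggml.h and contains many more members.
enum ggml_type { GGML_TYPE_Q3_K, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K };

// Hypothetical helper mirroring the patched selection inside
// llama_model_quantize_internal(): use the requested quantization type for
// every tensor, except output.weight, which is always quantized with Q6_K.
static ggml_type pick_type(const std::string & tensor_name, ggml_type quantized_type) {
    ggml_type new_type = quantized_type;
    if (tensor_name == "output.weight") new_type = GGML_TYPE_Q6_K;
    return new_type;
}

int main() {
    // A regular weight keeps the requested type; output.weight is promoted.
    printf("layers.0.attention.wq.weight -> %d\n", pick_type("layers.0.attention.wq.weight", GGML_TYPE_Q3_K));
    printf("output.weight                -> %d\n", pick_type("output.weight", GGML_TYPE_Q3_K));
    return 0;
}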