Allow quantizing k-quants to fall back when tensor size is incompatible

This commit is contained in:
KerfuffleV2 2023-10-23 09:24:24 -06:00
parent 34b2a5e1ee
commit 7f20d78e7e

View file

@@ -8133,20 +8133,20 @@ static ggml_type get_k_quant_type(
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
-            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
             convert_incompatible_tensor = true;
         }
     }
     if (convert_incompatible_tensor) {
-        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-            new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-            new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-        } else {
-            throw std::runtime_error("Unsupported tensor size encountered\n");
+        switch (new_type) {
+            case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
+            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
+            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
         }
+        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
     }
     return new_type;