ggml : add TQ1_0 and TQ2_0 ternary quantization types
commit 77b8f84ae7 (parent 79a278e922)
10 changed files with 563 additions and 16 deletions
@@ -26,6 +26,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ2_M",  LLAMA_FTYPE_MOSTLY_IQ2_M,  " 2.7 bpw quantization",            },
     { "IQ1_S",  LLAMA_FTYPE_MOSTLY_IQ1_S,  " 1.56 bpw quantization",           },
     { "IQ1_M",  LLAMA_FTYPE_MOSTLY_IQ1_M,  " 1.75 bpw quantization",           },
+    { "TQ1_0",  LLAMA_FTYPE_MOSTLY_TQ1_0,  " 1.69 bpw ternarization",          },
+    { "TQ2_0",  LLAMA_FTYPE_MOSTLY_TQ2_0,  " 2.06 bpw ternarization",          },
     { "Q1_3",   LLAMA_FTYPE_MOSTLY_Q1_3,   " 1.63 bpw for BitNet b1.58",       },
     { "Q2_2",   LLAMA_FTYPE_MOSTLY_Q2_2,   " 2.00 bpw for BitNet b1.58",       },
     { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.96G, +3.5199 ppl @ Llama-3-8B", },
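For context on the two new entries, here is a back-of-envelope sketch of where the quoted 1.69 and 2.06 bpw figures come from. The block sizes and scale layout below are assumptions for illustration (256 weights per block, one fp16 scale each), not the literal ggml structs from this commit: TQ1_0 packs ternary values 5 per byte (3^5 = 243 fits in 8 bits), while TQ2_0 stores each ternary value in 2 bits.

#include <cstdio>

int main() {
    const int n           = 256; // assumed weights per block
    const int scale_bytes = 2;   // assumed: one fp16 scale per block

    // TQ1_0: ceil(256 / 5) = 52 bytes of packed trits + 2 bytes scale = 54 bytes
    const int tq1_bytes = (n + 4) / 5 + scale_bytes;
    // TQ2_0: 256 weights * 2 bits = 64 bytes of quants + 2 bytes scale = 66 bytes
    const int tq2_bytes = n / 4 + scale_bytes;

    printf("TQ1_0: %.4f bpw\n", 8.0 * tq1_bytes / n); // 1.6875 (~1.69 bpw)
    printf("TQ2_0: %.4f bpw\n", 8.0 * tq2_bytes / n); // 2.0625 (~2.06 bpw)
    return 0;
}

The actual block structs may split the packed bytes differently, but the per-block totals implied by this arithmetic line up with the per-weight costs listed in the quantize options above.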