Improve help output in quantize tool

KerfuffleV2 2023-06-13 03:14:06 -06:00
parent 1bedac6ff0
commit b3ea026c92

@@ -17,28 +17,28 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     {
         "Q4_0",
         LLAMA_FTYPE_MOSTLY_Q4_0,
-        "approx +0.2499 perplexity, 3.50 GiB size @ 7B",
+        " 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M",
     },
     {
         "Q4_1",
         LLAMA_FTYPE_MOSTLY_Q4_1,
-        "approx +0.1846 perplexity, 3.90 GiB size @ 7B",
+        " 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L",
     },
     {
         "Q5_0",
         LLAMA_FTYPE_MOSTLY_Q5_0,
-        "approx +0.0796 perplexity, 4.30 GiB size @ 7B",
+        " 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M",
     },
     {
         "Q5_1",
         LLAMA_FTYPE_MOSTLY_Q5_1,
-        "approx +0.0415 perplexity, 4.70 GiB size @ 7B",
+        " 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M",
     },
 #ifdef GGML_USE_K_QUANTS
     {
         "Q2_K",
         LLAMA_FTYPE_MOSTLY_Q2_K,
-        "approx +0.8698 perplexity, 2.67 GiB size @ 7B",
+        " 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended",
     },
     {
         "Q3_K",
@@ -48,17 +48,17 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     {
         "Q3_K_S",
         LLAMA_FTYPE_MOSTLY_Q3_K_S,
-        "approx +0.5505 perplexity, 2.75 GiB size @ 7B",
+        " 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss",
     },
     {
         "Q3_K_M",
         LLAMA_FTYPE_MOSTLY_Q3_K_M,
-        "approx +0.2437 perplexity, 3.06 GiB size @ 7B",
+        " 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss",
     },
     {
         "Q3_K_L",
         LLAMA_FTYPE_MOSTLY_Q3_K_L,
-        "approx +0.1803 perplexity, 3.35 GiB size @ 7B",
+        " 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss",
     },
     {
         "Q4_K",
@@ -68,12 +68,12 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     {
         "Q4_K_S",
         LLAMA_FTYPE_MOSTLY_Q4_K_S,
-        "approx +0.1149 perplexity, 3.56 GiB size @ 7B",
+        " 3.56G, +0.1149 ppl @ 7B - small, significant quality loss",
     },
     {
         "Q4_K_M",
         LLAMA_FTYPE_MOSTLY_Q4_K_M,
-        "approx +0.0535 perplexity, 3.80 GiB size @ 7B",
+        " 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*",
     },
     {
         "Q5_K",
@@ -83,33 +83,33 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     {
         "Q5_K_S",
         LLAMA_FTYPE_MOSTLY_Q5_K_S,
-        "approx +0.0353 perplexity, 4.33 GiB size @ 7B",
+        " 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*",
     },
     {
         "Q5_K_M",
         LLAMA_FTYPE_MOSTLY_Q5_K_M,
-        "approx +0.0142 perplexity, 4.45 GiB size @ 7B",
+        " 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*",
     },
     {
         "Q6_K",
         LLAMA_FTYPE_MOSTLY_Q6_K,
-        "approx +0.0044 perplexity, 5.15 GiB size @ 7B",
+        " 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss",
     },
 #endif
     {
         "Q8_0",
         LLAMA_FTYPE_MOSTLY_Q8_0,
-        "approx +0.0004 perplexity, 6.70 GiB size @ 7B",
+        " 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended",
     },
     {
         "F16",
         LLAMA_FTYPE_MOSTLY_F16,
-        "no significant perplexity increase, 13.00 GiB size @ 7B",
+        "13.00G @ 7B - extremely large, virtually no quality loss - not recommended",
     },
     {
         "F32",
         LLAMA_FTYPE_ALL_F32,
-        "full quality, 26.00 GiB size @ 7B",
+        "26.00G @ 7B - absolutely huge, lossless - not recommended",
     },
 };
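
Each entry in the table above ties a human-readable type name to its llama_ftype enum value and to the description string the updated help text prints. As a rough illustration of how such a table is consumed (the real lookup lives in try_parse_ftype in this same file; the struct layout, field names, and the numeric id below are assumptions for the sketch, not the actual implementation):

#include <cstdio>
#include <string>
#include <vector>

// Illustrative stand-in for the table above; the real tool stores a llama_ftype
// enum value from llama.h where this sketch uses a plain int.
struct quant_option {
    std::string name;   // e.g. "Q4_K_M"
    int         ftype;  // numeric id accepted on the command line
    std::string desc;   // help text printed by usage()
};

// Accept either the symbolic name ("Q4_K_M") or its numeric id ("15").
static bool parse_ftype(const std::vector<quant_option> & opts,
                        const std::string & arg, int & ftype_out) {
    for (const auto & opt : opts) {
        if (arg == opt.name || arg == std::to_string(opt.ftype)) {
            ftype_out = opt.ftype;
            return true;
        }
    }
    return false;  // unknown type: the caller would print usage() and exit
}

int main() {
    // One sample row; the numeric id 15 is illustrative, not taken from llama.h.
    const std::vector<quant_option> opts = {
        { "Q4_K_M", 15, " 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*" },
    };
    int ftype = -1;
    if (parse_ftype(opts, "Q4_K_M", ftype)) {
        std::printf("parsed ftype = %d\n", ftype);
    }
    return 0;
}

Matching by either name or number is what lets the help listing in usage() show both forms on one line (" %2d or %-6s : %s").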
@@ -144,15 +144,15 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 }
 // usage:
-// ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
 void usage(const char * executable) {
-    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n", executable);
+    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
     fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
-    fprintf(stderr, "Allowed quantization types:\n");
+    fprintf(stderr, "\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
-        printf(" %2d or %6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
+        printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
     }
     exit(1);
 }
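
On the formatting tweak in the last hunk: switching %6s to %-6s left-aligns the type name inside a six-character field, so the colons and descriptions line up even though names vary in length (Q4_0 vs Q4_K_M), and the added "\n" characters simply insert blank lines that group the option list apart from the usage line. A small, self-contained sketch of the alignment effect; the numeric ids and shortened descriptions are placeholder sample values, not taken from llama.h:

#include <cstdio>

int main() {
    // Sample rows only; ids and descriptions are placeholder values.
    const struct { int id; const char * name; const char * desc; } rows[] = {
        {  2, "Q4_0",   "3.50G, +0.2499 ppl @ 7B" },
        { 15, "Q4_K_M", "3.80G, +0.0535 ppl @ 7B" },
    };

    std::printf("old format (%%6s, names right-aligned):\n");
    for (const auto & r : rows) {
        std::printf(" %2d or %6s : %s\n", r.id, r.name, r.desc);
    }

    std::printf("new format (%%-6s, names left-aligned):\n");
    for (const auto & r : rows) {
        std::printf(" %2d or %-6s : %s\n", r.id, r.name, r.desc);
    }
    return 0;
}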