From b3ea026c924705467b6f34fec73e05bc612db798 Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Tue, 13 Jun 2023 03:14:06 -0600
Subject: [PATCH] Improve help output in quantize tool

---
 examples/quantize/quantize.cpp | 40 +++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 877c31373..4e8e6f523 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -17,28 +17,28 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     {
         "Q4_0",
         LLAMA_FTYPE_MOSTLY_Q4_0,
-        "approx +0.2499 perplexity, 3.50 GiB size @ 7B",
+        " 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M",
     },
     {
         "Q4_1",
         LLAMA_FTYPE_MOSTLY_Q4_1,
-        "approx +0.1846 perplexity, 3.90 GiB size @ 7B",
+        " 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L",
     },
     {
         "Q5_0",
         LLAMA_FTYPE_MOSTLY_Q5_0,
-        "approx +0.0796 perplexity, 4.30 GiB size @ 7B",
+        " 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M",
     },
     {
         "Q5_1",
         LLAMA_FTYPE_MOSTLY_Q5_1,
-        "approx +0.0415 perplexity, 4.70 GiB size @ 7B",
+        " 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M",
     },
 #ifdef GGML_USE_K_QUANTS
     {
         "Q2_K",
         LLAMA_FTYPE_MOSTLY_Q2_K,
-        "approx +0.8698 perplexity, 2.67 GiB size @ 7B",
+        " 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended",
     },
     {
         "Q3_K",
@@ -48,17 +48,17 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     {
         "Q3_K_S",
         LLAMA_FTYPE_MOSTLY_Q3_K_S,
-        "approx +0.5505 perplexity, 2.75 GiB size @ 7B",
+        " 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss",
     },
     {
         "Q3_K_M",
         LLAMA_FTYPE_MOSTLY_Q3_K_M,
-        "approx +0.2437 perplexity, 3.06 GiB size @ 7B",
+        " 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss",
     },
     {
         "Q3_K_L",
         LLAMA_FTYPE_MOSTLY_Q3_K_L,
-        "approx +0.1803 perplexity, 3.35 GiB size @ 7B",
+        " 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss",
     },
     {
         "Q4_K",
@@ -68,12 +68,12 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     {
         "Q4_K_S",
         LLAMA_FTYPE_MOSTLY_Q4_K_S,
-        "approx +0.1149 perplexity, 3.56 GiB size @ 7B",
+        " 3.56G, +0.1149 ppl @ 7B - small, significant quality loss",
     },
     {
         "Q4_K_M",
         LLAMA_FTYPE_MOSTLY_Q4_K_M,
-        "approx +0.0535 perplexity, 3.80 GiB size @ 7B",
+        " 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*",
     },
     {
         "Q5_K",
@@ -83,33 +83,33 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     {
         "Q5_K_S",
         LLAMA_FTYPE_MOSTLY_Q5_K_S,
-        "approx +0.0353 perplexity, 4.33 GiB size @ 7B",
+        " 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*",
     },
     {
         "Q5_K_M",
         LLAMA_FTYPE_MOSTLY_Q5_K_M,
-        "approx +0.0142 perplexity, 4.45 GiB size @ 7B",
+        " 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*",
     },
     {
         "Q6_K",
         LLAMA_FTYPE_MOSTLY_Q6_K,
-        "approx +0.0044 perplexity, 5.15 GiB size @ 7B",
+        " 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss",
     },
 #endif
     {
         "Q8_0",
         LLAMA_FTYPE_MOSTLY_Q8_0,
-        "approx +0.0004 perplexity, 6.70 GiB size @ 7B",
+        " 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended",
     },
     {
         "F16",
         LLAMA_FTYPE_MOSTLY_F16,
-        "no significant perplexity increase, 13.00 GiB size @ 7B",
+        "13.00G @ 7B - extremely large, virtually no quality loss - not recommended",
     },
     {
         "F32",
         LLAMA_FTYPE_ALL_F32,
-        "full quality, 26.00 GiB size @ 7B",
+        "26.00G @ 7B - absolutely huge, lossless - not recommended",
     },
 };
 
@@ -144,15 +144,15 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 }
 
 // usage:
-//  ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
 void usage(const char * executable) {
-    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n", executable);
+    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
     fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
-    fprintf(stderr, "Allowed quantization types:\n");
+    fprintf(stderr, "\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
-        printf("  %2d or %6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
+        printf("  %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
     }
     exit(1);
 }