Improve help output in quantize tool

KerfuffleV2 2023-06-13 03:14:06 -06:00
parent 1bedac6ff0
commit b3ea026c92

@@ -17,28 +17,28 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     {
         "Q4_0",
         LLAMA_FTYPE_MOSTLY_Q4_0,
-        "approx +0.2499 perplexity, 3.50 GiB size @ 7B",
+        " 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M",
     },
     {
         "Q4_1",
         LLAMA_FTYPE_MOSTLY_Q4_1,
-        "approx +0.1846 perplexity, 3.90 GiB size @ 7B",
+        " 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L",
     },
     {
         "Q5_0",
         LLAMA_FTYPE_MOSTLY_Q5_0,
-        "approx +0.0796 perplexity, 4.30 GiB size @ 7B",
+        " 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M",
     },
     {
         "Q5_1",
         LLAMA_FTYPE_MOSTLY_Q5_1,
-        "approx +0.0415 perplexity, 4.70 GiB size @ 7B",
+        " 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M",
     },
 #ifdef GGML_USE_K_QUANTS
     {
         "Q2_K",
         LLAMA_FTYPE_MOSTLY_Q2_K,
-        "approx +0.8698 perplexity, 2.67 GiB size @ 7B",
+        " 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended",
     },
     {
         "Q3_K",
@@ -48,17 +48,17 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     {
         "Q3_K_S",
         LLAMA_FTYPE_MOSTLY_Q3_K_S,
-        "approx +0.5505 perplexity, 2.75 GiB size @ 7B",
+        " 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss",
     },
     {
         "Q3_K_M",
         LLAMA_FTYPE_MOSTLY_Q3_K_M,
-        "approx +0.2437 perplexity, 3.06 GiB size @ 7B",
+        " 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss",
     },
     {
         "Q3_K_L",
         LLAMA_FTYPE_MOSTLY_Q3_K_L,
-        "approx +0.1803 perplexity, 3.35 GiB size @ 7B",
+        " 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss",
     },
     {
         "Q4_K",
@@ -68,12 +68,12 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     {
         "Q4_K_S",
         LLAMA_FTYPE_MOSTLY_Q4_K_S,
-        "approx +0.1149 perplexity, 3.56 GiB size @ 7B",
+        " 3.56G, +0.1149 ppl @ 7B - small, significant quality loss",
     },
     {
         "Q4_K_M",
         LLAMA_FTYPE_MOSTLY_Q4_K_M,
-        "approx +0.0535 perplexity, 3.80 GiB size @ 7B",
+        " 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*",
     },
     {
         "Q5_K",
@@ -83,33 +83,33 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     {
         "Q5_K_S",
         LLAMA_FTYPE_MOSTLY_Q5_K_S,
-        "approx +0.0353 perplexity, 4.33 GiB size @ 7B",
+        " 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*",
     },
     {
         "Q5_K_M",
         LLAMA_FTYPE_MOSTLY_Q5_K_M,
-        "approx +0.0142 perplexity, 4.45 GiB size @ 7B",
+        " 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*",
     },
     {
         "Q6_K",
         LLAMA_FTYPE_MOSTLY_Q6_K,
-        "approx +0.0044 perplexity, 5.15 GiB size @ 7B",
+        " 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss",
     },
 #endif
     {
         "Q8_0",
         LLAMA_FTYPE_MOSTLY_Q8_0,
-        "approx +0.0004 perplexity, 6.70 GiB size @ 7B",
+        " 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended",
     },
     {
         "F16",
         LLAMA_FTYPE_MOSTLY_F16,
-        "no significant perplexity increase, 13.00 GiB size @ 7B",
+        "13.00G @ 7B - extremely large, virtually no quality loss - not recommended",
     },
     {
         "F32",
         LLAMA_FTYPE_ALL_F32,
-        "full quality, 26.00 GiB size @ 7B",
+        "26.00G @ 7B - absolutely huge, lossless - not recommended",
     },
 };
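
Each entry in the table above ties a human-readable type name to its llama_ftype enum value and to the description string the updated help text prints. As a rough illustration of how such a table is consumed (the real lookup lives in try_parse_ftype in this same file; the struct layout, field names, and the numeric id below are assumptions for the sketch, not the actual implementation):

#include <cstdio>
#include <string>
#include <vector>

// Illustrative stand-in for the table above; the real tool stores a llama_ftype
// enum value from llama.h where this sketch uses a plain int.
struct quant_option {
    std::string name;   // e.g. "Q4_K_M"
    int         ftype;  // numeric id accepted on the command line
    std::string desc;   // help text printed by usage()
};

// Accept either the symbolic name ("Q4_K_M") or its numeric id ("15").
static bool parse_ftype(const std::vector<quant_option> & opts,
                        const std::string & arg, int & ftype_out) {
    for (const auto & opt : opts) {
        if (arg == opt.name || arg == std::to_string(opt.ftype)) {
            ftype_out = opt.ftype;
            return true;
        }
    }
    return false;  // unknown type: the caller would print usage() and exit
}

int main() {
    // One sample row; the numeric id 15 is illustrative, not taken from llama.h.
    const std::vector<quant_option> opts = {
        { "Q4_K_M", 15, " 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*" },
    };
    int ftype = -1;
    if (parse_ftype(opts, "Q4_K_M", ftype)) {
        std::printf("parsed ftype = %d\n", ftype);
    }
    return 0;
}

Matching by either name or number is what lets the help listing in usage() show both forms on one line (" %2d or %-6s : %s").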
@@ -144,15 +144,15 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 }
 // usage:
-// ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
 void usage(const char * executable) {
-    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n", executable);
+    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
     fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
-    fprintf(stderr, "Allowed quantization types:\n");
+    fprintf(stderr, "\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
-        printf(" %2d or %6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
+        printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
     }
     exit(1);
 }
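
On the formatting tweak in the last hunk: switching %6s to %-6s left-aligns the type name inside a six-character field, so the colons and descriptions line up even though names vary in length (Q4_0 vs Q4_K_M), and the added "\n" characters simply insert blank lines that group the option list apart from the usage line. A small, self-contained sketch of the alignment effect; the numeric ids and shortened descriptions are placeholder sample values, not taken from llama.h:

#include <cstdio>

int main() {
    // Sample rows only; ids and descriptions are placeholder values.
    const struct { int id; const char * name; const char * desc; } rows[] = {
        {  2, "Q4_0",   "3.50G, +0.2499 ppl @ 7B" },
        { 15, "Q4_K_M", "3.80G, +0.0535 ppl @ 7B" },
    };

    std::printf("old format (%%6s, names right-aligned):\n");
    for (const auto & r : rows) {
        std::printf(" %2d or %6s : %s\n", r.id, r.name, r.desc);
    }

    std::printf("new format (%%-6s, names left-aligned):\n");
    for (const auto & r : rows) {
        std::printf(" %2d or %-6s : %s\n", r.id, r.name, r.desc);
    }
    return 0;
}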