diff --git a/examples/perplexity/README.md b/examples/perplexity/README.md index cc53a82e5..04344d21a 100644 --- a/examples/perplexity/README.md +++ b/examples/perplexity/README.md @@ -73,6 +73,51 @@ The "WT 2.7m" importance matrix was created using 2.7 million Wikitext tokens an ## LLaMA 2 vs. LLaMA 3 Quantization comparison +| Metric | L2 7b q2_K | L3 8b q2_K | L2 7b q4_K_M | L3 8b q4_K_M | L2 7b q6_K | L3 8b q6_K | L2 7b q8_0 | L3 8b q8_0 | +|-----------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------| +| Mean PPL | 5.794552 ± 0.032298 | 9.751568 ± 0.063312 | 5.877078 ± 0.032781 | 6.407115 ± 0.039119 | 5.808494 ± 0.032425 | 6.253382 ± 0.038078 | 5.798542 ± 0.032366 | 6.234284 ± 0.037878 | +| Mean PPL ratio | 1.107955 ± 0.001427 | 1.564849 ± 0.004525 | 1.014242 ± 0.000432 | 1.028160 ± 0.000723 | 1.002406 ± 0.000191 | 1.003490 ± 0.000296 | 1.000689 ± 0.000107 | 1.000425 ± 0.000161 | +| Mean ΔPPL | 0.625552 ± 0.008725 | 3.519934 ± 0.033863 | 0.082526 ± 0.002530 | 0.175482 ± 0.004620 | 0.013941 ± 0.001110 | 0.021748 ± 0.001852 | 0.003990 ± 0.000624 | 0.002650 ± 0.001006 | +| PPL correlation | 97.36% | 89.62% | 99.71% | 99.34% | 99.94% | 99.88% | 99.98% | 99.96% | +| Mean KLD | 0.108903 ± 0.000645 | 0.445132 ± 0.001835 | 0.012686 ± 0.000079 | 0.031273 ± 0.000238 | 0.002098 ± 0.000014 | 0.005452 ± 0.000035 | 0.000369 ± 0.000007 | 0.001355 ± 0.000006 | +| Mean Δp | -2.710 ± 0.023 % | -9.123 ± 0.051 % | -0.416 ± 0.008 % | -0.596 ± 0.014 % | -0.035 ± 0.003 % | -0.007 ± 0.006 % | -0.005 ± 0.002 % | -0.019 ± 0.003 % | +| Maximum Δp | 85.136% | 94.268% | 45.209% | 95.054% | 23.593% | 53.601% | 43.925% | 28.734% | +| 99.9% Δp | 37.184% | 50.003% | 17.461% | 27.084% | 7.798% | 13.613% | 3.387% | 6.402% | +| 99.0% Δp | 18.131% | 25.875% | 7.798% | 12.084% | 3.838% | 6.407% | 1.867% | 3.544% | +| Median Δp | -0.391% | -2.476% | -0.026% | -0.024% | -0.001% | 0.000% | -0.000% | -0.000% | +| 1.0% Δp | -39.762% | -87.173% | -11.433% | -19.567% | -4.222% | -6.767% | -1.862% | -3.698% | +| 0.1% Δp | -79.002% | -98.897% | -26.433% | -56.054% | -9.091% | -16.584% | -3.252% | -6.579% | +| Minimum Δp | -99.915% | -99.965% | -83.383% | -98.699% | -43.142% | -68.487% | -9.343% | -24.301% | +| RMS Δp | 9.762 ± 0.053 % | 21.421 ± 0.079 % | 3.252 ± 0.024 % | 5.519 ± 0.050 % | 1.339 ± 0.010 % | 2.295 ± 0.019 % | 0.618 ± 0.011 % | 1.198 ± 0.007 % | +| Same top p | 85.584 ± 0.086 % | 71.138 ± 0.119 % | 94.665 ± 0.055 % | 91.901 ± 0.072 % | 97.520 ± 0.038 % | 96.031 ± 0.051 % | 98.846 ± 0.026 % | 97.674 ± 0.040 % | + + +## Old Numbers + +
+Llama 2 70B Scoreboard + +| Quantization | Model size (GiB) | Perplexity | Delta to fp16 | +|--------------|------------------|------------|---------------| +| Q4_0 | 36.20 | 3.5550 | 3.61% | +| Q4_1 | 40.20 | 3.5125 | 2.37% | +| Q5_0 | 44.20 | 3.4744 | 1.26% | +| Q2_K | 27.27 | 3.7339 | 8.82% | +| Q3_K_S | 27.86 | 3.7019 | 7.89% | +| Q3_K_M | 30.83 | 3.5932 | 4.72% | +| Q3_K_L | 33.67 | 3.5617 | 3.80% | +| Q4_K_S | 36.39 | 3.4852 | 1.57% | +| Q4_K_M | 38.54 | 3.4725 | 1.20% | +| Q5_K_S | 44.20 | 3.4483 | 0.50% | +| Q5_K_M | 45.41 | 3.4451 | 0.40% | +| Q6_K | 52.70 | 3.4367 | 0.16% | +| fp16 | 128.5 | 3.4313 | - | + +
+ +
+Pre BPE tokenizer quantization comparison + | Metric | L2 7b q2_K | L3 8b q2_K | L2 7b q4_K_M | L3 8b q4_K_M | L2 7b q6_K | L3 8b q6_K | L2 7b q8_0 | L3 8b q8_0 | |-----------------|---------------------|----------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------| | Mean PPL | 5.794552 ± 0.032298 | 10.641563 ± 0.071555 | 5.877078 ± 0.032781 | 6.956203 ± 0.044123 | 5.808494 ± 0.032425 | 6.796525 ± 0.043079 | 5.798542 ± 0.032366 | 6.764558 ± 0.042733 | @@ -109,25 +154,4 @@ The "WT 2.7m" importance matrix was created using 2.7 million Wikitext tokens an | RMS Δp | 17.237 ± 0.077 % | 34.361 ± 0.094 % | 4.154 ± 0.032 % | 9.915 ± 0.067 % | 1.899 ± 0.015 % | 3.721 ± 0.030 % | 1.085 ± 0.007 % | 2.124 ± 0.018 % | | Same top p | 85.001 ± 0.087 % | 71.991 ± 0.118 % | 95.632 ± 0.050 % | 92.881 ± 0.068 % | 97.651 ± 0.037 % | 96.538 ± 0.048 % | 98.502 ± 0.030 % | 97.825 ± 0.038 % | -## Old Numbers - -
-Llama 2 70B Scorechart - -| Quantization | Model size (GiB) | Perplexity | Delta to fp16 | -|--------------|------------------|------------|---------------| -| Q4_0 | 36.20 | 3.5550 | 3.61% | -| Q4_1 | 40.20 | 3.5125 | 2.37% | -| Q5_0 | 44.20 | 3.4744 | 1.26% | -| Q2_K | 27.27 | 3.7339 | 8.82% | -| Q3_K_S | 27.86 | 3.7019 | 7.89% | -| Q3_K_M | 30.83 | 3.5932 | 4.72% | -| Q3_K_L | 33.67 | 3.5617 | 3.80% | -| Q4_K_S | 36.39 | 3.4852 | 1.57% | -| Q4_K_M | 38.54 | 3.4725 | 1.20% | -| Q5_K_S | 44.20 | 3.4483 | 0.50% | -| Q5_K_M | 45.41 | 3.4451 | 0.40% | -| Q6_K | 52.70 | 3.4367 | 0.16% | -| fp16 | 128.5 | 3.4313 | - | - -
+