llama : valign + remove unused ftype (#8502)
This commit is contained in:
		
							parent
							
								
									7acfd4e8d5
								
							
						
					
					
						commit
						0efec57787
					
				
					 3 changed files with 72 additions and 76 deletions
				
			
		|  | @ -16,44 +16,44 @@ struct quant_option { | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| static const std::vector<struct quant_option> QUANT_OPTIONS = { | static const std::vector<struct quant_option> QUANT_OPTIONS = { | ||||||
|     { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 4.34G, +0.4685 ppl @ Llama-3-8B",  }, |     { "Q4_0",     LLAMA_FTYPE_MOSTLY_Q4_0,     " 4.34G, +0.4685 ppl @ Llama-3-8B",  }, | ||||||
|     { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 4.78G, +0.4511 ppl @ Llama-3-8B",  }, |     { "Q4_1",     LLAMA_FTYPE_MOSTLY_Q4_1,     " 4.78G, +0.4511 ppl @ Llama-3-8B",  }, | ||||||
|     { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 5.21G, +0.1316 ppl @ Llama-3-8B",  }, |     { "Q5_0",     LLAMA_FTYPE_MOSTLY_Q5_0,     " 5.21G, +0.1316 ppl @ Llama-3-8B",  }, | ||||||
|     { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 5.65G, +0.1062 ppl @ Llama-3-8B",  }, |     { "Q5_1",     LLAMA_FTYPE_MOSTLY_Q5_1,     " 5.65G, +0.1062 ppl @ Llama-3-8B",  }, | ||||||
|     { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization",            }, |     { "IQ2_XXS",  LLAMA_FTYPE_MOSTLY_IQ2_XXS,  " 2.06 bpw quantization",            }, | ||||||
|     { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization",            }, |     { "IQ2_XS",   LLAMA_FTYPE_MOSTLY_IQ2_XS,   " 2.31 bpw quantization",            }, | ||||||
|     { "IQ2_S",  LLAMA_FTYPE_MOSTLY_IQ2_S,  " 2.5  bpw quantization",            }, |     { "IQ2_S",    LLAMA_FTYPE_MOSTLY_IQ2_S,    " 2.5  bpw quantization",            }, | ||||||
|     { "IQ2_M",  LLAMA_FTYPE_MOSTLY_IQ2_M,  " 2.7  bpw quantization",            }, |     { "IQ2_M",    LLAMA_FTYPE_MOSTLY_IQ2_M,    " 2.7  bpw quantization",            }, | ||||||
|     { "IQ1_S",  LLAMA_FTYPE_MOSTLY_IQ1_S,  " 1.56 bpw quantization",            }, |     { "IQ1_S",    LLAMA_FTYPE_MOSTLY_IQ1_S,    " 1.56 bpw quantization",            }, | ||||||
|     { "IQ1_M",  LLAMA_FTYPE_MOSTLY_IQ1_M,  " 1.75 bpw quantization",            }, |     { "IQ1_M",    LLAMA_FTYPE_MOSTLY_IQ1_M,    " 1.75 bpw quantization",            }, | ||||||
|     { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.96G, +3.5199 ppl @ Llama-3-8B",  }, |     { "Q2_K",     LLAMA_FTYPE_MOSTLY_Q2_K,     " 2.96G, +3.5199 ppl @ Llama-3-8B",  }, | ||||||
|     { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B",  }, |     { "Q2_K_S",   LLAMA_FTYPE_MOSTLY_Q2_K_S,   " 2.96G, +3.1836 ppl @ Llama-3-8B",  }, | ||||||
|     { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization",            }, |     { "IQ3_XXS",  LLAMA_FTYPE_MOSTLY_IQ3_XXS,  " 3.06 bpw quantization",            }, | ||||||
|     { "IQ3_S",  LLAMA_FTYPE_MOSTLY_IQ3_S,  " 3.44 bpw quantization",            }, |     { "IQ3_S",    LLAMA_FTYPE_MOSTLY_IQ3_S,    " 3.44 bpw quantization",            }, | ||||||
|     { "IQ3_M",  LLAMA_FTYPE_MOSTLY_IQ3_M,  " 3.66 bpw quantization mix",        }, |     { "IQ3_M",    LLAMA_FTYPE_MOSTLY_IQ3_M,    " 3.66 bpw quantization mix",        }, | ||||||
|     { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M"                   }, |     { "Q3_K",     LLAMA_FTYPE_MOSTLY_Q3_K_M,   "alias for Q3_K_M"                   }, | ||||||
|     { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization",             }, |     { "IQ3_XS",   LLAMA_FTYPE_MOSTLY_IQ3_XS,   " 3.3 bpw quantization",             }, | ||||||
|     { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B",  }, |     { "Q3_K_S",   LLAMA_FTYPE_MOSTLY_Q3_K_S,   " 3.41G, +1.6321 ppl @ Llama-3-8B",  }, | ||||||
|     { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B",  }, |     { "Q3_K_M",   LLAMA_FTYPE_MOSTLY_Q3_K_M,   " 3.74G, +0.6569 ppl @ Llama-3-8B",  }, | ||||||
|     { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B",  }, |     { "Q3_K_L",   LLAMA_FTYPE_MOSTLY_Q3_K_L,   " 4.03G, +0.5562 ppl @ Llama-3-8B",  }, | ||||||
|     { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", }, |     { "IQ4_NL",   LLAMA_FTYPE_MOSTLY_IQ4_NL,   " 4.50 bpw non-linear quantization", }, | ||||||
|     { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", }, |     { "IQ4_XS",   LLAMA_FTYPE_MOSTLY_IQ4_XS,   " 4.25 bpw non-linear quantization", }, | ||||||
|     { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M",                  }, |     { "Q4_K",     LLAMA_FTYPE_MOSTLY_Q4_K_M,   "alias for Q4_K_M",                  }, | ||||||
|     { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B",  }, |     { "Q4_K_S",   LLAMA_FTYPE_MOSTLY_Q4_K_S,   " 4.37G, +0.2689 ppl @ Llama-3-8B",  }, | ||||||
|     { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B",  }, |     { "Q4_K_M",   LLAMA_FTYPE_MOSTLY_Q4_K_M,   " 4.58G, +0.1754 ppl @ Llama-3-8B",  }, | ||||||
|     { "Q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M",                  }, |     { "Q5_K",     LLAMA_FTYPE_MOSTLY_Q5_K_M,   "alias for Q5_K_M",                  }, | ||||||
|     { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B",  }, |     { "Q5_K_S",   LLAMA_FTYPE_MOSTLY_Q5_K_S,   " 5.21G, +0.1049 ppl @ Llama-3-8B",  }, | ||||||
|     { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B",  }, |     { "Q5_K_M",   LLAMA_FTYPE_MOSTLY_Q5_K_M,   " 5.33G, +0.0569 ppl @ Llama-3-8B",  }, | ||||||
|     { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 6.14G, +0.0217 ppl @ Llama-3-8B",  }, |     { "Q6_K",     LLAMA_FTYPE_MOSTLY_Q6_K,     " 6.14G, +0.0217 ppl @ Llama-3-8B",  }, | ||||||
|     { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 7.96G, +0.0026 ppl @ Llama-3-8B",  }, |     { "Q8_0",     LLAMA_FTYPE_MOSTLY_Q8_0,     " 7.96G, +0.0026 ppl @ Llama-3-8B",  }, | ||||||
|     { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, |     { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B",  }, | ||||||
|     { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, |     { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B",  }, | ||||||
|     { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, |     { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B",  }, | ||||||
|     { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "14.00G, +0.0020 ppl @ Mistral-7B",  }, |     { "F16",      LLAMA_FTYPE_MOSTLY_F16,      "14.00G, +0.0020 ppl @ Mistral-7B",  }, | ||||||
|     { "BF16",   LLAMA_FTYPE_MOSTLY_BF16,   "14.00G, -0.0050 ppl @ Mistral-7B",  }, |     { "BF16",     LLAMA_FTYPE_MOSTLY_BF16,     "14.00G, -0.0050 ppl @ Mistral-7B",  }, | ||||||
|     { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B",          }, |     { "F32",      LLAMA_FTYPE_ALL_F32,         "26.00G              @ 7B",          }, | ||||||
|     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
 |     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
 | ||||||
|     { "COPY",   LLAMA_FTYPE_ALL_F32,       "only copy tensors, no quantizing",  }, |     { "COPY",     LLAMA_FTYPE_ALL_F32,         "only copy tensors, no quantizing",  }, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE       = "quantize.imatrix.file"; | static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE       = "quantize.imatrix.file"; | ||||||
|  |  | ||||||
|  | @ -133,7 +133,7 @@ extern "C" { | ||||||
|         LLAMA_FTYPE_MOSTLY_F16           = 1,  // except 1d tensors
 |         LLAMA_FTYPE_MOSTLY_F16           = 1,  // except 1d tensors
 | ||||||
|         LLAMA_FTYPE_MOSTLY_Q4_0          = 2,  // except 1d tensors
 |         LLAMA_FTYPE_MOSTLY_Q4_0          = 2,  // except 1d tensors
 | ||||||
|         LLAMA_FTYPE_MOSTLY_Q4_1          = 3,  // except 1d tensors
 |         LLAMA_FTYPE_MOSTLY_Q4_1          = 3,  // except 1d tensors
 | ||||||
|         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,  // tok_embeddings.weight and output.weight are F16
 |         // LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,  // tok_embeddings.weight and output.weight are F16
 | ||||||
|         // LLAMA_FTYPE_MOSTLY_Q4_2       = 5,  // support has been removed
 |         // LLAMA_FTYPE_MOSTLY_Q4_2       = 5,  // support has been removed
 | ||||||
|         // LLAMA_FTYPE_MOSTLY_Q4_3       = 6,  // support has been removed
 |         // LLAMA_FTYPE_MOSTLY_Q4_3       = 6,  // support has been removed
 | ||||||
|         LLAMA_FTYPE_MOSTLY_Q8_0          = 7,  // except 1d tensors
 |         LLAMA_FTYPE_MOSTLY_Q8_0          = 7,  // except 1d tensors
 | ||||||
|  |  | ||||||
|  | @ -4510,40 +4510,36 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     switch (ftype) { |     switch (ftype) { | ||||||
|         case LLAMA_FTYPE_ALL_F32:     return "all F32"; |         case LLAMA_FTYPE_ALL_F32:         return "all F32"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_F16:  return "F16"; |         case LLAMA_FTYPE_MOSTLY_F16:      return "F16"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; |         case LLAMA_FTYPE_MOSTLY_BF16:     return "BF16"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; |         case LLAMA_FTYPE_MOSTLY_Q4_0:     return "Q4_0"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; |         case LLAMA_FTYPE_MOSTLY_Q4_1:     return "Q4_1"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: |         case LLAMA_FTYPE_MOSTLY_Q5_0:     return "Q5_0"; | ||||||
|                                       return "Q4_1, some F16"; |         case LLAMA_FTYPE_MOSTLY_Q5_1:     return "Q5_1"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0"; |         case LLAMA_FTYPE_MOSTLY_Q8_0:     return "Q8_0"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1"; |         case LLAMA_FTYPE_MOSTLY_Q2_K:     return "Q2_K - Medium"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0"; |         case LLAMA_FTYPE_MOSTLY_Q2_K_S:   return "Q2_K - Small"; | ||||||
| 
 |         case LLAMA_FTYPE_MOSTLY_Q3_K_S:   return "Q3_K - Small"; | ||||||
|         // K-quants
 |         case LLAMA_FTYPE_MOSTLY_Q3_K_M:   return "Q3_K - Medium"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q2_K:   return "Q2_K - Medium"; |         case LLAMA_FTYPE_MOSTLY_Q3_K_L:   return "Q3_K - Large"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small"; |         case LLAMA_FTYPE_MOSTLY_Q4_K_S:   return "Q4_K - Small"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small"; |         case LLAMA_FTYPE_MOSTLY_Q4_K_M:   return "Q4_K - Medium"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium"; |         case LLAMA_FTYPE_MOSTLY_Q5_K_S:   return "Q5_K - Small"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large"; |         case LLAMA_FTYPE_MOSTLY_Q5_K_M:   return "Q5_K - Medium"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small"; |         case LLAMA_FTYPE_MOSTLY_Q6_K:     return "Q6_K"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium"; |         case LLAMA_FTYPE_MOSTLY_IQ2_XXS:  return "IQ2_XXS - 2.0625 bpw"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small"; |         case LLAMA_FTYPE_MOSTLY_IQ2_XS:   return "IQ2_XS - 2.3125 bpw"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; |         case LLAMA_FTYPE_MOSTLY_IQ2_S:    return "IQ2_S - 2.5 bpw"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q6_K:   return "Q6_K"; |         case LLAMA_FTYPE_MOSTLY_IQ2_M:    return "IQ2_M - 2.7 bpw"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw"; |         case LLAMA_FTYPE_MOSTLY_IQ3_XS:   return "IQ3_XS - 3.3 bpw"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; |         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:  return "IQ3_XXS - 3.0625 bpw"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_IQ2_S:  return "IQ2_S - 2.5 bpw"; |         case LLAMA_FTYPE_MOSTLY_IQ1_S:    return "IQ1_S - 1.5625 bpw"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_IQ2_M:  return "IQ2_M - 2.7 bpw"; |         case LLAMA_FTYPE_MOSTLY_IQ1_M:    return "IQ1_M - 1.75 bpw"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw"; |         case LLAMA_FTYPE_MOSTLY_IQ4_NL:   return "IQ4_NL - 4.5 bpw"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw"; |         case LLAMA_FTYPE_MOSTLY_IQ4_XS:   return "IQ4_XS - 4.25 bpw"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_IQ1_S  :return "IQ1_S - 1.5625 bpw"; |         case LLAMA_FTYPE_MOSTLY_IQ3_S:    return "IQ3_S - 3.4375 bpw"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_IQ1_M  :return "IQ1_M - 1.75 bpw"; |         case LLAMA_FTYPE_MOSTLY_IQ3_M:    return "IQ3_S mix - 3.66 bpw"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; |  | ||||||
|         case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; |  | ||||||
|         case LLAMA_FTYPE_MOSTLY_IQ3_S:  return "IQ3_S - 3.4375 bpw"; |  | ||||||
|         case LLAMA_FTYPE_MOSTLY_IQ3_M:  return "IQ3_S mix - 3.66 bpw"; |  | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; |         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8"; |         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8"; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8"; |         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8"; | ||||||
|  | @ -18069,10 +18065,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n | ||||||
|     //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
 |     //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
 | ||||||
|     //}
 |     //}
 | ||||||
|     bool convert_incompatible_tensor = false; |     bool convert_incompatible_tensor = false; | ||||||
|     if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || |     if (new_type == GGML_TYPE_Q2_K    || new_type == GGML_TYPE_Q3_K    || new_type == GGML_TYPE_Q4_K   || | ||||||
|         new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS || |         new_type == GGML_TYPE_Q5_K    || new_type == GGML_TYPE_Q6_K    || new_type == GGML_TYPE_IQ4_XS || | ||||||
|         new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S || |         new_type == GGML_TYPE_IQ2_XS  || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S  || | ||||||
|         new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S || |         new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S   || new_type == GGML_TYPE_IQ3_S  || | ||||||
|         new_type == GGML_TYPE_IQ1_M) { |         new_type == GGML_TYPE_IQ1_M) { | ||||||
|         int nx = tensor->ne[0]; |         int nx = tensor->ne[0]; | ||||||
|         int ny = tensor->ne[1]; |         int ny = tensor->ne[1]; | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue