ggml : refactor online repacking (#10446)
* rename ggml-cpu-aarch64.c to .cpp
* reformat the extra CPU backend.
- clean up Q4_0_N_M and IQ4_0_N_M
  - remove from "file" tensor type
  - allow only with dynamic repack
- extract cpu extra bufts and convert to C++
  - hbm
  - "aarch64"
- more generic use of extra buffer
  - generalise extra_supports_op
  - new API for "cpu-accel":
     - amx
     - aarch64
* clang-format
* Clean Q4_0_N_M ref
Enable restrict on C++
* add op GGML_OP_MUL_MAT_ID for Q4_0_N_M with runtime repack
* add/correct the tensor-size checks for Q4 repacking.
* Update ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* add debug logs for repacks.
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
			
			
This commit is contained in:
		
							parent
							
								
									c2a16c0bdb
								
							
						
					
					
						commit
						19d8762ab6
					
				
					 33 changed files with 1136 additions and 1049 deletions
				
			
		|  | @ -4578,9 +4578,6 @@ struct llama_model_loader { | |||
|                 case GGML_TYPE_IQ4_NL:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;  break; | ||||
|                 case GGML_TYPE_IQ4_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;  break; | ||||
|                 case GGML_TYPE_IQ3_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ3_S;   break; | ||||
|                 case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break; | ||||
|                 case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break; | ||||
|                 case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break; | ||||
|                 default: | ||||
|                     { | ||||
|                         LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); | ||||
|  | @ -5344,9 +5341,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { | |||
|         case LLAMA_FTYPE_MOSTLY_IQ4_XS:   return "IQ4_XS - 4.25 bpw"; | ||||
|         case LLAMA_FTYPE_MOSTLY_IQ3_S:    return "IQ3_S - 3.4375 bpw"; | ||||
|         case LLAMA_FTYPE_MOSTLY_IQ3_M:    return "IQ3_S mix - 3.66 bpw"; | ||||
|         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; | ||||
|         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8"; | ||||
|         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8"; | ||||
| 
 | ||||
|         default: return "unknown, may not work"; | ||||
|     } | ||||
|  | @ -18367,10 +18361,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n | |||
|             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { | ||||
|                 new_type = GGML_TYPE_IQ3_S; | ||||
|             } | ||||
|             else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || | ||||
|                      new_type == GGML_TYPE_Q4_0_8_8) { | ||||
|                 new_type = GGML_TYPE_Q4_0; | ||||
|             } | ||||
|             else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { | ||||
|                 new_type = GGML_TYPE_Q4_K; | ||||
|             } | ||||
|  | @ -18693,9 +18683,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s | |||
|         case LLAMA_FTYPE_MOSTLY_IQ4_XS:  default_type = GGML_TYPE_IQ4_XS;  break; | ||||
|         case LLAMA_FTYPE_MOSTLY_IQ3_S:   default_type = GGML_TYPE_IQ3_S;   break; | ||||
|         case LLAMA_FTYPE_MOSTLY_IQ3_M:   default_type = GGML_TYPE_IQ3_S;   break; | ||||
|         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break; | ||||
|         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break; | ||||
|         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break; | ||||
| 
 | ||||
|         default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); | ||||
|     } | ||||
|  | @ -19034,14 +19021,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s | |||
|                 f32_data = (float *) f32_conv_buf.data(); | ||||
|             } | ||||
| 
 | ||||
|             int chunk_size_multiplier = 1; | ||||
|             if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) { | ||||
|                 if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0; | ||||
|                 else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0; | ||||
|                 if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8; | ||||
|                 else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4; | ||||
|             } | ||||
| 
 | ||||
|             LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); | ||||
|             fflush(stdout); | ||||
| 
 | ||||
|  | @ -19054,8 +19033,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s | |||
|             const int64_t nrows = tensor->ne[1]; | ||||
| 
 | ||||
|             static const int64_t min_chunk_size = 32 * 512; | ||||
|             const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * | ||||
|                                        chunk_size_multiplier; | ||||
|             const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)); | ||||
| 
 | ||||
|             const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; | ||||
|             const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue