Metal: faster Q4_0 and Q4_1 matrix x vector kernels (#2212)
* 3-5% faster Q4_0 on Metal
* 7-25% faster Q4_1 on Metal
* Oops, forgot to delete the original Q4_1 kernel

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
		
							parent
							
								
									32c5411631
								
							
						
					
					
						commit
						27ad57a69b
					
				
					 2 changed files with 109 additions and 81 deletions
				
			
		|  | @ -739,12 +739,8 @@ void ggml_metal_graph_compute( | |||
|                                 [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:13]; | ||||
|                                 [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:14]; | ||||
| 
 | ||||
|                                 if (src0t == GGML_TYPE_Q4_0) { | ||||
|                                     [encoder dispatchThreadgroups:MTLSizeMake(ne01 / 8+((ne01 % 8) & 0x01), ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; | ||||
|                                 } | ||||
|                                 else if (src0t == GGML_TYPE_Q4_1) { | ||||
|                                     [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; | ||||
|                                     [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; | ||||
|                                 if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) { | ||||
|                                     [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; | ||||
|                                 } | ||||
|                                 else if (src0t == GGML_TYPE_Q2_K || | ||||
|                                          src0t == GGML_TYPE_Q3_K || | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue