CUDA: faster q2_K, q3_K MMQ + int8 tensor cores (#7921)
* CUDA: faster q2_K, q3_K MMQ + int8 tensor cores * try CI fix * try CI fix * try CI fix * fix data race * revert q2_K precision related changes
This commit is contained in:
		
							parent
							
								
									66ef1ceedf
								
							
						
					
					
						commit
						76d66ee0be
					
				
					 6 changed files with 468 additions and 330 deletions
				
			
		|  | @ -130,6 +130,7 @@ static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, cons | |||
|     const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2); | ||||
|     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); | ||||
| 
 | ||||
|     // FIXME: this limit could be raised by ~2-4x on Ampere or newer | ||||
|     if (shmem < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) { | ||||
|         switch (ncols_x) { | ||||
|             case 32: | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue