metal: somewhat faster f16 x f32 matrix multiply kernel (#2951)

* Somewhat faster f16 x f32 matrix multiply kernel

* Better use 32 thread groups for f16 x f32

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow 2023-09-01 11:15:57 +03:00 committed by GitHub
parent bce1fef328
commit e8d9158925
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 29 additions and 11 deletions

View file

@ -840,7 +840,7 @@ void ggml_metal_graph_compute(
switch (src0t) {
case GGML_TYPE_F16:
{
nth0 = 64;
nth0 = 32;
nth1 = 1;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
} break;