metal : faster q4_0 (#1775)

* metal : 8% faster q4_0

Avoid copying into local uchar4 anf float4.

* metal : 17% faster Q4_0

Use 64 threads in a thread group.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow 2023-06-09 10:39:59 +03:00 committed by GitHub
parent 72ff5282bf
commit 245fc3c37d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 20 additions and 16 deletions

View file

@ -526,7 +526,7 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne12 == 1);
nth0 = 8;
nth1 = 4;
nth1 = 8;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
} break;
case GGML_TYPE_Q2_K: