metal: faster kernel_scale via float4

2023-09-07 16:16:28 +02:00 · 2023-09-07 16:16:28 +02:00 · 7c8c6ce085
commit 7c8c6ce085
parent 9a9010609b
2 changed files with 3 additions and 3 deletions
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -750,7 +750,7 @@ void ggml_metal_graph_compute(
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                            [encoder setBytes:&scale length:sizeof(scale) atIndex:2];

-                            const int64_t n = ggml_nelements(dst);
+                            const int64_t n = ggml_nelements(dst)/4;

                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -63,8 +63,8 @@ kernel void kernel_mul_row(
 }

 kernel void kernel_scale(
-        device const float * src0,
-        device       float * dst,
+        device const float4 * src0,
+        device       float4 * dst,
        constant     float & scale,
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = src0[tpig] * scale;