metal : print more GPU info + disable mul_mm for MTLGPUFamiliy < Apple7

2023-10-08 09:53:38 +03:00 · 2023-10-08 09:53:38 +03:00 · 6b9554a740
commit 6b9554a740
parent 545b03491c
2 changed files with 65 additions and 42 deletions
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -274,6 +274,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
+        if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
            GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
@ -284,6 +285,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
            GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
+        }
        GGML_METAL_ADD_KERNEL(rope_f32);
        GGML_METAL_ADD_KERNEL(rope_f16);
        GGML_METAL_ADD_KERNEL(alibi_f32);
@ -296,8 +298,22 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #undef GGML_METAL_ADD_KERNEL
    }

-    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
 #if TARGET_OS_OSX
+    // print MTL GPU family:
+    GGML_METAL_LOG_INFO("%s: GPU name:   %s\n", __func__, [[ctx->device name] UTF8String]);
+    GGML_METAL_LOG_INFO("%s: GPU arch:   %s\n", __func__, [[ctx->device architecture].name UTF8String]);
+
+    // determine max supported GPU family
+    // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
+    // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
+    for (int i = MTLGPUFamilyApple9 + 10; i >= MTLGPUFamilyApple1; --i) {
+        if ([ctx->device supportsFamily:i]) {
+            GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i);
+            break;
+        }
+    }
+
+    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
    if (ctx->device.maxTransferRate != 0) {
        GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
@ -351,6 +367,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
+    if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
        GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
@ -361,6 +378,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
        GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
+    }
    GGML_METAL_DEL_KERNEL(rope_f32);
    GGML_METAL_DEL_KERNEL(rope_f16);
    GGML_METAL_DEL_KERNEL(alibi_f32);
@ -986,18 +1004,20 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_MUL_MAT:
                        {
-                            // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
-
                            GGML_ASSERT(ne00 == ne10);
-                            // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
-                            uint gqa = ne12/ne02;
                            GGML_ASSERT(ne03 == ne13);

+                            const uint gqa = ne12/ne02;
+
                            // find the break-even point where the matrix-matrix kernel becomes more efficient compared
-                            // to the matrix-vector kernel. the numbers below are measured on M2 Ultra
-                            // not sure if this translates across all chips
+                            // to the matrix-vector kernel
                            int ne11_mm_min = 1;

+#if 0
+                            // the numbers below are measured on M2 Ultra for 7B and 13B models
+                            // these numbers do not translate to other devices or model sizes
+                            // TODO: need to find a better approach
+                            if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
                                switch (src0t) {
                                    case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
                                    case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
@ -1012,6 +1032,8 @@ void ggml_metal_graph_compute(
                                    case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
                                    default:             ne11_mm_min = 1;  break;
                                }
+                            }
+#endif

                            // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                            // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@ -2332,7 +2332,7 @@ kernel void kernel_get_rows(
 }

 #define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
-#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A
+#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
 #define BLOCK_SIZE_K 32
 #define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
 #define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
@ -2459,7 +2459,8 @@ kernel void kernel_mul_mm(device const  uchar * src0,
        }

        threadgroup_barrier(mem_flags::mem_threadgroup);
-        device float * C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
+
+        device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
        if (sgitg == 0) {
            for (int i = 0; i < n_rows; i++) {
                for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {