metal : print more GPU info + disable mul_mm for MTLGPUFamily < Apple7

Georgi Gerganov 2023-10-08 09:53:38 +03:00
parent 545b03491c
commit 6b9554a740
2 changed files with 65 additions and 42 deletions

ggml-metal.m

@@ -274,16 +274,18 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
         GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
         GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
+        if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
+            GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
+        }
         GGML_METAL_ADD_KERNEL(rope_f32);
         GGML_METAL_ADD_KERNEL(rope_f16);
         GGML_METAL_ADD_KERNEL(alibi_f32);
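The new guard matters because the mul_mm kernels are built on SIMD-group matrix operations (simdgroup_float8x8 loads and multiply-accumulates), which Metal only guarantees from MTLGPUFamilyApple7 (A14/M1) onward; on older Apple GPUs and on AMD devices those pipelines are simply not created and the mul_mv kernels are used instead. A minimal sketch of the same capability test, with a hypothetical helper name:

    #import <Metal/Metal.h>

    // hypothetical helper, not part of the patch: the capability test the loader
    // now performs before creating the mul_mm pipelines
    static BOOL ggml_metal_supports_mm_kernels(id<MTLDevice> device) {
        // SIMD-group matrix operations are only guaranteed on MTLGPUFamilyApple7
        // and newer (A14/M1 and up); AMD GPUs and older A-chips return NO here
        return [device supportsFamily:MTLGPUFamilyApple7];
    }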
@@ -296,8 +298,22 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #undef GGML_METAL_ADD_KERNEL
     }
 
-    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
 #if TARGET_OS_OSX
+    // print MTL GPU family:
+    GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]);
+    GGML_METAL_LOG_INFO("%s: GPU arch: %s\n", __func__, [[ctx->device architecture].name UTF8String]);
+
+    // determine max supported GPU family
+    // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
+    // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
+    for (int i = MTLGPUFamilyApple9 + 10; i >= MTLGPUFamilyApple1; --i) {
+        if ([ctx->device supportsFamily:i]) {
+            GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i);
+            break;
+        }
+    }
+
+    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
     GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
     if (ctx->device.maxTransferRate != 0) {
         GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
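The countdown starts well above the newest enum value known at the time (MTLGPUFamilyApple9 + 10), presumably so that a future GPU reporting a higher family is still logged by an older binary instead of falling through silently. A self-contained probe in the same spirit, runnable outside ggml (illustrative; assumes an SDK that defines MTLGPUFamilyApple9):

    // build (typical invocation, assumption): clang probe.m -framework Metal -framework Foundation -o probe
    #import <Metal/Metal.h>
    #import <stdio.h>

    int main(void) {
        @autoreleasepool {
            id<MTLDevice> device = MTLCreateSystemDefaultDevice();
            if (device == nil) {
                fprintf(stderr, "no Metal device found\n");
                return 1;
            }
            printf("GPU name: %s\n", [[device name] UTF8String]);

            // walk down from above the newest known family and report the first match
            for (int i = MTLGPUFamilyApple9 + 10; i >= MTLGPUFamilyApple1; --i) {
                if ([device supportsFamily:(MTLGPUFamily) i]) {
                    printf("GPU family: MTLGPUFamilyApple%d (%d)\n", (int) (i - MTLGPUFamilyApple1 + 1), i);
                    break;
                }
            }
        }
        return 0;
    }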
@@ -351,16 +367,18 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
         GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
         GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
         GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
-        GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
+        if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
+            GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
+            GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
+            GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
+            GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
+            GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
+            GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
+            GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
+            GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
+            GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
+            GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
+        }
         GGML_METAL_DEL_KERNEL(rope_f32);
         GGML_METAL_DEL_KERNEL(rope_f16);
         GGML_METAL_DEL_KERNEL(alibi_f32);
@@ -986,32 +1004,36 @@ void ggml_metal_graph_compute(
                 } break;
             case GGML_OP_MUL_MAT:
                 {
-                    // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
-
                     GGML_ASSERT(ne00 == ne10);
-                    // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
-                    uint gqa = ne12/ne02;
                     GGML_ASSERT(ne03 == ne13);
 
+                    const uint gqa = ne12/ne02;
+
                     // find the break-even point where the matrix-matrix kernel becomes more efficient compared
-                    // to the matrix-vector kernel. the numbers below are measured on M2 Ultra
-                    // not sure if this translates across all chips
+                    // to the matrix-vector kernel
                     int ne11_mm_min = 1;
 
-                    switch (src0t) {
-                        case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
-                        case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
-                        case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
-                        case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
-                        case GGML_TYPE_Q4_0:
-                        case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
-                        case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
-                        case GGML_TYPE_Q5_0:                   // not tested yet
-                        case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
-                        case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
-                        case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
-                        default:             ne11_mm_min = 1;  break;
+#if 0
+                    // the numbers below are measured on M2 Ultra for 7B and 13B models
+                    // these numbers do not translate to other devices or model sizes
+                    // TODO: need to find a better approach
+                    if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
+                        switch (src0t) {
+                            case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
+                            case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
+                            case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
+                            case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
+                            case GGML_TYPE_Q4_0:
+                            case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
+                            case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
+                            case GGML_TYPE_Q5_0:                   // not tested yet
+                            case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
+                            case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
+                            case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
+                            default:             ne11_mm_min = 1;  break;
+                        }
                     }
+#endif
 
                     // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                     // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
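Further down in ggml_metal_graph_compute (outside this hunk), ne11_mm_min feeds the choice between the two kernel families: the tiled matrix-matrix kernel is only dispatched when the MTLGPUFamilyApple7 check passed at init and src1 carries enough rows to amortize the tile setup; otherwise the matrix-vector kernel handles the operation. A reduced sketch of that decision with a hypothetical helper name (the full condition also looks at operand layout and data types):

    #import <Metal/Metal.h>
    #import <stdint.h>

    // hypothetical helper summarizing the dispatch decision; illustrative only
    static BOOL ggml_metal_prefer_mul_mm(id<MTLDevice> device, int64_t ne11, int ne11_mm_min) {
        return [device supportsFamily:MTLGPUFamilyApple7] && // mul_mm pipelines exist on this device
               ne11 > ne11_mm_min;                           // enough src1 rows to beat the mul_mv kernels
    }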

ggml-metal.metal

@@ -2332,7 +2332,7 @@ kernel void kernel_get_rows(
 }
 
 #define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
-#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A
+#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
 #define BLOCK_SIZE_K 32
 #define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
 #define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
@@ -2459,7 +2459,8 @@ kernel void kernel_mul_mm(device const uchar * src0,
         }
 
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        device float * C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
+
+        device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
         if (sgitg == 0) {
             for (int i = 0; i < n_rows; i++) {
                 for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
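For context, the pointer parenthesized above is the base of the output tile one threadgroup writes: ne0 is the contiguous dimension of dst, so tile (r0, r1) of batch im begins BLOCK_SIZE_M*r0 elements into the row block that starts (BLOCK_SIZE_N*r1)*ne0 + im*ne1*ne0 floats into dst. A small worked example of that element-offset arithmetic (plain C, illustrative sizes):

    #include <stdint.h>
    #include <stdio.h>

    #define BLOCK_SIZE_M 64 // tile extent along ne0 (from matrix A)
    #define BLOCK_SIZE_N 32 // tile extent along ne1 (from matrix B)

    // element offset of the (r0, r1) output tile for batch index im, mirroring
    // dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0
    static int64_t tile_base(int64_t r0, int64_t r1, int64_t im, int64_t ne0, int64_t ne1) {
        return (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im * ne1 * ne0;
    }

    int main(void) {
        // e.g. a 4096 x 512 result, batch 0, third tile along each axis:
        // 64*2 + 32*2*4096 + 0 = 262272
        printf("%lld\n", (long long) tile_base(2, 2, 0, /*ne0=*/4096, /*ne1=*/512));
        return 0;
    }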