metal : print more GPU info + disable mul_mm for MTLGPUFamiliy < Apple7
This commit is contained in:
parent
545b03491c
commit
6b9554a740
2 changed files with 65 additions and 42 deletions
36
ggml-metal.m
36
ggml-metal.m
|
@ -274,6 +274,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
|
GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
|
||||||
GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
|
GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
|
||||||
GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
|
GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
|
||||||
|
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
|
||||||
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
|
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
|
||||||
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
||||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
|
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
|
||||||
|
@ -284,6 +285,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
|
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
|
||||||
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
|
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
|
||||||
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
|
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
|
||||||
|
}
|
||||||
GGML_METAL_ADD_KERNEL(rope_f32);
|
GGML_METAL_ADD_KERNEL(rope_f32);
|
||||||
GGML_METAL_ADD_KERNEL(rope_f16);
|
GGML_METAL_ADD_KERNEL(rope_f16);
|
||||||
GGML_METAL_ADD_KERNEL(alibi_f32);
|
GGML_METAL_ADD_KERNEL(alibi_f32);
|
||||||
|
@ -296,8 +298,22 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
#undef GGML_METAL_ADD_KERNEL
|
#undef GGML_METAL_ADD_KERNEL
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
|
||||||
#if TARGET_OS_OSX
|
#if TARGET_OS_OSX
|
||||||
|
// print MTL GPU family:
|
||||||
|
GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]);
|
||||||
|
GGML_METAL_LOG_INFO("%s: GPU arch: %s\n", __func__, [[ctx->device architecture].name UTF8String]);
|
||||||
|
|
||||||
|
// determine max supported GPU family
|
||||||
|
// https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
|
||||||
|
// https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
|
||||||
|
for (int i = MTLGPUFamilyApple9 + 10; i >= MTLGPUFamilyApple1; --i) {
|
||||||
|
if ([ctx->device supportsFamily:i]) {
|
||||||
|
GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||||
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||||
if (ctx->device.maxTransferRate != 0) {
|
if (ctx->device.maxTransferRate != 0) {
|
||||||
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
||||||
|
@ -351,6 +367,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||||
GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
|
GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
|
||||||
GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
|
GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
|
||||||
GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
|
GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
|
||||||
|
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
|
||||||
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
|
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
|
||||||
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
||||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
|
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
|
||||||
|
@ -361,6 +378,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
|
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
|
||||||
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
|
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
|
||||||
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
|
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
|
||||||
|
}
|
||||||
GGML_METAL_DEL_KERNEL(rope_f32);
|
GGML_METAL_DEL_KERNEL(rope_f32);
|
||||||
GGML_METAL_DEL_KERNEL(rope_f16);
|
GGML_METAL_DEL_KERNEL(rope_f16);
|
||||||
GGML_METAL_DEL_KERNEL(alibi_f32);
|
GGML_METAL_DEL_KERNEL(alibi_f32);
|
||||||
|
@ -986,18 +1004,20 @@ void ggml_metal_graph_compute(
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_MUL_MAT:
|
case GGML_OP_MUL_MAT:
|
||||||
{
|
{
|
||||||
// TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
|
|
||||||
|
|
||||||
GGML_ASSERT(ne00 == ne10);
|
GGML_ASSERT(ne00 == ne10);
|
||||||
// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
|
|
||||||
uint gqa = ne12/ne02;
|
|
||||||
GGML_ASSERT(ne03 == ne13);
|
GGML_ASSERT(ne03 == ne13);
|
||||||
|
|
||||||
|
const uint gqa = ne12/ne02;
|
||||||
|
|
||||||
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
|
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
|
||||||
// to the matrix-vector kernel. the numbers below are measured on M2 Ultra
|
// to the matrix-vector kernel
|
||||||
// not sure if this translates across all chips
|
|
||||||
int ne11_mm_min = 1;
|
int ne11_mm_min = 1;
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
// the numbers below are measured on M2 Ultra for 7B and 13B models
|
||||||
|
// these numbers do not translate to other devices or model sizes
|
||||||
|
// TODO: need to find a better approach
|
||||||
|
if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
|
||||||
switch (src0t) {
|
switch (src0t) {
|
||||||
case GGML_TYPE_F16: ne11_mm_min = 2; break;
|
case GGML_TYPE_F16: ne11_mm_min = 2; break;
|
||||||
case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
|
case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
|
||||||
|
@ -1012,6 +1032,8 @@ void ggml_metal_graph_compute(
|
||||||
case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
|
case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
|
||||||
default: ne11_mm_min = 1; break;
|
default: ne11_mm_min = 1; break;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
||||||
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
||||||
|
|
|
@ -2332,7 +2332,7 @@ kernel void kernel_get_rows(
|
||||||
}
|
}
|
||||||
|
|
||||||
#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
|
#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
|
||||||
#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A
|
#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
|
||||||
#define BLOCK_SIZE_K 32
|
#define BLOCK_SIZE_K 32
|
||||||
#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
|
#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
|
||||||
#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
|
#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
|
||||||
|
@ -2459,7 +2459,8 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||||
}
|
}
|
||||||
|
|
||||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
device float * C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
|
|
||||||
|
device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
|
||||||
if (sgitg == 0) {
|
if (sgitg == 0) {
|
||||||
for (int i = 0; i < n_rows; i++) {
|
for (int i = 0; i < n_rows; i++) {
|
||||||
for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
|
for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue