wip : template for rows per warp

This commit is contained in:
Georgi Gerganov 2024-01-21 18:24:13 +02:00
parent f31955f5d1
commit a4b6341c7b
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
2 changed files with 32 additions and 29 deletions

View file

@@ -2252,14 +2252,15 @@ static bool ggml_metal_graph_compute(
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:26];
[encoder setBytes:&scale length:sizeof( float) atIndex:27];
const int64_t nwarps = 4;
const int64_t nwarps = 8;
const int64_t nhpw = 4; // heads per warp
const size_t smem = nwarps*(2*4*ne00 + 128)*(sizeof(float)/2);
const size_t smem = nwarps*(2*nhpw*ne00 + 128)*(sizeof(float)/2);
GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength);
[encoder setThreadgroupMemoryLength:smem atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, (ne02 + 4*nwarps - 1)/(4*nwarps), ne03) threadsPerThreadgroup:MTLSizeMake(32*nwarps, 1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, (ne02 + nhpw*nwarps - 1)/(nhpw*nwarps), ne03) threadsPerThreadgroup:MTLSizeMake(32*nwarps, 1, 1)];
} break;
case GGML_OP_DUP:
case GGML_OP_CPY: