wip
This commit is contained in:
parent
17720fad66
commit
06c2d0d117
3 changed files with 136 additions and 73 deletions
|
@ -2252,15 +2252,16 @@ static bool ggml_metal_graph_compute(
|
|||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:26];
|
||||
[encoder setBytes:&scale length:sizeof( float) atIndex:27];
|
||||
|
||||
const int64_t nwarps = 32;
|
||||
const int64_t nhptg = 2; // heads per threadgroup
|
||||
const int64_t nwarps = 8;
|
||||
const int64_t nhptg = 2; // heads per threadgroup !! sync with kernel template arguments !!
|
||||
const int64_t nqptg = 4; // queries per threadgroup !! sync with kernel template arguments !!
|
||||
|
||||
const size_t smem = (nhptg*ne00 + nwarps*(nhptg*ne00 + 32))*(sizeof(float)/2);
|
||||
const size_t smem = nqptg*(nhptg*ne00 + nwarps*(nhptg*ne00 + 32))*(sizeof(float)/2);
|
||||
|
||||
GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength);
|
||||
[encoder setThreadgroupMemoryLength:smem atIndex:0];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, (ne02 + nhptg - 1)/(nhptg), ne03) threadsPerThreadgroup:MTLSizeMake(32, nwarps, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, (ne02 + nhptg - 1)/(nhptg), ne03) threadsPerThreadgroup:MTLSizeMake(32, nwarps, 1)];
|
||||
} break;
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_CPY:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue