CUDA: stream-k decomposition for MMQ (#8018)
* CUDA: stream-k decomposition for MMQ
* fix undefined memory reads for small matrices
This commit is contained in:
parent
2075a66a96
commit
d50f8897a7
4 changed files with 292 additions and 113 deletions
|
@@ -652,8 +652,8 @@ static int get_mmq_x_max_host(const int cc) {
|
|||
}
|
||||
|
||||
// Round rows to this value for --split-mode row:
|
||||
static int get_mmq_y_host(const int cc, const int mmq_x) {
|
||||
return cc >= CC_VOLTA && mmq_x >= 32 ? 128 : 64;
|
||||
static int get_mmq_y_host(const int cc) {
|
||||
return cc >= CC_VOLTA ? 128 : 64;
|
||||
}
|
||||
|
||||
//////////////////////
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue