CUDA: stream-k decomposition for MMQ (#8018)

* CUDA: stream-k decomposition for MMQ

* fix undefined memory reads for small matrices
This commit is contained in:
Johannes Gäßler 2024-06-20 14:39:21 +02:00 committed by GitHub
parent 2075a66a96
commit d50f8897a7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 292 additions and 113 deletions

View file

@@ -635,7 +635,7 @@ static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> &
}
const int cc = ggml_cuda_info().devices[id].cc;
-row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
+row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc));
}
return row_rounding;
}