CUDA: optimize MMQ int8 tensor core performance (#8062)

* CUDA: optimize MMQ int8 tensor core performance

* only a single get_mma_tile_x_k function

* simplify code, make functions constexpr
This commit is contained in:
Johannes Gäßler 2024-06-24 12:41:23 +02:00 committed by GitHub
parent 52fc8705a0
commit 9a590c8226
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 902 additions and 570 deletions

View file

@@ -643,7 +643,7 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
static constexpr int qi = QI3_S;
};
static int get_mmq_x_max_host(const int cc) {
static constexpr int get_mmq_x_max_host(int cc) {
#ifdef CUDA_USE_TENSOR_CORES
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
#else
@@ -652,7 +652,7 @@ static int get_mmq_x_max_host(const int cc) {
}
// Round rows to this value for --split-mode row:
static int get_mmq_y_host(const int cc) {
static constexpr int get_mmq_y_host(int cc) {
    // Tile height in rows for the MMQ kernel, chosen from the compute
    // capability: devices at or above CC_VOLTA get 128-row tiles, older
    // devices get 64. NOTE(review): the exact value of CC_VOLTA is defined
    // elsewhere in the file — presumably 700 (compute capability 7.0); confirm.
    return cc >= CC_VOLTA ? 128 : 64;
}