CUDA: optimize MMQ int8 tensor core performance (#8062)

* CUDA: optimize MMQ int8 tensor core performance

* only a single get_mma_tile_x_k function

* simplify code, make functions constexpr
This commit is contained in:
Johannes Gäßler 2024-06-24 12:41:23 +02:00 committed by GitHub
parent 52fc8705a0
commit 9a590c8226
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 902 additions and 570 deletions

View file

@@ -643,7 +643,7 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
static constexpr int qi = QI3_S;
};
static int get_mmq_x_max_host(const int cc) {
static constexpr int get_mmq_x_max_host(int cc) {
#ifdef CUDA_USE_TENSOR_CORES
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
#else
@@ -652,7 +652,7 @@ static int get_mmq_x_max_host(const int cc) {
}
// Round rows to this value for --split-mode row:
static int get_mmq_y_host(const int cc) {
static constexpr int get_mmq_y_host(int cc) {
    // Tile height in rows for the MMQ kernel, chosen from the compute
    // capability: devices at or above CC_VOLTA get 128-row tiles, older
    // devices get 64. NOTE(review): the exact value of CC_VOLTA is defined
    // elsewhere in the file — presumably 700 (compute capability 7.0); confirm.
    return cc >= CC_VOLTA ? 128 : 64;
}