CUDA: switch tile sizes based on binary version

2024-02-17 20:09:13 +01:00 · 2024-02-17 20:09:13 +01:00 · 44a80b4119
commit 44a80b4119
parent d250c9d61d
1 changed files with 13 additions and 3 deletions
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -6941,11 +6941,12 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

+    int mmq_x, mmq_y, nwarps;
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = g_device_caps[id].cc;

-    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
        mmq_x  =  MMQ_X_Q4_0_RDNA2;
        mmq_y  =  MMQ_Y_Q4_0_RDNA2;
@ -6954,17 +6955,26 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
        mmq_x  =  MMQ_X_Q4_0_RDNA1;
        mmq_y  =  MMQ_Y_Q4_0_RDNA1;
        nwarps = NWARPS_Q4_0_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
+    } else {
+        GGML_ASSERT(false);
+    }
+#else
+    cudaFuncAttributes attributes;
+    CUDA_CHECK(cudaFuncGetAttributes(&attributes, mul_mat_q4_0<false>));
+    const int cc_binary = 10*attributes.binaryVersion;
+
+    if (cc_binary >= CC_VOLTA) {
        mmq_x  =  MMQ_X_Q4_0_AMPERE;
        mmq_y  =  MMQ_Y_Q4_0_AMPERE;
        nwarps = NWARPS_Q4_0_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
+    } else if (cc_binary >= MIN_CC_DP4A) {
        mmq_x  =  MMQ_X_Q4_0_PASCAL;
        mmq_y  =  MMQ_Y_Q4_0_PASCAL;
        nwarps = NWARPS_Q4_0_PASCAL;
    } else {
        GGML_ASSERT(false);
    }
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;