refactor fp16 logic, only consider used devices

2024-02-11 18:52:49 +01:00 · 2024-02-11 18:52:49 +01:00 · 005de593ad
commit 005de593ad
parent 76a0128bec
1 changed files with 11 additions and 8 deletions
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -9914,13 +9914,22 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1

    int64_t min_compute_capability = INT_MAX;

+    bool any_pascal_with_slow_fp16 = false;
    if (split) {
        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
        auto & tensor_split = buft_ctx->tensor_split;
        for (int id = 0; id < g_device_count; ++id) {
-            if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
+            // skip devices that are not going to do any work:
+            if (tensor_split[id] >= (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
+                continue;
+            }
+
+            if (min_compute_capability > g_device_caps[id].cc) {
                min_compute_capability = g_device_caps[id].cc;
            }
+            if (g_device_caps[id].cc == 610) {
+                any_pascal_with_slow_fp16 = true;
+            }
        }
    } else {
        min_compute_capability = g_device_caps[g_main_device].cc;
@@ -9949,13 +9958,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 #else

    // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
-    bool fp16_performance_good = true;
-    for (int id = 0; id < g_device_count; ++id) {
-        if (g_device_caps[id].cc < CC_VOLTA && g_device_caps[id].cc != CC_PASCAL) {
-            fp16_performance_good = false;
-            break;
-        }
-    }
+    const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;

    // mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
    use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;