diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 39ef1e0ea..8aa70f143 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -9914,13 +9914,24 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 
     int64_t min_compute_capability = INT_MAX;
 
+    // devices with compute capability 6.1 (cc == 610) are treated as having slow fp16;
+    // without a split only the main device does any work, so it is checked right here:
+    bool any_pascal_with_slow_fp16 = !split && g_device_caps[g_main_device].cc == 610;
     if (split) {
         ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
         auto & tensor_split = buft_ctx->tensor_split;
         for (int id = 0; id < g_device_count; ++id) {
-            if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
+            // skip devices that are not going to do any work:
+            if (tensor_split[id] >= (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
+                continue;
+            }
+
+            if (min_compute_capability > g_device_caps[id].cc) {
                 min_compute_capability = g_device_caps[id].cc;
             }
+            if (g_device_caps[id].cc == 610) {
+                any_pascal_with_slow_fp16 = true;
+            }
         }
     } else {
         min_compute_capability = g_device_caps[g_main_device].cc;
@@ -9949,13 +9960,8 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 #else
 
     // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
-    bool fp16_performance_good = true;
-    for (int id = 0; id < g_device_count; ++id) {
-        if (g_device_caps[id].cc < CC_VOLTA && g_device_caps[id].cc != CC_PASCAL) {
-            fp16_performance_good = false;
-            break;
-        }
-    }
+    // both inputs only account for the devices that actually take part in this matmul
+    const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
 
     // mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
     use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;