diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index bc122c9b8..79e2d313a 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -6059,8 +6059,9 @@ inline void ggml_cuda_op_mul_mat_cublas(
     // ldc == nrows of the matrix that cuBLAS writes into
     int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
 
-    // TODO: this may be slower in older architectures with poor fp16 support
-    if (src0->type == GGML_TYPE_F16 && ggml_is_contiguous(src0) && ldc == row_diff) {
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING && src0->type == GGML_TYPE_F16 && ggml_is_contiguous(src0) && ldc == row_diff) {
         // convert src1 to fp16, multiply as fp16, convert dst to fp32
         half * src1_as_f16 = nullptr;
         size_t src1_as = 0;