diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index b58af8040..35aaa3e44 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -5248,13 +5248,15 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
     half  * buf_iw = data_soft_max_f16 + 0; // shared memory buffer for inter-warp communication
     // (shared memory) buffer to cache values between iterations:
     half2 * vals = vals_smem ? (half2 *) (buf_iw + WARP_SIZE) : (half2 *) (dst + rowx*ncols_data);
+    // if the buffer is larger than max. shared memory per block, use dst as temp. buffer instead
+    // in that case col_smem == col_data must be enforced to avoid race conditions
 
     half2 max_val = make_half2(-INFINITY, -INFINITY);
 
 #pragma unroll
     for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
-        const int col_smem = col0 + tid;
         const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
+        const int col_smem = vals_smem ? col0 + tid : col_data;
 
         const int ix = rowx*ncols_data + col_data;
         const int iy = rowy*ncols_data + col_data;
@@ -5270,7 +5272,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
         } else {
             val.y = x[ix + WARP_SIZE]*scale + (y ? y[iy + WARP_SIZE] : 0.0f);
         }
-        if (!need_check || col_smem < ncols_smem) {
+        if (!need_check || col_smem < (vals_smem ? ncols_smem : ncols_data)) {
             vals[col_smem] = val;
         }
         max_val = __hmax2(max_val, val);
@@ -5299,9 +5301,9 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
 
 #pragma unroll
     for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
-        const int col_smem = col0 + tid;
+        const int col_smem = vals_smem ? col0 + tid : 2*col0 + 2*warp_id*WARP_SIZE + lane_id;
 
-        if (ncols_template == 0 && col_smem >= ncols_smem) {
+        if (ncols_template == 0 && col_smem >= (vals_smem ? ncols_smem : ncols_data)) {
             break;
         }
 
@@ -5334,8 +5336,8 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
 
 #pragma unroll
     for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
-        const int col_smem = col0 + tid;
         const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
+        const int col_smem = vals_smem ? col0 + tid : col_data;
 
         const int idst = rowx*ncols_data + col_data;
         const half2 result = vals[col_smem] * inv_sum;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 4ade8b1d8..9c2baded5 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -436,7 +436,7 @@ struct test_case {
 
             double err = nmse(f1.data(), f2.data(), f1.size());
             if (err > ud->max_err) {
-                printf("[%s] NMSE = %.3E > %.3E ", ggml_op_desc(t1), err, ud->max_err);
+                printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
                 //for (int i = 0; i < (int) f1.size(); i++) {
                 //    printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
                 //}
@@ -1426,6 +1426,7 @@ struct test_moe : public test_case {
 
 static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
     std::vector<std::unique_ptr<test_case>> test_cases;
+    std::default_random_engine rng(0);
 
     const ggml_type all_types[] = {
         GGML_TYPE_F32, GGML_TYPE_F16,
@@ -1548,7 +1549,19 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10,  1}, 5));
     test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 10}, 5));
 
-    test_cases.emplace_back(new test_soft_max());
+    std::uniform_int_distribution<> dist_ne1(1, 50);
+    int exponent = 1;
+    while (exponent < (1 << 17)) {
+        std::uniform_int_distribution<> dist_ne0(exponent, 2*exponent);
+
+        for (int n = 0; n < 10; ++n) {
+            int64_t ne0 = dist_ne0(rng);
+            int64_t ne1 = dist_ne1(rng);
+            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}));
+        }
+
+        exponent <<= 1;
+    }
 
     for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
         test_cases.emplace_back(new test_rope(type, {128,  32, 10, 1}, 128, 0, 512)); // llama 7B