diff --git a/ggml-blas.cpp b/ggml-blas.cpp
index 92d0e6637..ade10b9ac 100644
--- a/ggml-blas.cpp
+++ b/ggml-blas.cpp
@@ -98,15 +98,23 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
                     to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
                 }
 #else
-                for (int i = 0; i < ctx->n_threads; i++) {
+                for (int i = 0; i < ctx->n_threads - 1; i++) {
                     ctx->tasks.push_back(std::async(std::launch::async, [=]() {
-                        const int64_t start = i*ne01/ctx->n_threads;
+                        const int64_t start =       i*ne01/ctx->n_threads;
                         const int64_t end   = (i + 1)*ne01/ctx->n_threads;
                         for (int64_t i01 = start; i01 < end; i01++) {
                             to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
                         }
                     }));
                 }
+                {
+                    // reuse the current thread for the last task
+                    const int64_t start = (ctx->n_threads - 1)*ne01/ctx->n_threads;
+                    const int64_t end   = ne01;
+                    for (int64_t i01 = start; i01 < end; i01++) {
+                        to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+                    }
+                }
 #endif
             }
         }