From fc7dc515f187e96ecd0c6e3ac906c6c5faf357c2 Mon Sep 17 00:00:00 2001
From: Kunnis <kunnis@gmail.com>
Date: Thu, 9 May 2024 23:29:49 -0500
Subject: [PATCH] adding the looping structure based on the chunk
 configuration.

---
 ggml.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ggml.c b/ggml.c
index 896f540ea..f04a11b69 100644
--- a/ggml.c
+++ b/ggml.c
@@ -12091,8 +12091,10 @@ UseGgmlGemm2:;
     const int64_t nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
     const int64_t nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
 
+    // The first chunk index is this thread's id (ith); later chunks are claimed from the shared atomic counter.
     int current_chunk = ith;
 
+    while (current_chunk < nchunk0 * nchunk1)
     {
         const int64_t ith0 = current_chunk % nchunk0;
         const int64_t ith1 = current_chunk / nchunk0;
@@ -12112,6 +12114,10 @@ UseGgmlGemm2:;
         chunks_executed++;
 #endif
 
+        if (nth >= nchunk0 * nchunk1)
+            break;
+
+        current_chunk = atomic_fetch_add(&state->shared->current_chunk, 1);
     }
 
 #ifdef GGML_PERF