From bd80601ea8eb4bc7d6135dfa4214d3c67e6e7f87 Mon Sep 17 00:00:00 2001
From: Kunnis <kunnis@gmail.com>
Date: Fri, 10 May 2024 17:30:37 -0500
Subject: [PATCH] Updating comments with what we've learned.

---
 ggml.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/ggml.c b/ggml.c
index 6182dbd2d..1a6ab13cd 100644
--- a/ggml.c
+++ b/ggml.c
@@ -12094,13 +12094,11 @@ UseGgmlGemm2:;
     int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
     int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
 
-    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
-
     //If the chunking is poor for the number of threads on this setup, scrap the whole plan.  Re-chunk it by thread.
+    //   Also, chunking by thread was measured to perform better on NUMA systems.  See https://github.com/ggerganov/llama.cpp/pull/6915
+    //   In theory, chunking should be just as useful on NUMA and non-NUMA systems, but testing disagreed with that.
     if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa())
     {
-        //if (ith == 0)
-        //    printf("rechunked");
         // distribute the thread work across the inner or outer loop based on which one is larger
         nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
         nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
@@ -12111,7 +12109,7 @@ UseGgmlGemm2:;
     const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
 
     //if (ith == 0)
-    //    printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d.  Fp/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1);
+    //    printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d.  Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1);
 
     //The first chunk comes from our thread_id, the rest will get auto-assigned.
     int current_chunk = ith;