From d9ba30a2046615e04dc9286e31d54652c6cd6ad7 Mon Sep 17 00:00:00 2001
From: Kunnis
Date: Tue, 14 May 2024 17:15:47 -0500
Subject: [PATCH] Fix formatting

---
 ggml.c | 55 +++++++++++++++++++++++++++----------------------------
 1 file changed, 27 insertions(+), 28 deletions(-)

diff --git a/ggml.c b/ggml.c
index 1a6ab13cd..c0d498014 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2435,7 +2435,6 @@ static void ggml_setup_op_has_task_pass(void) {
     }
 }
 
-
 //
 // NUMA support
 //
@@ -11773,16 +11772,16 @@ static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
 #endif
 
 static void ggml_compute_forward_mul_mat_one_chunk(
-    const struct ggml_compute_params* params,
-    struct ggml_tensor* dst,
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst,
     const int64_t num_rows_per_vec_dot,
     const int64_t ir0_start,
     const int64_t ir0_end,
     const int64_t ir1_start,
     const int64_t ir1_end) {
 
-    const struct ggml_tensor* src0 = dst->src[0];
-    const struct ggml_tensor* src1 = dst->src[1];
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
@@ -11804,7 +11803,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
         return;
     }
 
-    const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
     assert(ne12 % ne02 == 0);
@@ -12011,7 +12010,7 @@ UseGgmlGemm1:;
         if (ith != 0) {
             return;
         }
-        //Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
+        // Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
         atomic_store(&state->shared->current_chunk, nth);
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
@@ -12067,10 +12066,10 @@ UseGgmlGemm2:;
     UNUSED(chunks_executed);
 #endif
 
-    //This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
+    // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
     const int64_t nr0 = ne0;
 
-    //This is the size of the rest of the dimensions of the result
+    // This is the size of the rest of the dimensions of the result
     const int64_t nr1 = ne1 * ne2 * ne3;
 
     // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
@@ -12081,24 +12080,24 @@ UseGgmlGemm2:;
         num_rows_per_vec_dot = 1;
     }
 
-    //Now select a reasonable chunk size.
+    // Now select a reasonable chunk size.
     int chunk_size = 16;
 
-    //We need to step up the size if it's small
-    if (nr0 == 1 || nr1 == 1)
+    // We need to step up the size if it's small
+    if (nr0 == 1 || nr1 == 1) {
         chunk_size = 64;
+    }
 
     // distribute the work across the inner or outer loop based on which one is larger
-    //The number of chunks in the 0/1 dim.
-    //CEIL(nr0/chunk_size)
+    // The number of chunks in the 0/1 dim.
+    // CEIL(nr0/chunk_size)
     int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
     int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
 
-    //If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
+    // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
     // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915
     // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
-    if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa())
-    {
+    if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
         // distribute the thread work across the inner or outer loop based on which one is larger
         nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
         nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
@@ -12114,8 +12113,7 @@ UseGgmlGemm2:;
     //The first chunk comes from our thread_id, the rest will get auto-assigned.
     int current_chunk = ith;
 
-    while (current_chunk < nchunk0 * nchunk1)
-    {
+    while (current_chunk < nchunk0 * nchunk1) {
         const int64_t ith0 = current_chunk % nchunk0;
         const int64_t ith1 = current_chunk / nchunk0;
 
@@ -12131,8 +12129,9 @@ UseGgmlGemm2:;
         chunks_executed++;
 #endif
 
-        if (nth >= nchunk0 * nchunk1)
+        if (nth >= nchunk0 * nchunk1) {
             break;
+        }
 
         current_chunk = atomic_fetch_add(&state->shared->current_chunk, 1);
     }
@@ -19652,17 +19651,17 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
     return n_tasks;
 }
 
-static void ggml_graph_compute_thread_sync_node(int* node_n, struct ggml_compute_state* state, const bool do_yield) {
+static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
     // wait for other threads to finish
-    const int last_node_n = *node_n;
+    const int last_node_n = * node_n;
 
     while (true) {
         if (do_yield) {
             sched_yield();
         }
 
-        *node_n = atomic_load(&state->shared->node_n);
-        if (*node_n != last_node_n) break;
+        * node_n = atomic_load(&state->shared->node_n);
+        if (* node_n != last_node_n) break;
 #if defined(__SSE3__)
         //Tell the processor we're spinning. It's a processor hint for spinlocks.
         _mm_pause();
@@ -19670,17 +19669,17 @@ static void ggml_graph_compute_thread_sync_node(int* node_n, struct ggml_compute
     }
 }
 
-static void ggml_graph_compute_thread_sync_task(int* task_phase, struct ggml_compute_state* state, const bool do_yield) {
+static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
     // wait for other threads to finish
-    const int last_task_phase = *task_phase;
+    const int last_task_phase = * task_phase;
 
     while (true) {
         if (do_yield) {
             sched_yield();
         }
 
-        *task_phase = atomic_load(&state->shared->node_task);
-        if (*task_phase != last_task_phase) break;
+        * task_phase = atomic_load(&state->shared->node_task);
+        if (* task_phase != last_task_phase) break;
 #if defined(__SSE3__)
         //Tell the processor we're spinning. It's a processor hint for spinlocks.
         _mm_pause();
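
Editor's note (not part of the patch): for readers who want the chunk-scheduling idea in isolation, the sketch below shows the pattern the reformatted loop implements: thread i takes chunk i first, and the shared atomic counter is pre-set to nth so the first grab of a second chunk returns an unclaimed index. The pthread driver, process_chunk, and the NTHREADS/NCHUNK0/NCHUNK1 sizes are illustrative placeholders, not ggml APIs.

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NTHREADS 4
#define NCHUNK0  8   // hypothetical number of chunks along dim 0
#define NCHUNK1  4   // hypothetical number of chunks along dim 1

static atomic_int current_chunk;

// Stand-in for the per-chunk work (the role played by
// ggml_compute_forward_mul_mat_one_chunk in the patch).
static void process_chunk(int ith, int64_t ith0, int64_t ith1) {
    printf("thread %d: chunk (%lld, %lld)\n", ith, (long long) ith0, (long long) ith1);
}

static void * worker(void * arg) {
    const int ith = (int) (intptr_t) arg; // thread index, 0..NTHREADS-1
    const int nth = NTHREADS;

    int chunk = ith;                      // the first chunk comes from the thread id
    while (chunk < NCHUNK0 * NCHUNK1) {
        const int64_t ith0 = chunk % NCHUNK0; // chunk coordinate in dim 0
        const int64_t ith1 = chunk / NCHUNK0; // chunk coordinate in dim 1

        process_chunk(ith, ith0, ith1);

        if (nth >= NCHUNK0 * NCHUNK1) {
            break;                        // at most one chunk per thread is needed
        }
        chunk = atomic_fetch_add(&current_chunk, 1); // claim the next unprocessed chunk
    }
    return NULL;
}

int main(void) {
    // Chunks 0..nth-1 are implicitly claimed by thread ids, so the counter starts at nth.
    atomic_store(&current_chunk, NTHREADS);

    pthread_t threads[NTHREADS];
    for (int i = 0; i < NTHREADS; ++i) {
        pthread_create(&threads[i], NULL, worker, (void *) (intptr_t) i);
    }
    for (int i = 0; i < NTHREADS; ++i) {
        pthread_join(threads[i], NULL);
    }
    return 0;
}

Pre-seeding the counter to nth is the detail the "Every thread starts at ith" comment refers to: no thread touches the atomic until it looks for its second chunk, which removes one synchronization point at startup.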