Interleave threads

2023-04-07 17:00:37 -04:00 · 2023-04-07 17:00:37 -04:00 · 5e5a653555
commit 5e5a653555
parent 1b6fd5470b
1 changed file with 7 additions and 22 deletions
--- a/ggml.c
+++ b/ggml.c
@@ -4938,12 +4938,6 @@ static void ggml_compute_forward_dup_f16(

    const int thread_num = params->ith;
    const int total_threads = params->nth;
-    const int64_t regions = ne03 * ne02 * ne01 * ne00;
-
-    const int64_t regions_per_thread = (regions + total_threads - 1) / total_threads;
-
-    const int64_t thread_start_region = regions_per_thread * thread_num;
-    const int64_t thread_stop_region = MIN(thread_start_region + regions_per_thread, regions);

    int region_index = 0;

@@ -4952,9 +4946,9 @@ static void ggml_compute_forward_dup_f16(
            for (int64_t i02 = 0; i02 < ne02; i02++) {
                for (int64_t i01 = 0; i01 < ne01; i01++) {
                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        if (region_index > thread_stop_region) break;

-                        if (region_index++ >= thread_start_region) {
+                        // Interleave execution so that in a 4 thread run thread 0 copies regions 0,4,8, ...
+                        if ((region_index++ % total_threads) == thread_num) {
                            const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
                            char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);

@@ -4984,9 +4978,8 @@ static void ggml_compute_forward_dup_f16(
                for (int64_t i01 = 0; i01 < ne01; i01++) {
                    for (int64_t i00 = 0; i00 < ne00; i00++) {

-                        if (region_index > thread_stop_region) break;
-
-                        if (region_index++ >= thread_start_region) {
+                        // Interleave execution so that in a 4 thread run thread 0 copies regions 0,4,8, ...
+                        if ((region_index++ % total_threads) == thread_num) {
                            const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
                                  char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);

@@ -5053,12 +5046,6 @@ static void ggml_compute_forward_dup_f32(

    const int thread_num = params->ith;
    const int total_threads = params->nth;
-    const int64_t regions = ne03 * ne02 * ne01 * ne00;
-
-    const int64_t regions_per_thread = (regions + total_threads - 1) / total_threads;
-
-    const int64_t thread_start_region = regions_per_thread * thread_num;
-    const int64_t thread_stop_region = MIN(thread_start_region + regions_per_thread, regions);

    int region_index = 0;

@@ -5067,9 +5054,9 @@ static void ggml_compute_forward_dup_f32(
            for (int64_t i02 = 0; i02 < ne02; i02++) {
                for (int64_t i01 = 0; i01 < ne01; i01++) {
                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        if (region_index > thread_stop_region) break;

-                        if (region_index++ >= thread_start_region) {
+                        // Interleave execution so that in a 4 thread run thread 0 copies regions 0,4,8, ...
+                        if ((region_index++ % total_threads) == thread_num) {
                            const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
                            char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);

@@ -5098,9 +5085,7 @@ static void ggml_compute_forward_dup_f32(
            for (int64_t i02 = 0; i02 < ne02; i02++) {
                for (int64_t i01 = 0; i01 < ne01; i01++) {
                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        if (region_index > thread_stop_region) break;
-
-                        if (region_index++ >= thread_start_region) {
+                        if ((region_index++ % total_threads) == thread_num) {
                            const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
                            char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);