Starting the buildup of the loop

This commit is contained in:
Kunnis 2024-05-09 23:22:12 -05:00
parent 9acaec58bd
commit c0557fa20a

21
ggml.c
View file

@ -12088,14 +12088,17 @@ UseGgmlGemm2:;
// distribute the thread work across the inner or outer loop based on which one is larger
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
const int64_t nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
const int64_t nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
const int64_t ith0 = ith % nth0;
const int64_t ith1 = ith / nth0;
int current_chunk = ith;
const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
{
const int64_t ith0 = current_chunk % nchunk0;
const int64_t ith1 = current_chunk / nchunk0;
const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
const int64_t ir0_start = dr0 * ith0;
const int64_t ir0_end = MIN(ir0_start + dr0, nr0);
@ -12111,6 +12114,12 @@ UseGgmlGemm2:;
ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
#ifdef GGML_PERF
chunks_executed++;
#endif
}
#ifdef GGML_PERF
// These numbers are useful when trying to measure how well the threading scheduling works.
//int64_t workSize = (ne01 * ne11 * ne12 * ne13 * ne00) / nchunk0 / nchunk1;