Updating comments with what we've learned.
This commit is contained in:
parent
1c68ea8d9f
commit
bd80601ea8
1 changed file with 3 additions and 5 deletions
8
ggml.c
8
ggml.c
|
@ -12094,13 +12094,11 @@ UseGgmlGemm2:;
|
||||||
int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
|
int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
|
||||||
int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
|
int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
|
||||||
|
|
||||||
//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
|
|
||||||
|
|
||||||
//If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
|
//If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
|
||||||
|
// Also, chunking by thread was measured to perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915
|
||||||
|
// In theory, chunking should be just as useful on NUMA and non-NUMA systems, but testing disagreed with that.
|
||||||
if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa())
|
if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa())
|
||||||
{
|
{
|
||||||
//if (ith == 0)
|
|
||||||
// printf("rechunked");
|
|
||||||
// distribute the thread work across the inner or outer loop based on which one is larger
|
// distribute the thread work across the inner or outer loop based on which one is larger
|
||||||
nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
|
nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
|
||||||
nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
|
nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
|
||||||
|
@ -12111,7 +12109,7 @@ UseGgmlGemm2:;
|
||||||
const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
|
const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
|
||||||
|
|
||||||
//if (ith == 0)
|
//if (ith == 0)
|
||||||
// printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1);
|
// printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1);
|
||||||
|
|
||||||
//The first chunk comes from our thread_id, the rest will get auto-assigned.
|
//The first chunk comes from our thread_id, the rest will get auto-assigned.
|
||||||
int current_chunk = ith;
|
int current_chunk = ith;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue