Fix formatting
parent bd80601ea8
commit d9ba30a204

1 changed file with 27 additions and 28 deletions

 ggml.c | 13 ++++++-------
@@ -2435,7 +2435,6 @@ static void ggml_setup_op_has_task_pass(void) {
     }
 }
 
-
 //
 // NUMA support
 //
@@ -12085,8 +12084,9 @@ UseGgmlGemm2:;
     int chunk_size = 16;
 
     // We need to step up the size if it's small
-    if (nr0 == 1 || nr1 == 1)
+    if (nr0 == 1 || nr1 == 1) {
         chunk_size = 64;
+    }
 
     // distribute the work across the inner or outer loop based on which one is larger
     // The number of chunks in the 0/1 dim.
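For context on where this hunk leaves off: the chunk counts tested in the next hunk are derived from chunk_size by ceiling division. A minimal sketch of that step, assuming the CEIL-style form used in the surrounding ggml.c code (the computation itself sits between these two hunks and is not part of this diff):

    // Number of chunks along each dim: CEIL(nr0/chunk_size), CEIL(nr1/chunk_size).
    int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
    int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;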
@@ -12097,8 +12097,7 @@ UseGgmlGemm2:;
     // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
     // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915
     // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
-    if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa())
-    {
+    if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
         // distribute the thread work across the inner or outer loop based on which one is larger
         nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
         nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
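Once nchunk0/nchunk1 are settled, each chunk index is split into a 2-D position (ith0, ith1) and then into row ranges over src0/src1. A hedged sketch of that mapping, assuming per-chunk sizes dr0/dr1 and the MIN macro as defined elsewhere in ggml.c:

    // Per-chunk extents, again by ceiling division; MIN clamps the last,
    // possibly ragged, chunk to the true row counts.
    const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
    const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;

    const int64_t ir0_start = dr0 * ith0;
    const int64_t ir0_end   = MIN(ir0_start + dr0, nr0);
    const int64_t ir1_start = dr1 * ith1;
    const int64_t ir1_end   = MIN(ir1_start + dr1, nr1);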
@@ -12114,8 +12113,7 @@ UseGgmlGemm2:;
     //The first chunk comes from our thread_id, the rest will get auto-assigned.
     int current_chunk = ith;
 
-    while (current_chunk < nchunk0 * nchunk1)
-    {
+    while (current_chunk < nchunk0 * nchunk1) {
         const int64_t ith0 = current_chunk % nchunk0;
         const int64_t ith1 = current_chunk / nchunk0;
 
@@ -12131,8 +12129,9 @@ UseGgmlGemm2:;
         chunks_executed++;
 #endif
 
-        if (nth >= nchunk0 * nchunk1)
+        if (nth >= nchunk0 * nchunk1) {
             break;
+        }
 
         current_chunk = atomic_fetch_add(&state->shared->current_chunk, 1);
     }
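Taken together, the loop this commit reformats implements a simple work-stealing scheme: each thread starts on the chunk matching its id, then grabs further chunks with an atomic fetch-add on a shared counter. A self-contained sketch of the pattern using C11 atomics and pthreads; worker, process_chunk, next_chunk, and the constants are illustrative stand-ins, not ggml's actual scheduler:

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NTH     4   // number of worker threads
#define NCHUNKS 16  // total chunks of work

// Chunks 0..NTH-1 are pre-assigned by thread id, so the shared counter
// starts at NTH, mirroring how ith seeds current_chunk in the diff.
static atomic_int next_chunk = NTH;

static void process_chunk(int tid, int chunk) {
    printf("thread %d: chunk %d\n", tid, chunk);
}

static void * worker(void * arg) {
    const int ith = (int)(intptr_t) arg;

    // First chunk comes from the thread id; the rest are claimed with a
    // fetch-add, like atomic_fetch_add(&state->shared->current_chunk, 1).
    int chunk = ith;
    while (chunk < NCHUNKS) {
        process_chunk(ith, chunk);
        if (NTH >= NCHUNKS) {
            break; // at most one chunk per thread; nothing left to grab
        }
        chunk = atomic_fetch_add(&next_chunk, 1);
    }
    return NULL;
}

int main(void) {
    pthread_t th[NTH];
    for (int i = 0; i < NTH; ++i) {
        pthread_create(&th[i], NULL, worker, (void *)(intptr_t) i);
    }
    for (int i = 0; i < NTH; ++i) {
        pthread_join(th[i], NULL);
    }
    return 0;
}

The fetch-add keeps threads busy even when chunks take uneven time, which is one reason the per-thread re-chunking above was measured to help on NUMA systems (see the PR linked in the comments).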