slaren 2024-06-08 17:00:26 +02:00
parent ecb75b5f54
commit e06659811e
4 changed files with 80 additions and 31 deletions


@@ -293,6 +293,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
params.output_format = cmd_params_defaults.output_format;
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
params.reps = cmd_params_defaults.reps;
params.numa = cmd_params_defaults.numa;
for (int i = 1; i < argc; i++) {
arg = argv[i];


@@ -1232,7 +1232,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
static int ggml_backend_sched_set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
*node_backend_id = cur_backend_id;
SET_CAUSE(node, "2.2");
SET_CAUSE(node, "2.1");
} else {
for (int b = 0; b < sched->n_backends; b++) {
if (b == cur_backend_id) {
@@ -1326,7 +1326,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
// pass 2.2 expand gpu down
// expand gpu down
{
int cur_backend_id = -1;
for (int i = 0; i < graph->n_nodes; i++) {
@@ -1352,7 +1352,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
}
// pass 2.1 expand gpu up
// expand gpu up
{
int cur_backend_id = -1;
for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1377,7 +1377,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
}
// pass 2.4 expand rest down
// expand rest down
{
int cur_backend_id = -1;
for (int i = 0; i < graph->n_nodes; i++) {
@@ -1393,7 +1393,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
}
// pass 2.3 expand rest up
// expand rest up
{
int cur_backend_id = -1;
for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1410,13 +1410,48 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
// pass 3: assign backends to remaining src from dst and view_src
// pass 3
// upgrade nodes to higher prio backends with compatible buffer types
// if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
// however, we also need to verify that the sources are in compatible buffer types
// (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
// however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
// this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
if (ggml_is_view_op(node->op)) {
continue;
}
int * node_backend_id = &tensor_backend_id(node);
for (int b = 0; b < *node_backend_id; b++) {
if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
bool supported = true;
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
continue;
}
if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
supported = false;
break;
}
}
if (supported) {
*node_backend_id = b;
SET_CAUSE(node, "3.upg");
break;
}
}
}
}
// pass 4: assign backends to remaining src from dst and view_src
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
int * cur_backend_id = &tensor_backend_id(node);
if (node->view_src != NULL && *cur_backend_id == -1) {
*cur_backend_id = tensor_backend_id(node->view_src);
SET_CAUSE(node, "3.vsrc");
SET_CAUSE(node, "4.vsrc");
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
@@ -1428,10 +1463,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (src->view_src != NULL) {
// views are always on the same backend as the source
*src_backend_id = tensor_backend_id(src->view_src);
SET_CAUSE(src, "3.vsrc");
SET_CAUSE(src, "4.vsrc");
} else {
*src_backend_id = *cur_backend_id;
SET_CAUSE(src, "3.cur");
SET_CAUSE(src, "4.cur");
}
}
}
@@ -1848,6 +1883,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
free(sched->tensor_copies);
free(sched->node_backend_ids);
free(sched->leaf_backend_ids);
free(sched->prev_node_backend_ids);
free(sched->prev_leaf_backend_ids);
free(sched);
}
@@ -1944,6 +1981,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
tensor_backend_id(node) = backend_index;
SET_CAUSE(node, "usr");
}
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {

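The scheduler part of this commit also tags user-forced assignments with the new "usr" cause (SET_CAUSE(node, "usr") above). Below is a minimal caller-side sketch, not part of the commit: the wrapper function and variable names (pin_node_to_cpu, cpu_backend, node, graph) are hypothetical, only ggml_backend_sched_set_tensor_backend() and ggml_backend_sched_graph_compute() are existing API calls, and the scheduler, graph and backend are assumed to have been created beforehand.

// minimal sketch, assuming `sched` was created with ggml_backend_sched_new(),
// `graph` was built with ggml_build_forward_expand(), `node` is a tensor in that
// graph, and `cpu_backend` is one of the backends that were passed to the scheduler
#include "ggml.h"
#include "ggml-backend.h"

static void pin_node_to_cpu(ggml_backend_sched_t sched, struct ggml_cgraph * graph,
                            struct ggml_tensor * node, ggml_backend_t cpu_backend) {
    // force this node onto the chosen backend before the graph is split;
    // with the scheduler's debug output compiled in, the assignment is now
    // reported with the "usr" cause instead of one of the numbered pass causes
    ggml_backend_sched_set_tensor_backend(sched, node, cpu_backend);

    // the expansion passes above take this user assignment as a starting point
    // when the graph is split across backends and computed
    ggml_backend_sched_graph_compute(sched, graph);
}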

@@ -56,8 +56,6 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
const enum ggml_type type = src0->type;
ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
GGML_ASSERT(ne2 == ne12);
@@ -88,32 +86,39 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
// convert src0 to float
if (type != GGML_TYPE_F32) {
ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
ggml_to_float_t const to_float = type_traits.to_float;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
float * const wplane = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane;
float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane;
const int min_cols_per_thread = 4096;
const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
const int n_threads = std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread));
#ifdef GGML_USE_OPENMP
#pragma omp parallel for num_threads(ctx->n_threads)
#pragma omp parallel for num_threads(n_threads)
for (int64_t i01 = 0; i01 < ne01; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
}
#else
for (int i = 0; i < ctx->n_threads - 1; i++) {
ctx->tasks.push_back(std::async(std::launch::async, [=]() {
const int64_t start = i*ne01/ctx->n_threads;
const int64_t end = (i + 1)*ne01/ctx->n_threads;
for (int64_t i01 = start; i01 < end; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
}
}));
for (int i = 1; i < n_threads; i++) {
const int64_t start = i*ne01/n_threads;
const int64_t end = (i + 1)*ne01/n_threads;
if (start < end) {
ctx->tasks.push_back(std::async(std::launch::async, [=]() {
for (int64_t i01 = start; i01 < end; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
}
}));
}
}
{
// reuse the current thread for the last task
const int64_t start = (ctx->n_threads - 1)*ne01/ctx->n_threads;
const int64_t end = ne01;
const int64_t start = 0;
const int64_t end = ne01/n_threads;
for (int64_t i01 = start; i01 < end; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
}
@@ -131,7 +136,6 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
#endif
}
#if defined(OPENBLAS_VERSION)
openblas_set_num_threads(ctx->n_threads);
#endif
@@ -150,7 +154,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
if (type != GGML_TYPE_F32) {
x = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane;
x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane;
}
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,

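To make the wplane indexing change above concrete, here is a small standalone program, not part of the commit and using hypothetical tensor dimensions, that evaluates the old and the new offset formula. The two agree whenever ne12 == ne02 and only diverge when src1 is broadcast over src0 (ne12 a multiple of ne02), which is presumably why the write in the conversion loop and the read before cblas_sgemm were changed together.

// standalone comparison of the old and new offsets into the converted-src0 work buffer
#include <stdio.h>

int main(void) {
    const long long ne00 = 4, ne01 = 3;    // row length and row count of one src0 plane (hypothetical)
    const long long ne02 = 2, ne03 = 2;    // src0 batch dimensions (hypothetical)
    const long long ne12 = 4;              // src1 batch dimension, broadcast case: ne12 % ne02 == 0
    const long long ne_plane = ne00*ne01;  // floats per converted plane

    for (long long i03 = 0; i03 < ne03; i03++) {
        for (long long i02 = 0; i02 < ne02; i02++) {
            const long long off_old = i03*ne12*ne_plane + i02*ne_plane;  // before this commit
            const long long off_new = i02*ne_plane + i03*ne02*ne_plane;  // after this commit
            printf("i03=%lld i02=%lld old=%lld new=%lld\n", i03, i02, off_old, off_new);
        }
    }
    return 0;
}

The threading change in the same hunk is separate: the worker count is capped so that each thread converts at least min_rows_per_thread rows; with GGML_USE_OPENMP the whole conversion loop becomes one parallel for, otherwise rows [0, ne01/n_threads) run on the calling thread and the remaining row ranges are handed to std::async tasks.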
ggml.c

@@ -18749,6 +18749,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
switch (node->op) {
case GGML_OP_CPY:
case GGML_OP_DUP:
case GGML_OP_CONT:
case GGML_OP_ADD:
case GGML_OP_ADD1:
case GGML_OP_ACC:
@@ -18833,7 +18834,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
} break;
case GGML_OP_SCALE:
case GGML_OP_SET:
case GGML_OP_CONT:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
@@ -18993,8 +18993,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
sched_yield();
}
* node_n = atomic_load(&state->shared->node_n);
if (* node_n != last_node_n) break;
*node_n = atomic_load(&state->shared->node_n);
if (*node_n != last_node_n) {
break;
}
#if defined(__SSE3__)
// Tell the processor we're spinning. It's a processor hint for spinlocks.
_mm_pause();
@@ -19004,15 +19007,18 @@ static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_co
static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
// wait for other threads to finish
const int last_task_phase = * task_phase;
const int last_task_phase = *task_phase;
while (true) {
if (do_yield) {
sched_yield();
}
* task_phase = atomic_load(&state->shared->node_task);
if (* task_phase != last_task_phase) break;
*task_phase = atomic_load(&state->shared->node_task);
if (*task_phase != last_task_phase) {
break;
}
#if defined(__SSE3__)
// Tell the processor we're spinning. It's a processor hint for spinlocks.
_mm_pause();