diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 5c31548a6..2a263d2a2 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -293,6 +293,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     params.output_format = cmd_params_defaults.output_format;
     params.output_format_stderr = cmd_params_defaults.output_format_stderr;
     params.reps = cmd_params_defaults.reps;
+    params.numa = cmd_params_defaults.numa;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
diff --git a/ggml-backend.c b/ggml-backend.c
index 80e129cf8..68094e054 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1232,7 +1232,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
 static int ggml_backend_sched_set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
     if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
         *node_backend_id = cur_backend_id;
-        SET_CAUSE(node, "2.2");
+        SET_CAUSE(node, "2.1");
     } else {
         for (int b = 0; b < sched->n_backends; b++) {
             if (b == cur_backend_id) {
@@ -1326,7 +1326,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
     // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
 
-    // pass 2.2 expand gpu down
+    // expand gpu down
     {
         int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
@@ -1352,7 +1352,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
-    // pass 2.1 expand gpu up
+    // expand gpu up
     {
         int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1377,7 +1377,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
-    // pass 2.4 expand rest down
+    // expand rest down
     {
         int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
@@ -1393,7 +1393,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
-    // pass 2.3 expand rest up
+    // expand rest up
     {
         int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1410,13 +1410,48 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
     }
 
-    // pass 3: assign backends to remaining src from dst and view_src
+    // pass 3
+    // upgrade nodes to higher prio backends with compatible buffer types
+    // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
+    // however, we also need to verify that the sources are in compatible buffer types
+    // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
+    // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
+    // this is not uncommon since multiple backends can use host memory, with the same buffer type (e.g. BLAS and CPU)
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        if (ggml_is_view_op(node->op)) {
+            continue;
+        }
+        int * node_backend_id = &tensor_backend_id(node);
+        for (int b = 0; b < *node_backend_id; b++) {
+            if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
+                bool supported = true;
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * src = node->src[j];
+                    if (src == NULL) {
+                        continue;
+                    }
+                    if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
+                        supported = false;
+                        break;
+                    }
+                }
+                if (supported) {
+                    *node_backend_id = b;
+                    SET_CAUSE(node, "3.upg");
+                    break;
+                }
+            }
+        }
+    }
+
+    // pass 4: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         int * cur_backend_id = &tensor_backend_id(node);
         if (node->view_src != NULL && *cur_backend_id == -1) {
             *cur_backend_id = tensor_backend_id(node->view_src);
-            SET_CAUSE(node, "3.vsrc");
+            SET_CAUSE(node, "4.vsrc");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
@@ -1428,10 +1463,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (src->view_src != NULL) {
                 // views are always on the same backend as the source
                 *src_backend_id = tensor_backend_id(src->view_src);
-                SET_CAUSE(src, "3.vsrc");
+                SET_CAUSE(src, "4.vsrc");
             } else {
                 *src_backend_id = *cur_backend_id;
-                SET_CAUSE(src, "3.cur");
+                SET_CAUSE(src, "4.cur");
             }
         }
     }
@@ -1848,6 +1883,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     free(sched->tensor_copies);
     free(sched->node_backend_ids);
     free(sched->leaf_backend_ids);
+    free(sched->prev_node_backend_ids);
+    free(sched->prev_leaf_backend_ids);
     free(sched);
 }
 
@@ -1944,6 +1981,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     tensor_backend_id(node) = backend_index;
+    SET_CAUSE(node, "usr");
 }
 
 ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
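A note on the new pass 3: it is a "promote to the highest-priority compatible backend" scan, where a lower backend index means higher priority. The standalone C++ sketch below mimics the shape of that loop with toy Backend/Node types; these names are illustrative stand-ins, not the ggml API, and the buffer-type-equality test stands in for ggml_backend_sched_buffer_supported.

    // Minimal sketch of the pass-3 "upgrade" scan; Backend, Node, and
    // upgrade_node are hypothetical stand-ins for the scheduler internals.
    #include <cstdio>
    #include <vector>

    struct Backend {
        int  buft;        // buffer type id (e.g. host memory)
        bool supports_op; // whether this backend can run the node's op
    };

    struct Node {
        int backend_id;               // assigned backend (index = priority)
        std::vector<int> src_backend; // backends holding each source tensor
    };

    // Try to move `node` to a higher-priority backend (lower index) that uses
    // the same buffer type and can access all of its sources.
    static void upgrade_node(Node & node, const std::vector<Backend> & backends) {
        for (int b = 0; b < node.backend_id; b++) {
            if (backends[b].buft != backends[node.backend_id].buft || !backends[b].supports_op) {
                continue;
            }
            bool supported = true;
            for (int sb : node.src_backend) {
                // stricter than necessary, mirroring the pass-3 comment:
                // require the source's buffer type to match the candidate's
                if (backends[sb].buft != backends[b].buft) {
                    supported = false;
                    break;
                }
            }
            if (supported) {
                node.backend_id = b; // upgrade to the higher-priority backend
                break;
            }
        }
    }

    int main() {
        // backend 0 = BLAS, backend 1 = CPU, both in host memory (same buft)
        std::vector<Backend> backends = { {0, true}, {0, true} };
        Node node = { 1, {1, 1} };    // currently on CPU, sources on CPU
        upgrade_node(node, backends);
        printf("node now on backend %d\n", node.backend_id); // prints 0
    }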
diff --git a/ggml-blas.cpp b/ggml-blas.cpp
index 3d146fc01..089c73dd3 100644
--- a/ggml-blas.cpp
+++ b/ggml-blas.cpp
@@ -56,8 +56,6 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
 
     const enum ggml_type type = src0->type;
 
-    ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
-
     GGML_ASSERT(ne0 == ne01);
     GGML_ASSERT(ne1 == ne11);
     GGML_ASSERT(ne2 == ne12);
@@ -88,32 +86,39 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
     // convert src0 to float
     if (type != GGML_TYPE_F32) {
+        ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
         ggml_to_float_t const to_float = type_traits.to_float;
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
                 const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
-                float * const wplane = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane;
+                float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane;
+
+                const int min_cols_per_thread = 4096;
+                const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
+                const int n_threads = std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread));
 
 #ifdef GGML_USE_OPENMP
-                #pragma omp parallel for num_threads(ctx->n_threads)
+                #pragma omp parallel for num_threads(n_threads)
                 for (int64_t i01 = 0; i01 < ne01; i01++) {
                     to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
                 }
 #else
-                for (int i = 0; i < ctx->n_threads - 1; i++) {
-                    ctx->tasks.push_back(std::async(std::launch::async, [=]() {
-                        const int64_t start = i*ne01/ctx->n_threads;
-                        const int64_t end = (i + 1)*ne01/ctx->n_threads;
-                        for (int64_t i01 = start; i01 < end; i01++) {
-                            to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
-                        }
-                    }));
+                for (int i = 1; i < n_threads; i++) {
+                    const int64_t start = i*ne01/n_threads;
+                    const int64_t end = (i + 1)*ne01/n_threads;
+                    if (start < end) {
+                        ctx->tasks.push_back(std::async(std::launch::async, [=]() {
+                            for (int64_t i01 = start; i01 < end; i01++) {
+                                to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+                            }
+                        }));
+                    }
                 }
                 {
                     // reuse the current thread for the last task
-                    const int64_t start = (ctx->n_threads - 1)*ne01/ctx->n_threads;
-                    const int64_t end = ne01;
+                    const int64_t start = 0;
+                    const int64_t end = ne01/n_threads;
                     for (int64_t i01 = start; i01 < end; i01++) {
                         to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
                     }
                 }
             }
@@ -131,7 +136,6 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
 #endif
     }
 
-
 #if defined(OPENBLAS_VERSION)
     openblas_set_num_threads(ctx->n_threads);
 #endif
@@ -150,7 +154,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
             float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
             if (type != GGML_TYPE_F32) {
-                x = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane;
+                x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane;
             }
 
             cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
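The non-OpenMP path above now sizes the worker pool from the amount of work available and keeps chunk 0 on the calling thread, instead of unconditionally spawning ctx->n_threads - 1 tasks. A condensed sketch of that pattern follows, with a hypothetical row_convert standing in for to_float and hard-coded dimensions; the extra std::max(1, ...) clamp is an assumption added here to keep the divisor nonzero when ne01 is very small.

    // Sketch of the work-sized thread split; row_convert and the sizes are
    // illustrative stand-ins, not the ggml-blas internals.
    #include <algorithm>
    #include <cstdint>
    #include <future>
    #include <vector>

    static void row_convert(int64_t /*row*/) { /* convert one row to float */ }

    int main() {
        const int64_t ne00 = 128, ne01 = 10000; // cols, rows of the plane
        const int     max_threads = 8;

        // don't spawn threads unless each gets a worthwhile amount of work;
        // clamped to at least 1 (assumption) so the divisions below are safe
        const int min_cols_per_thread = 4096;
        const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
        const int n_threads = std::max(1, std::min(max_threads, (int)(ne01/min_rows_per_thread)));

        std::vector<std::future<void>> tasks;
        for (int i = 1; i < n_threads; i++) {          // chunks 1..n-1 go to workers
            const int64_t start = i*ne01/n_threads;
            const int64_t end   = (i + 1)*ne01/n_threads;
            if (start < end) {
                tasks.push_back(std::async(std::launch::async, [=]() {
                    for (int64_t r = start; r < end; r++) row_convert(r);
                }));
            }
        }
        for (int64_t r = 0; r < ne01/n_threads; r++) { // chunk 0 runs on this thread
            row_convert(r);
        }
        for (auto & t : tasks) t.wait();               // join the workers
    }

Reusing the calling thread for chunk 0 means the serial case (n_threads == 1) spawns no tasks at all.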
diff --git a/ggml.c b/ggml.c
index a5d143b5c..e372fee93 100644
--- a/ggml.c
+++ b/ggml.c
@@ -18749,6 +18749,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
     switch (node->op) {
         case GGML_OP_CPY:
         case GGML_OP_DUP:
+        case GGML_OP_CONT:
         case GGML_OP_ADD:
         case GGML_OP_ADD1:
         case GGML_OP_ACC:
@@ -18833,7 +18834,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
             } break;
         case GGML_OP_SCALE:
         case GGML_OP_SET:
-        case GGML_OP_CONT:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
@@ -18993,8 +18993,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
             sched_yield();
         }
 
-        * node_n = atomic_load(&state->shared->node_n);
-        if (* node_n != last_node_n) break;
+        *node_n = atomic_load(&state->shared->node_n);
+        if (*node_n != last_node_n) {
+            break;
+        }
+
 #if defined(__SSE3__)
         // Tell the processor we're spinning. It's a processor hint for spinlocks.
         _mm_pause();
@@ -19004,15 +19007,18 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
 
 static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
     // wait for other threads to finish
-    const int last_task_phase = * task_phase;
+    const int last_task_phase = *task_phase;
 
     while (true) {
         if (do_yield) {
             sched_yield();
         }
 
-        * task_phase = atomic_load(&state->shared->node_task);
-        if (* task_phase != last_task_phase) break;
+        *task_phase = atomic_load(&state->shared->node_task);
+        if (*task_phase != last_task_phase) {
+            break;
+        }
+
 #if defined(__SSE3__)
         // Tell the processor we're spinning. It's a processor hint for spinlocks.
         _mm_pause();
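The two sync helpers above share one spin-wait shape: reload an atomic phase counter until it changes, optionally yielding between polls, and issuing a pause hint while spinning. A generic C++ rendering of that shape follows, with std::atomic and std::this_thread::yield() standing in for ggml.c's C atomics and sched_yield(); wait_for_change is a hypothetical name, not part of ggml.

    // Generic spin-wait in the style of the ggml_graph_compute_thread_sync_*
    // helpers; std::atomic replaces the C11 atomics used by ggml.c.
    #include <atomic>
    #include <cstdio>
    #include <thread>
    #if defined(__SSE3__)
    #include <immintrin.h>   // _mm_pause
    #endif

    // Spin until `value` differs from `last`, yielding/pausing between polls.
    static int wait_for_change(const std::atomic<int> & value, int last, bool do_yield) {
        while (true) {
            if (do_yield) {
                std::this_thread::yield(); // back off when the wait may be long
            }
            const int cur = value.load(); // reload the shared phase counter
            if (cur != last) {
                return cur;               // another thread advanced the phase
            }
    #if defined(__SSE3__)
            _mm_pause();                  // spin-wait hint to the processor
    #endif
        }
    }

    int main() {
        std::atomic<int> node_n{0};
        std::thread worker([&] { node_n.store(1); }); // another thread advances the phase
        const int seen = wait_for_change(node_n, 0, /*do_yield=*/false);
        worker.join();
        printf("observed node_n = %d\n", seen); // prints 1
    }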