slaren 2024-06-08 17:00:26 +02:00
parent ecb75b5f54
commit e06659811e
4 changed files with 80 additions and 31 deletions


@@ -293,6 +293,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
params.output_format = cmd_params_defaults.output_format;
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
params.reps = cmd_params_defaults.reps;
params.numa = cmd_params_defaults.numa;
for (int i = 1; i < argc; i++) {
arg = argv[i];


@@ -1232,7 +1232,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
static int ggml_backend_sched_set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
*node_backend_id = cur_backend_id;
SET_CAUSE(node, "2.2");
SET_CAUSE(node, "2.1");
} else {
for (int b = 0; b < sched->n_backends; b++) {
if (b == cur_backend_id) {
@@ -1326,7 +1326,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
// pass 2.2 expand gpu down
// expand gpu down
{
int cur_backend_id = -1;
for (int i = 0; i < graph->n_nodes; i++) {
@@ -1352,7 +1352,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
}
// pass 2.1 expand gpu up
// expand gpu up
{
int cur_backend_id = -1;
for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1377,7 +1377,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
}
// pass 2.4 expand rest down
// expand rest down
{
int cur_backend_id = -1;
for (int i = 0; i < graph->n_nodes; i++) {
@@ -1393,7 +1393,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
}
// pass 2.3 expand rest up
// expand rest up
{
int cur_backend_id = -1;
for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1410,13 +1410,48 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
// pass 3: assign backends to remaining src from dst and view_src
// pass 3
// upgrade nodes to higher prio backends with compatible buffer types
// if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
// however, we also need to verify that the sources are in compatible buffer types
// (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
// however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
// this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
if (ggml_is_view_op(node->op)) {
continue;
}
int * node_backend_id = &tensor_backend_id(node);
for (int b = 0; b < *node_backend_id; b++) {
if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
bool supported = true;
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
continue;
}
if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
supported = false;
break;
}
}
if (supported) {
*node_backend_id = b;
SET_CAUSE(node, "3.upg");
break;
}
}
}
}
// pass 4: assign backends to remaining src from dst and view_src
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
int * cur_backend_id = &tensor_backend_id(node);
if (node->view_src != NULL && *cur_backend_id == -1) {
*cur_backend_id = tensor_backend_id(node->view_src);
SET_CAUSE(node, "3.vsrc");
SET_CAUSE(node, "4.vsrc");
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
@@ -1428,10 +1463,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (src->view_src != NULL) {
// views are always on the same backend as the source
*src_backend_id = tensor_backend_id(src->view_src);
SET_CAUSE(src, "3.vsrc");
SET_CAUSE(src, "4.vsrc");
} else {
*src_backend_id = *cur_backend_id;
SET_CAUSE(src, "3.cur");
SET_CAUSE(src, "4.cur");
}
}
}
@@ -1848,6 +1883,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
free(sched->tensor_copies);
free(sched->node_backend_ids);
free(sched->leaf_backend_ids);
free(sched->prev_node_backend_ids);
free(sched->prev_leaf_backend_ids);
free(sched);
}
@@ -1944,6 +1981,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
tensor_backend_id(node) = backend_index;
SET_CAUSE(node, "usr");
}
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {

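The scheduler part of this commit also tags user-forced assignments with the new "usr" cause (SET_CAUSE(node, "usr") above). Below is a minimal caller-side sketch, not part of the commit: the wrapper function and variable names (pin_node_to_cpu, cpu_backend, node, graph) are hypothetical, only ggml_backend_sched_set_tensor_backend() and ggml_backend_sched_graph_compute() are existing API calls, and the scheduler, graph and backend are assumed to have been created beforehand.

// minimal sketch, assuming `sched` was created with ggml_backend_sched_new(),
// `graph` was built with ggml_build_forward_expand(), `node` is a tensor in that
// graph, and `cpu_backend` is one of the backends that were passed to the scheduler
#include "ggml.h"
#include "ggml-backend.h"

static void pin_node_to_cpu(ggml_backend_sched_t sched, struct ggml_cgraph * graph,
                            struct ggml_tensor * node, ggml_backend_t cpu_backend) {
    // force this node onto the chosen backend before the graph is split;
    // with the scheduler's debug output compiled in, the assignment is now
    // reported with the "usr" cause instead of one of the numbered pass causes
    ggml_backend_sched_set_tensor_backend(sched, node, cpu_backend);

    // the expansion passes above take this user assignment as a starting point
    // when the graph is split across backends and computed
    ggml_backend_sched_graph_compute(sched, graph);
}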

@@ -56,8 +56,6 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
const enum ggml_type type = src0->type;
ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
GGML_ASSERT(ne2 == ne12);
@@ -88,32 +86,39 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
// convert src0 to float
if (type != GGML_TYPE_F32) {
ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
ggml_to_float_t const to_float = type_traits.to_float;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
float * const wplane = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane;
float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane;
const int min_cols_per_thread = 4096;
const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
const int n_threads = std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread));
#ifdef GGML_USE_OPENMP
#pragma omp parallel for num_threads(ctx->n_threads)
#pragma omp parallel for num_threads(n_threads)
for (int64_t i01 = 0; i01 < ne01; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
}
#else
for (int i = 0; i < ctx->n_threads - 1; i++) {
ctx->tasks.push_back(std::async(std::launch::async, [=]() {
const int64_t start = i*ne01/ctx->n_threads;
const int64_t end = (i + 1)*ne01/ctx->n_threads;
for (int64_t i01 = start; i01 < end; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
}
}));
for (int i = 1; i < n_threads; i++) {
const int64_t start = i*ne01/n_threads;
const int64_t end = (i + 1)*ne01/n_threads;
if (start < end) {
ctx->tasks.push_back(std::async(std::launch::async, [=]() {
for (int64_t i01 = start; i01 < end; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
}
}));
}
}
{
// reuse the current thread for the last task
const int64_t start = (ctx->n_threads - 1)*ne01/ctx->n_threads;
const int64_t end = ne01;
const int64_t start = 0;
const int64_t end = ne01/n_threads;
for (int64_t i01 = start; i01 < end; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
}
@@ -131,7 +136,6 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
#endif
}
#if defined(OPENBLAS_VERSION)
openblas_set_num_threads(ctx->n_threads);
#endif
@@ -150,7 +154,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
if (type != GGML_TYPE_F32) {
x = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane;
x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane;
}
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,

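To make the wplane indexing change above concrete, here is a small standalone program, not part of the commit and using hypothetical tensor dimensions, that evaluates the old and the new offset formula. The two agree whenever ne12 == ne02 and only diverge when src1 is broadcast over src0 (ne12 a multiple of ne02), which is presumably why the write in the conversion loop and the read before cblas_sgemm were changed together.

// standalone comparison of the old and new offsets into the converted-src0 work buffer
#include <stdio.h>

int main(void) {
    const long long ne00 = 4, ne01 = 3;    // row length and row count of one src0 plane (hypothetical)
    const long long ne02 = 2, ne03 = 2;    // src0 batch dimensions (hypothetical)
    const long long ne12 = 4;              // src1 batch dimension, broadcast case: ne12 % ne02 == 0
    const long long ne_plane = ne00*ne01;  // floats per converted plane

    for (long long i03 = 0; i03 < ne03; i03++) {
        for (long long i02 = 0; i02 < ne02; i02++) {
            const long long off_old = i03*ne12*ne_plane + i02*ne_plane;  // before this commit
            const long long off_new = i02*ne_plane + i03*ne02*ne_plane;  // after this commit
            printf("i03=%lld i02=%lld old=%lld new=%lld\n", i03, i02, off_old, off_new);
        }
    }
    return 0;
}

The threading change in the same hunk is separate: the worker count is capped so that each thread converts at least min_rows_per_thread rows; with GGML_USE_OPENMP the whole conversion loop becomes one parallel for, otherwise rows [0, ne01/n_threads) run on the calling thread and the remaining row ranges are handed to std::async tasks.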
ggml.c

@@ -18749,6 +18749,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
switch (node->op) {
case GGML_OP_CPY:
case GGML_OP_DUP:
case GGML_OP_CONT:
case GGML_OP_ADD:
case GGML_OP_ADD1:
case GGML_OP_ACC:
@@ -18833,7 +18834,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
} break;
case GGML_OP_SCALE:
case GGML_OP_SET:
case GGML_OP_CONT:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
@@ -18993,8 +18993,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
sched_yield();
}
* node_n = atomic_load(&state->shared->node_n);
if (* node_n != last_node_n) break;
*node_n = atomic_load(&state->shared->node_n);
if (*node_n != last_node_n) {
break;
}
#if defined(__SSE3__)
// Tell the processor we're spinning. It's a processor hint for spinlocks.
_mm_pause();
@@ -19004,15 +19007,18 @@ static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_co
static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
// wait for other threads to finish
const int last_task_phase = * task_phase;
const int last_task_phase = *task_phase;
while (true) {
if (do_yield) {
sched_yield();
}
* task_phase = atomic_load(&state->shared->node_task);
if (* task_phase != last_task_phase) break;
*task_phase = atomic_load(&state->shared->node_task);
if (*task_phase != last_task_phase) {
break;
}
#if defined(__SSE3__)
// Tell the processor we're spinning. It's a processor hint for spinlocks.
_mm_pause();