improve graph build time

2023-07-22 21:48:57 +02:00 · 2023-07-22 21:48:57 +02:00 · 261fdaae80
commit 261fdaae80
parent 2f9cf974a0
4 changed files with 38 additions and 22 deletions
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@ -1342,17 +1342,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
 // expand the graph nodes without creating leafs.
 struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) {
    // check if already visited
-    for (int i = 0; i < g->n_nodes; i++) {
+    if (t->visited) {
        if (g->nodes[i] == t) {
        return t;
    }
-    }
+    t->visited = true;
    for (int i = 0; i < g->n_leafs; i++) {
        if (g->leafs[i] == t) {
            return t;
        }
    }
    for (int i = 0; i < GGML_MAX_SRC; ++i) {
        if (t->src[i]) {
--- a/ggml.c
+++ b/ggml.c
@ -4594,6 +4594,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
        /*.is_param     =*/ false,
        /*.grad         =*/ NULL,
        /*.src          =*/ { NULL },
        /*.visited      =*/ false,
        /*.perf_runs    =*/ 0,
        /*.perf_cycles  =*/ 0,
        /*.perf_time_us =*/ 0,
@ -15752,17 +15753,11 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
    }
    // check if already visited
-    for (int i = 0; i < cgraph->n_nodes; i++) {
+    if (node->visited) {
-        if (cgraph->nodes[i] == node) {
+        GGML_ASSERT(cgraph->n_nodes > 0 || cgraph->n_leafs > 0); // to fix this, call ggml_graph_close() after building the graph
        return;
    }
-    }
+    node->visited = true;
    for (int i = 0; i < cgraph->n_leafs; i++) {
        if (cgraph->leafs[i] == node) {
            return;
        }
    }
    for (int i = 0; i < GGML_MAX_SRC; ++i) {
        if (node->src[i]) {
@ -15814,13 +15809,28 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten
 }
 // Adds `tensor` to an existing graph by delegating to ggml_build_forward_impl()
 // with its last argument set to true (presumably the "expand" flag — confirm
 // against ggml_build_forward_impl's signature).
 // Asserts that the graph has not been closed yet.
 void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
    // a closed graph must not be extended
    GGML_ASSERT(!cgraph->closed && "graph is closed");
    ggml_build_forward_impl(cgraph, tensor, true);
 }
 // Marks the graph as closed and clears the `visited` flag on every tensor
 // stored in cgraph->nodes and cgraph->leafs.
 // Calling it again on an already-closed graph is a no-op.
 void ggml_graph_close(struct ggml_cgraph * cgraph) {
    // already closed: nothing to reset
    if (cgraph->closed) {
        return;
    }
    // reset the visited flag of all graph nodes
    for (int i = 0; i < cgraph->n_nodes; ++i) {
        cgraph->nodes[i]->visited = false;
    }
    // reset the visited flag of all graph leafs
    for (int i = 0; i < cgraph->n_leafs; ++i) {
        cgraph->leafs[i]->visited = false;
    }
    cgraph->closed = true;
 }
 struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
    struct ggml_cgraph result = {
        /*.n_nodes      =*/ 0,
        /*.n_leafs      =*/ 0,
        /*.closed       =*/ false,
        /*.nodes        =*/ { NULL },
        /*.grads        =*/ { NULL },
        /*.leafs        =*/ { NULL },
@ -15865,7 +15875,7 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
        if (node->is_param) {
            GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
-            ggml_build_forward_impl(&result, node->grad, true);
+            ggml_build_forward_expand(&result, node->grad);
        }
    }
@ -16135,6 +16145,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 }
 struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
    ggml_graph_close(cgraph);
    if (n_threads <= 0) {
        n_threads = GGML_DEFAULT_N_THREADS;
    }
--- a/ggml.h
+++ b/ggml.h
@ -427,6 +427,8 @@ extern "C" {
        struct ggml_tensor * grad;
        struct ggml_tensor * src[GGML_MAX_SRC];
        bool visited;   // used to build graphs
        // performance
        int     perf_runs;
        int64_t perf_cycles;
@ -438,7 +440,7 @@ extern "C" {
        void * extra; // extra things e.g. for ggml-cuda.cu
-        char padding[8];
+        char padding[4];
    };
    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -463,6 +465,7 @@ extern "C" {
    struct ggml_cgraph {
        int n_nodes;
        int n_leafs;
        bool closed;
        struct ggml_tensor * nodes[GGML_MAX_NODES];
        struct ggml_tensor * grads[GGML_MAX_NODES];
@ -1349,6 +1352,11 @@ extern "C" {
    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
    // resets the visited flag for all the tensors in the graph
    // called by ggml_graph_plan()
    // shouldn't be necessary to call manually, except when building multiple graphs without computing them
    GGML_API void ggml_graph_close(struct ggml_cgraph * cgraph);
    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
    GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
--- a/llama.cpp
+++ b/llama.cpp
@ -1701,6 +1701,8 @@ static bool llama_eval_internal(
    // logits -> probs
    //cur = ggml_soft_max_inplace(ctx0, cur);
    //fprintf(stderr, "graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
    // run the computation
    ggml_build_forward_expand(&gf, cur);
@ -1710,6 +1712,7 @@ static bool llama_eval_internal(
 #ifdef GGML_USE_METAL
    if (lctx.ctx_metal && N == 1) {
        ggml_graph_close(&gf); // should only be required for the Metal backend, as ggml_graph_plan() does this automatically
        ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
        ggml_metal_graph_compute(lctx.ctx_metal, &gf);
        ggml_metal_get_tensor   (lctx.ctx_metal, cur);