metal: better memory alloc w/ concurrency dispatch

The ggml-alloc should only free tensors at memory barriers.
2023-08-24 00:55:58 -04:00 · 2023-08-24 00:55:58 -04:00 · ee8b2aa75d
commit ee8b2aa75d
parent 44d5462b5c
1 changed files with 75 additions and 62 deletions
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -68,7 +68,7 @@ struct ggml_allocr {
    size_t max_size;
    bool measure;
    int parse_seq[GGML_MAX_NODES];
-    bool has_parse_seq;
+    int parse_seq_len;
 #ifdef GGML_ALLOCATOR_DEBUG
    struct ggml_tensor * allocated_tensors[1024];
@@ -239,14 +239,10 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
 }
 void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
-    int pos = 0;
    for (int i = 0; i < n; i++) {
-        if (list[i] != -1) {
+        alloc->parse_seq[i] = list[i];
-            alloc->parse_seq[pos] = list[i];
-            pos++;
    }
-    }
+    alloc->parse_seq_len = n;
-    alloc->has_parse_seq = true;
 }
 void ggml_allocr_reset(struct ggml_allocr * alloc) {
@@ -269,7 +265,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
        /*.max_size      = */ 0,
        /*.measure       = */ false,
        /*.parse_seq     = */ {0},
-        /*.has_parse_seq = */ false,
+        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
        /*.allocated_tensors = */ = {0},
 #endif
@@ -298,7 +294,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
        /*.max_size      = */ 0,
        /*.measure       = */ true,
        /*.parse_seq     = */ {0},
-        /*.has_parse_seq = */ false,
+        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
        /*.allocated_tensors = */ = {0},
 #endif
@@ -497,13 +493,14 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                allocate_node(alloc, input);
            }
        }
-        for (int ind = 0; ind < gf->n_nodes; ind++) {
+        // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
-            int i;
+        int last_barrier_pos = 0;
-            if (alloc->has_parse_seq) {
+        int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
-                i = alloc->parse_seq[ind];
+
-            } else {
+        for (int ind = 0; ind < n_nodes; ind++) {
-                i = ind;
+            // allocate a node if there is no parse_seq or this is not a barrier
-            }
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
+                int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
                struct ggml_tensor * node = gf->nodes[i];
                // allocate parents (leafs)
@@ -530,8 +527,19 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                    }
                }
                AT_PRINTF("\n");
+            }
            // update parents
+            // update immediately if there is no parse_seq
+            // update only at barriers if there is parse_seq
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+                int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
+                int update_end   = alloc->parse_seq_len ? ind              : ind + 1;
+                for (int i = update_start; i < update_end; i++) {
+                    int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
+                    struct ggml_tensor * node = gf->nodes[node_i];
                    for (int j = 0; j < GGML_MAX_SRC; j++) {
                        struct ggml_tensor * parent = node->src[j];
                        if (parent == NULL) {
@@ -559,7 +567,12 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                            }
                        }
                    }
+                }
                AT_PRINTF("\n");
+                if (alloc->parse_seq_len) {
+                    last_barrier_pos = ind + 1;
+                }
+            }
        }
        // free graph outputs here that wouldn't be freed otherwise because they have no children
        if (outputs != NULL && outputs[g] != NULL) {