replace n_views and n_children in ggml_tensor with a hash table in the allocator

2023-07-27 18:34:21 +02:00 · 2023-07-27 18:34:21 +02:00 · e39e62ba4a
commit e39e62ba4a
parent af7bd42b2a
4 changed files with 51 additions and 32 deletions
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@ -14,6 +14,35 @@
 //#define AT_PRINTF printf
 #define AT_PRINTF(...) ((void)0)

+struct hash_node {
+    struct ggml_tensor * t;
+    int n_children;
+    int n_views;
+};
+
+static size_t hash(void * p) {
+    return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
+}
+
+static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) {
+    size_t h = hash(t);
+
+    // linear probing
+    size_t i = h;
+    while (hash_table[i].t != NULL) {
+        if (hash_table[i].t == t) {
+            return &hash_table[i];
+        }
+        i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
+        if (i == h) {
+            // hash table is full
+            GGML_ASSERT(false);
+        }
+    }
+
+    hash_table[i].t = t;
+    return &hash_table[i];
+}

 // TODO: GGML_PAD ?
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
@ -35,6 +64,7 @@ struct ggml_allocator {
    size_t alignment;
    int n_free_blocks;
    struct free_block free_blocks[MAX_FREE_BLOCKS];
+    struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
    size_t max_size;
    bool measure;

@ -215,6 +245,7 @@ struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alig
        /*.alignment     = */ alignment,
        /*.n_free_blocks = */ 0,
        /*.free_blocks   = */ {{0}},
+        /*.hash_table    = */ {{0}},
        /*.max_size      = */ 0,
        /*.measure       = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
@ -241,6 +272,7 @@ struct ggml_allocator * ggml_allocator_new_measure(size_t alignment) {
        /*.alignment     = */ alignment,
        /*.n_free_blocks = */ 0,
        /*.free_blocks   = */ {{0}},
+        /*.hash_table    = */ {{0}},
        /*.max_size      = */ 0,
        /*.measure       = */ true,
 #ifdef GGML_ALLOCATOR_DEBUG
@ -305,7 +337,7 @@ static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
    return parent;
 }

-bool ggml_op_can_inplace(enum ggml_op op) {
+static bool ggml_op_can_inplace(enum ggml_op op) {
    switch (op) {
        case GGML_OP_SCALE:
        case GGML_OP_DIAG_MASK_ZERO:
@ -333,6 +365,7 @@ bool ggml_op_can_inplace(enum ggml_op op) {
 }

 static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * node) {
+    struct hash_node * ht = alloc->hash_table;
    if (node->data == NULL) {
        if (ggml_is_view(node)) {
            size_t offset;
@ -360,10 +393,12 @@ static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * no
                if (parent == NULL) {
                    break;
                }
-                if (parent->data != NULL && parent->n_children == 1 && parent->n_views == 0 && ggml_are_same_layout(node, parent) && ggml_op_can_inplace(node->op)) {
+                struct hash_node * p_hn = hash_get(ht, parent);
+                if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent) && ggml_op_can_inplace(node->op)) {
                    if (ggml_is_view(parent)) {
                        struct ggml_tensor * view_src = get_view_source(parent);
-                        if (view_src->n_views == 1 && view_src->n_children == 0 && view_src->data == parent->data) {
+                        struct hash_node * view_src_hn = hash_get(ht, view_src);
+                        if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                            // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
                            // the parent's data that it will need later (same layout requirement). the problem is that then
                            // we cannot free the tensor because the original address of the allocation is lost.
@ -391,21 +426,9 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
    struct ggml_cgraph ** graphs, int n_graphs,
    struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {

-    // reset counters
-    for (int g = 0; g < n_graphs; g++) {
-        struct ggml_cgraph * gf = graphs[g];
-        for (int i = 0; i < gf->n_nodes; i++) {
-            struct ggml_tensor * node = gf->nodes[i];
-            node->n_children = 0;
-            node->n_views = 0;
-        }
-
-        for (int i = 0; i < gf->n_leafs; i++) {
-            struct ggml_tensor * leaf = gf->leafs[i];
-            leaf->n_children = 0;
-            leaf->n_views = 0;
-        }
-    }
+    // reset hash table
+    struct hash_node * ht = alloc->hash_table;
+    memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE);

    // count number of children and views
    for (int g = 0; g < n_graphs; g++) {
@ -415,7 +438,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(

            if (ggml_is_view(node)) {
                struct ggml_tensor * view_src = get_view_source(node);
-                view_src->n_views += 1;
+                hash_get(ht, view_src)->n_views += 1;
            }

            for (int j = 0; j < GGML_MAX_SRC; j++) {
@ -423,7 +446,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                if (parent == NULL) {
                    break;
                }
-                parent->n_children += 1;
+                hash_get(ht, parent)->n_children += 1;
            }
        }
    }
@ -474,16 +497,18 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                if (parent == NULL) {
                    break;
                }
-                parent->n_children -= 1;
+                struct hash_node * p_hn = hash_get(ht, parent);
+                p_hn->n_children -= 1;

                //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);

-                if (parent->n_children == 0 && parent->n_views == 0) {
+                if (p_hn->n_children == 0 && p_hn->n_views == 0) {
                    if (ggml_is_view(parent)) {
                        struct ggml_tensor * view_src = get_view_source(parent);
-                        view_src->n_views -= 1;
+                        struct hash_node * view_src_hn = hash_get(ht, view_src);
+                        view_src_hn->n_views -= 1;
                        AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
-                        if (view_src->n_views == 0 && view_src->n_children == 0 && view_src->data != node->data) {
+                        if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
                            ggml_allocator_free_tensor(alloc, view_src);
                        }
                    }
--- a/ggml.c
+++ b/ggml.c
@ -4612,8 +4612,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
        /*.data         =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
        /*.name         =*/ { 0 },
        /*.extra        =*/ NULL,
-        /*.n_children   =*/ 0,
-        /*.n_views      =*/ 0,
        /*.padding      =*/ { 0 },
    };

--- a/ggml.h
+++ b/ggml.h
@ -451,11 +451,7 @@ extern "C" {

        void * extra; // extra things e.g. for ggml-cuda.cu

-        // temp - used by allocator
-        int n_children;
-        int n_views;
-
-        char padding[16];
+        char padding[4];
    };

    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
--- a/llama.cpp
+++ b/llama.cpp
@ -1813,7 +1813,7 @@ static bool llama_eval_internal(
    ggml_allocator_alloc_graph_tensors(lctx.alloc, gf);
 #endif

-    // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
+    // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

    // for big prompts, if BLAS is enabled, it is better to use only one thread
    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance