rename ggml_allocator to ggml_allocr

cleanup

ggml-ci
slaren 2023-07-29 15:01:43 +02:00
parent cd4a8cd28c
commit 570aa7ceeb
3 changed files with 68 additions and 76 deletions
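
In short, only the public allocator interface is renamed (the internal static helpers keep their ggml_allocator_ prefix and merely change their parameter type). The mapping, taken from the header change in this commit, is:

// struct ggml_allocator is now struct ggml_allocr
GGML_API struct ggml_allocr * ggml_allocr_new        (void * data, size_t size, size_t alignment);              // was ggml_allocator_new
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);                                        // was ggml_allocator_new_measure
GGML_API void                 ggml_allocr_free       (struct ggml_allocr * alloc);                              // was ggml_allocator_free
GGML_API bool                 ggml_allocr_is_measure (struct ggml_allocr * alloc);                              // was ggml_allocator_is_measure
GGML_API void                 ggml_allocr_reset      (struct ggml_allocr * alloc);                              // was ggml_allocator_reset
GGML_API void                 ggml_allocr_alloc      (struct ggml_allocr * alloc, struct ggml_tensor * tensor); // was ggml_allocator_alloc_tensor
GGML_API size_t               ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);  // was ggml_allocator_alloc_graph_tensors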

View file

@@ -58,7 +58,7 @@ struct free_block {
#define MAX_FREE_BLOCKS 128
struct ggml_allocator {
struct ggml_allocr {
void * data;
size_t size;
size_t alignment;
@@ -97,13 +97,13 @@ static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_t
#endif
static size_t ggml_allocator_get_alloc_size(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
return ggml_nbytes(tensor);
UNUSED(alloc);
}
void ggml_allocator_alloc_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment);
@@ -163,7 +163,7 @@ void ggml_allocator_alloc_tensor(struct ggml_allocator * alloc, struct ggml_tens
}
// this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_allocator_free_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
void * ptr = tensor->data;
if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
@@ -229,17 +229,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocator * alloc, struct ggm
alloc->n_free_blocks++;
}
void ggml_allocator_reset(struct ggml_allocator * alloc) {
void ggml_allocr_reset(struct ggml_allocr * alloc) {
alloc->n_free_blocks = 1;
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
alloc->free_blocks[0].size = alloc->size - align_offset;
}
struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alignment) {
struct ggml_allocator * alloc = (struct ggml_allocator *)malloc(sizeof(struct ggml_allocator) /* + n_free_blocks * sizeof(struct free_block) */);
struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
*alloc = (struct ggml_allocator){
*alloc = (struct ggml_allocr){
/*.data = */ data,
/*.size = */ size,
/*.alignment = */ alignment,
@@ -253,7 +253,7 @@ struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alig
#endif
};
ggml_allocator_reset(alloc);
ggml_allocr_reset(alloc);
return alloc;
}
@@ -263,10 +263,10 @@ struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alig
static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
struct ggml_allocator * ggml_allocator_new_measure(size_t alignment) {
struct ggml_allocator * alloc = (struct ggml_allocator *)malloc(sizeof(struct ggml_allocator) /* + n_free_blocks * sizeof(struct free_block) */);
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
*alloc = (struct ggml_allocator){
*alloc = (struct ggml_allocr){
/*.data = */ MEASURE_BASE_ADDR,
/*.size = */ MEASURE_MAX_SIZE,
/*.alignment = */ alignment,
@@ -280,16 +280,16 @@ struct ggml_allocator * ggml_allocator_new_measure(size_t alignment) {
#endif
};
ggml_allocator_reset(alloc);
ggml_allocr_reset(alloc);
return alloc;
}
void ggml_allocator_free(struct ggml_allocator * alloc) {
void ggml_allocr_free(struct ggml_allocr * alloc) {
free(alloc);
}
bool ggml_allocator_is_measure(struct ggml_allocator * alloc) {
bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
return alloc->measure;
}
@@ -364,7 +364,7 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
}
}
static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * node) {
static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
struct hash_node * ht = alloc->hash_table;
if (node->data == NULL) {
if (ggml_is_view(node)) {
@@ -388,41 +388,43 @@ static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * no
}
} else {
// see if we can reuse a parent's buffer (inplace)
for (int i = 0; i < GGML_MAX_SRC; i++) {
struct ggml_tensor * parent = node->src[i];
if (parent == NULL) {
break;
}
struct hash_node * p_hn = hash_get(ht, parent);
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent) && ggml_op_can_inplace(node->op)) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = get_view_source(parent);
struct hash_node * view_src_hn = hash_get(ht, view_src);
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
// the parent's data that it will need later (same layout requirement). the problem is that then
// we cannot free the tensor because the original address of the allocation is lost.
// adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
// for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
node->data = parent->data;
return;
if (ggml_op_can_inplace(node->op)) {
for (int i = 0; i < GGML_MAX_SRC; i++) {
struct ggml_tensor * parent = node->src[i];
if (parent == NULL) {
break;
}
struct hash_node * p_hn = hash_get(ht, parent);
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = get_view_source(parent);
struct hash_node * view_src_hn = hash_get(ht, view_src);
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
// the parent's data that it will need later (same layout requirement). the problem is that then
// we cannot free the tensor because the original address of the allocation is lost.
// adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
// for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
node->data = parent->data;
return;
}
}
else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
node->data = parent->data;
}
return;
}
else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
node->data = parent->data;
}
return;
}
}
ggml_allocator_alloc_tensor(alloc, node);
ggml_allocr_alloc(alloc, node);
}
}
}
static size_t ggml_allocator_alloc_graph_tensors_n(
struct ggml_allocator * alloc,
struct ggml_allocr * alloc,
struct ggml_cgraph ** graphs, int n_graphs,
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -455,7 +457,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
for (int g = 0; g < n_graphs; g++) {
struct ggml_cgraph * gf = graphs[g];
AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
// graph inputs are allocated first to ensure that they are never overwritten
// graph inputs are allocated first to ensure that they are not overwritten by each other
if (inputs != NULL && inputs[g] != NULL) {
for (int i = 0; inputs[g][i] != NULL; i++) {
struct ggml_tensor * input = inputs[g][i];
@@ -534,6 +536,6 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
return alloc->max_size;
}
size_t ggml_allocator_alloc_graph_tensors(struct ggml_allocator * alloc, struct ggml_cgraph * graph) {
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
}

View file

@@ -7,13 +7,14 @@ extern "C" {
#endif
GGML_API struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alignment);
GGML_API struct ggml_allocator * ggml_allocator_new_measure(size_t alignment);
GGML_API void ggml_allocator_free(struct ggml_allocator * alloc);
GGML_API bool ggml_allocator_is_measure(struct ggml_allocator * alloc);
GGML_API void ggml_allocator_reset(struct ggml_allocator * alloc);
GGML_API void ggml_allocator_alloc_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor);
GGML_API size_t ggml_allocator_alloc_graph_tensors(struct ggml_allocator * alloc, struct ggml_cgraph * graph);
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
#ifdef __cplusplus
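
The llama.cpp changes in the next file use one pattern for graph inputs: allocate the tensor through the allocator, then write into it only when the allocator is not in measure mode. A minimal sketch of that pattern, assuming the header above is ggml-alloc.h; the helper name and the int32_t token buffer are placeholders, not part of this commit:

#include <stdint.h>
#include <string.h>
#include "ggml.h"
#include "ggml-alloc.h"   // assumed header name for the declarations above

// hypothetical helper mirroring the inp_tokens handling in llama_build_graph
static struct ggml_tensor * alloc_input_tokens(struct ggml_context * ctx0, struct ggml_allocr * alloc,
                                               const int32_t * tokens, int N) {
    struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    ggml_allocr_alloc(alloc, inp_tokens);       // reserves space, or only records the size in measure mode
    if (!ggml_allocr_is_measure(alloc)) {
        // write through data only when a real buffer backs the allocation
        memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
    }
    return inp_tokens;
}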

View file

@@ -344,7 +344,7 @@ struct llama_context {
#endif
#ifdef LLAMA_USE_ALLOCATOR
if (alloc) {
ggml_allocator_free(alloc);
ggml_allocr_free(alloc);
}
#endif
}
@@ -389,7 +389,7 @@ struct llama_context {
#ifdef LLAMA_USE_ALLOCATOR
llama_ctx_buffer buf_alloc;
ggml_allocator * alloc = NULL;
ggml_allocr * alloc = NULL;
#endif
#ifdef LLAMA_USE_SCRATCH
@@ -1431,10 +1431,6 @@ static struct ggml_cgraph * llama_build_graph(
};
#ifdef LLAMA_USE_ALLOCATOR
# define ggml_rope_custom_inplace ggml_rope_custom
# define ggml_scale_inplace ggml_scale
# define ggml_diag_mask_inf_inplace ggml_diag_mask_inf
# define ggml_soft_max_inplace ggml_soft_max
params.no_alloc = true;
#endif
@@ -1449,8 +1445,8 @@ static struct ggml_cgraph * llama_build_graph(
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocator_alloc_tensor(lctx.alloc, inp_tokens);
if (!ggml_allocator_is_measure(lctx.alloc)) {
ggml_allocr_alloc(lctx.alloc, inp_tokens);
if (!ggml_allocr_is_measure(lctx.alloc)) {
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
}
#else
@@ -1467,8 +1463,8 @@ static struct ggml_cgraph * llama_build_graph(
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocator_alloc_tensor(lctx.alloc, inpL);
if (!ggml_allocator_is_measure(lctx.alloc)) {
ggml_allocr_alloc(lctx.alloc, inpL);
if (!ggml_allocr_is_measure(lctx.alloc)) {
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
}
#else
@@ -1502,8 +1498,8 @@ static struct ggml_cgraph * llama_build_graph(
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocator_alloc_tensor(lctx.alloc, KQ_scale);
if (!ggml_allocator_is_measure(lctx.alloc)) {
ggml_allocr_alloc(lctx.alloc, KQ_scale);
if (!ggml_allocr_is_measure(lctx.alloc)) {
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
}
#else
@@ -1760,13 +1756,6 @@ static struct ggml_cgraph * llama_build_graph(
ggml_free(ctx0);
return gf;
#ifdef LLAMA_USE_ALLOCATOR
# undef ggml_rope_custom
# undef ggml_scale
# undef ggml_diag_mask_inf
# undef ggml_soft_max
#endif
}
// evaluate the transformer
@@ -1808,13 +1797,13 @@
const int64_t n_vocab = hparams.n_vocab;
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocator_reset(lctx.alloc);
ggml_allocr_reset(lctx.alloc);
#endif
ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocator_alloc_graph_tensors(lctx.alloc, gf);
ggml_allocr_alloc_graph(lctx.alloc, gf);
#endif
// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -3282,7 +3271,7 @@ struct llama_context * llama_new_context_with_model(
ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
// create measure allocator
ctx->alloc = ggml_allocator_new_measure(tensor_alignment);
ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
// build worst-case graph
int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
@@ -3291,7 +3280,7 @@
ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
// measure memory requirements for the graph
size_t alloc_size = ggml_allocator_alloc_graph_tensors(ctx->alloc, gf) + tensor_alignment;
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
@@ -3303,10 +3292,10 @@ struct llama_context * llama_new_context_with_model(
//fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
// recreate allocator with exact memory requirements
ggml_allocator_free(ctx->alloc);
ggml_allocr_free(ctx->alloc);
ctx->buf_alloc.resize(alloc_size);
ctx->alloc = ggml_allocator_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
}
#else
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
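
Taken together, the llama.cpp changes above drive the renamed allocator through a measure-then-allocate lifecycle. A minimal sketch of that flow, assuming the allocator header is ggml-alloc.h; build_worst_case_graph, build_graph, the malloc'd buffer and the alignment constant are placeholders standing in for llama_build_graph and the context's buf_alloc handling:

#include <stdlib.h>
#include "ggml.h"
#include "ggml-alloc.h"   // assumed header name

// placeholder graph builders; in llama.cpp both roles are played by llama_build_graph
struct ggml_cgraph * build_worst_case_graph(void);
struct ggml_cgraph * build_graph(void);

void example_allocr_lifecycle(void) {
    const size_t tensor_alignment = 32;                        // placeholder alignment value

    // 1. measure: a virtual allocator records the peak size needed by a worst-case graph
    struct ggml_allocr * alloc = ggml_allocr_new_measure(tensor_alignment);
    struct ggml_cgraph * gf = build_worst_case_graph();
    size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf) + tensor_alignment;

    // 2. recreate the allocator on a real buffer of exactly the measured size
    ggml_allocr_free(alloc);
    void * buf = malloc(alloc_size);
    alloc = ggml_allocr_new(buf, alloc_size, tensor_alignment);

    // 3. per evaluation: reset the free list, rebuild the graph, allocate its tensors
    ggml_allocr_reset(alloc);
    gf = build_graph();
    ggml_allocr_alloc_graph(alloc, gf);

    // cleanup
    ggml_allocr_free(alloc);
    free(buf);
}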