diff --git a/ggml-alloc.c b/ggml-alloc.c
index d7c7978e4..5e1be61ff 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -58,7 +58,7 @@ struct free_block {
 
 #define MAX_FREE_BLOCKS 128
 
-struct ggml_allocator {
+struct ggml_allocr {
     void * data;
     size_t size;
     size_t alignment;
@@ -97,13 +97,13 @@ static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_t
 
 #endif
 
-static size_t ggml_allocator_get_alloc_size(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     return ggml_nbytes(tensor);
 
     UNUSED(alloc);
 }
 
-void ggml_allocator_alloc_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
@@ -163,7 +163,7 @@ void ggml_allocator_alloc_tensor(struct ggml_allocator * alloc, struct ggml_tens
 }
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_allocator_free_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     void * ptr = tensor->data;
 
     if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
@@ -229,17 +229,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocator * alloc, struct ggm
     alloc->n_free_blocks++;
 }
 
-void ggml_allocator_reset(struct ggml_allocator * alloc) {
+void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
     alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
     alloc->free_blocks[0].size = alloc->size - align_offset;
 }
 
-struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alignment) {
-    struct ggml_allocator * alloc = (struct ggml_allocator *)malloc(sizeof(struct ggml_allocator) /* + n_free_blocks * sizeof(struct free_block) */);
+struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
-    *alloc = (struct ggml_allocator){
+    *alloc = (struct ggml_allocr){
         /*.data = */ data,
         /*.size = */ size,
         /*.alignment = */ alignment,
@@ -253,7 +253,7 @@ struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alig
 #endif
     };
 
-    ggml_allocator_reset(alloc);
+    ggml_allocr_reset(alloc);
 
     return alloc;
 }
@@ -263,10 +263,10 @@ struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alig
 static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
 static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
 
-struct ggml_allocator * ggml_allocator_new_measure(size_t alignment) {
-    struct ggml_allocator * alloc = (struct ggml_allocator *)malloc(sizeof(struct ggml_allocator) /* + n_free_blocks * sizeof(struct free_block) */);
+struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
-    *alloc = (struct ggml_allocator){
+    *alloc = (struct ggml_allocr){
         /*.data = */ MEASURE_BASE_ADDR,
         /*.size = */ MEASURE_MAX_SIZE,
         /*.alignment = */ alignment,
@@ -280,16 +280,16 @@ struct ggml_allocator * ggml_allocator_new_measure(size_t alignment) {
 #endif
     };
 
-    ggml_allocator_reset(alloc);
+    ggml_allocr_reset(alloc);
 
     return alloc;
 }
 
-void ggml_allocator_free(struct ggml_allocator * alloc) {
+void ggml_allocr_free(struct ggml_allocr * alloc) {
     free(alloc);
 }
 
-bool ggml_allocator_is_measure(struct ggml_allocator * alloc) {
+bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
     return alloc->measure;
 }
 
@@ -364,7 +364,7 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
-static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * node) {
+static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
@@ -388,41 +388,43 @@ static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * no
             }
         } else {
             // see if we can reuse a parent's buffer (inplace)
-            for (int i = 0; i < GGML_MAX_SRC; i++) {
-                struct ggml_tensor * parent = node->src[i];
-                if (parent == NULL) {
-                    break;
-                }
-                struct hash_node * p_hn = hash_get(ht, parent);
-                if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent) && ggml_op_can_inplace(node->op)) {
-                    if (ggml_is_view(parent)) {
-                        struct ggml_tensor * view_src = get_view_source(parent);
-                        struct hash_node * view_src_hn = hash_get(ht, view_src);
-                        if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
-                            // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
-                            // the parent's data that it will need later (same layout requirement). the problem is that then
-                            // we cannot free the tensor because the original address of the allocation is lost.
-                            // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
-                            // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
-                            AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                            node->data = parent->data;
-                            return;
+            if (ggml_op_can_inplace(node->op)) {
+                for (int i = 0; i < GGML_MAX_SRC; i++) {
+                    struct ggml_tensor * parent = node->src[i];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    struct hash_node * p_hn = hash_get(ht, parent);
+                    if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
+                        if (ggml_is_view(parent)) {
+                            struct ggml_tensor * view_src = get_view_source(parent);
+                            struct hash_node * view_src_hn = hash_get(ht, view_src);
+                            if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
+                                // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
+                                // the parent's data that it will need later (same layout requirement). the problem is that then
+                                // we cannot free the tensor because the original address of the allocation is lost.
+                                // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
+                                // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
+                                AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
+                                node->data = parent->data;
+                                return;
+                            }
                         }
+                        else {
+                            AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
+                            node->data = parent->data;
+                        }
+                        return;
                     }
-                    else {
-                        AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
-                        node->data = parent->data;
-                    }
-                    return;
                 }
             }
-            ggml_allocator_alloc_tensor(alloc, node);
+            ggml_allocr_alloc(alloc, node);
         }
     }
 }
 
 static size_t ggml_allocator_alloc_graph_tensors_n(
-    struct ggml_allocator * alloc,
+    struct ggml_allocr * alloc,
     struct ggml_cgraph ** graphs, int n_graphs,
     struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
 
@@ -455,7 +457,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
     for (int g = 0; g < n_graphs; g++) {
         struct ggml_cgraph * gf = graphs[g];
         AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
-        // graph inputs are allocated first to ensure that they are never overwritten
+        // graph inputs are allocated first to ensure that they are not overwritten by each other
        if (inputs != NULL && inputs[g] != NULL) {
            for (int i = 0; inputs[g][i] != NULL; i++) {
                struct ggml_tensor * input = inputs[g][i];
@@ -534,6 +536,6 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
     return alloc->max_size;
 }
 
-size_t ggml_allocator_alloc_graph_tensors(struct ggml_allocator * alloc, struct ggml_cgraph * graph) {
+size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
     return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
 }
diff --git a/ggml-alloc.h b/ggml-alloc.h
index 716d74642..a5ec8f87a 100644
--- a/ggml-alloc.h
+++ b/ggml-alloc.h
@@ -7,13 +7,14 @@ extern "C" {
 #endif
 
 
-GGML_API struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alignment);
-GGML_API struct ggml_allocator * ggml_allocator_new_measure(size_t alignment);
-GGML_API void ggml_allocator_free(struct ggml_allocator * alloc);
-GGML_API bool ggml_allocator_is_measure(struct ggml_allocator * alloc);
-GGML_API void ggml_allocator_reset(struct ggml_allocator * alloc);
-GGML_API void ggml_allocator_alloc_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor);
-GGML_API size_t ggml_allocator_alloc_graph_tensors(struct ggml_allocator * alloc, struct ggml_cgraph * graph);
+GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
+GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
+
+GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
+GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
+GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
+GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
+GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
 
 
 #ifdef __cplusplus
diff --git a/llama.cpp b/llama.cpp
index 6c57a2be9..e3e10fb73 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -344,7 +344,7 @@ struct llama_context {
 #endif
 #ifdef LLAMA_USE_ALLOCATOR
         if (alloc) {
-            ggml_allocator_free(alloc);
+            ggml_allocr_free(alloc);
         }
 #endif
     }
@@ -389,7 +389,7 @@ struct llama_context {
 
 #ifdef LLAMA_USE_ALLOCATOR
     llama_ctx_buffer buf_alloc;
-    ggml_allocator * alloc = NULL;
+    ggml_allocr * alloc = NULL;
 #endif
 
 #ifdef LLAMA_USE_SCRATCH
@@ -1431,10 +1431,6 @@ static struct ggml_cgraph * llama_build_graph(
     };
 
 #ifdef LLAMA_USE_ALLOCATOR
-# define ggml_rope_custom_inplace ggml_rope_custom
-# define ggml_scale_inplace ggml_scale
-# define ggml_diag_mask_inf_inplace ggml_diag_mask_inf
-# define ggml_soft_max_inplace ggml_soft_max
     params.no_alloc = true;
 #endif
 
@@ -1449,8 +1445,8 @@ static struct ggml_cgraph * llama_build_graph(
         struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
 
 #ifdef LLAMA_USE_ALLOCATOR
-        ggml_allocator_alloc_tensor(lctx.alloc, inp_tokens);
-        if (!ggml_allocator_is_measure(lctx.alloc)) {
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
             memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
         }
 #else
@@ -1467,8 +1463,8 @@ static struct ggml_cgraph * llama_build_graph(
         inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
 
 #ifdef LLAMA_USE_ALLOCATOR
-        ggml_allocator_alloc_tensor(lctx.alloc, inpL);
-        if (!ggml_allocator_is_measure(lctx.alloc)) {
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
             memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
         }
 #else
@@ -1502,8 +1498,8 @@ static struct ggml_cgraph * llama_build_graph(
     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
 
 #ifdef LLAMA_USE_ALLOCATOR
-    ggml_allocator_alloc_tensor(lctx.alloc, KQ_scale);
-    if (!ggml_allocator_is_measure(lctx.alloc)) {
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
         ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
     }
 #else
@@ -1760,13 +1756,6 @@ static struct ggml_cgraph * llama_build_graph(
     ggml_free(ctx0);
 
     return gf;
-
-#ifdef LLAMA_USE_ALLOCATOR
-# undef ggml_rope_custom
-# undef ggml_scale
-# undef ggml_diag_mask_inf
-# undef ggml_soft_max
-#endif
 }
 
 // evaluate the transformer
@@ -1808,13 +1797,13 @@ static bool llama_eval_internal(
     const int64_t n_vocab = hparams.n_vocab;
 
 #ifdef LLAMA_USE_ALLOCATOR
-    ggml_allocator_reset(lctx.alloc);
+    ggml_allocr_reset(lctx.alloc);
 #endif
 
     ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
 
 #ifdef LLAMA_USE_ALLOCATOR
-    ggml_allocator_alloc_graph_tensors(lctx.alloc, gf);
+    ggml_allocr_alloc_graph(lctx.alloc, gf);
 #endif
 
     // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -3282,7 +3271,7 @@ struct llama_context * llama_new_context_with_model(
         ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
 
         // create measure allocator
-        ctx->alloc = ggml_allocator_new_measure(tensor_alignment);
+        ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
 
         // build worst-case graph
         int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
@@ -3291,7 +3280,7 @@ struct llama_context * llama_new_context_with_model(
         ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
 
         // measure memory requirements for the graph
-        size_t alloc_size = ggml_allocator_alloc_graph_tensors(ctx->alloc, gf) + tensor_alignment;
+        size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
         fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
 
@@ -3303,10 +3292,10 @@ struct llama_context * llama_new_context_with_model(
         //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
 
         // recreate allocator with exact memory requirements
-        ggml_allocator_free(ctx->alloc);
+        ggml_allocr_free(ctx->alloc);
 
         ctx->buf_alloc.resize(alloc_size);
-        ctx->alloc = ggml_allocator_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+        ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
     }
 #else
     ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
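Usage sketch (not part of the patch): the renamed ggml_allocr API keeps the measure-then-allocate flow used in llama_new_context_with_model() above. build_graph() below is a hypothetical stand-in for a graph builder such as llama_build_graph() that constructs the graph with no_alloc enabled and places its inputs via ggml_allocr_alloc(); the alignment value is likewise only an assumption.

    #include <stdlib.h>
    #include "ggml.h"
    #include "ggml-alloc.h"

    // hypothetical graph builder, not part of ggml; builds the graph with no_alloc = true
    struct ggml_cgraph * build_graph(struct ggml_allocr * alloc);

    static void example(void) {
        const size_t tensor_alignment = 32; // assumed alignment

        // 1) measure: tensors only get placeholder addresses, the required size is tracked
        struct ggml_allocr * alloc = ggml_allocr_new_measure(tensor_alignment);
        struct ggml_cgraph * gf = build_graph(alloc);
        size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf) + tensor_alignment;
        ggml_allocr_free(alloc);

        // 2) recreate the allocator on top of a real buffer of the measured size
        void * buf = malloc(alloc_size);
        alloc = ggml_allocr_new(buf, alloc_size, tensor_alignment);

        // 3) per evaluation: reset, rebuild the graph, then assign real addresses;
        //    inputs placed with ggml_allocr_alloc() are written to only when
        //    ggml_allocr_is_measure() returns false
        ggml_allocr_reset(alloc);
        gf = build_graph(alloc);
        ggml_allocr_alloc_graph(alloc, gf);

        ggml_allocr_free(alloc);
        free(buf);
    }

The sketch mirrors the two phases the patch uses in llama.cpp: a measure allocator sized against a worst-case graph, then an exact-size allocator that is reset and reused on every call to llama_eval_internal().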