rename ggml_allocator to ggml_allocr

cleanup

ggml-ci
slaren 2023-07-29 15:01:43 +02:00
parent cd4a8cd28c
commit 570aa7ceeb
3 changed files with 68 additions and 76 deletions
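
In short, only the public allocator interface is renamed (the internal static helpers keep their ggml_allocator_ prefix and merely change their parameter type). The mapping, taken from the header change in this commit, is:

// struct ggml_allocator is now struct ggml_allocr
GGML_API struct ggml_allocr * ggml_allocr_new        (void * data, size_t size, size_t alignment);              // was ggml_allocator_new
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);                                        // was ggml_allocator_new_measure
GGML_API void                 ggml_allocr_free       (struct ggml_allocr * alloc);                              // was ggml_allocator_free
GGML_API bool                 ggml_allocr_is_measure (struct ggml_allocr * alloc);                              // was ggml_allocator_is_measure
GGML_API void                 ggml_allocr_reset      (struct ggml_allocr * alloc);                              // was ggml_allocator_reset
GGML_API void                 ggml_allocr_alloc      (struct ggml_allocr * alloc, struct ggml_tensor * tensor); // was ggml_allocator_alloc_tensor
GGML_API size_t               ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);  // was ggml_allocator_alloc_graph_tensors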

View file

@@ -58,7 +58,7 @@ struct free_block {
#define MAX_FREE_BLOCKS 128
struct ggml_allocator {
struct ggml_allocr {
void * data;
size_t size;
size_t alignment;
@@ -97,13 +97,13 @@ static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_t
#endif
static size_t ggml_allocator_get_alloc_size(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
return ggml_nbytes(tensor);
UNUSED(alloc);
}
void ggml_allocator_alloc_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment);
@@ -163,7 +163,7 @@ void ggml_allocator_alloc_tensor(struct ggml_allocator * alloc, struct ggml_tens
}
// this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_allocator_free_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
void * ptr = tensor->data;
if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
@@ -229,17 +229,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocator * alloc, struct ggm
alloc->n_free_blocks++;
}
void ggml_allocator_reset(struct ggml_allocator * alloc) {
void ggml_allocr_reset(struct ggml_allocr * alloc) {
alloc->n_free_blocks = 1;
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
alloc->free_blocks[0].size = alloc->size - align_offset;
}
struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alignment) {
struct ggml_allocator * alloc = (struct ggml_allocator *)malloc(sizeof(struct ggml_allocator) /* + n_free_blocks * sizeof(struct free_block) */);
struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
*alloc = (struct ggml_allocator){
*alloc = (struct ggml_allocr){
/*.data = */ data,
/*.size = */ size,
/*.alignment = */ alignment,
@@ -253,7 +253,7 @@ struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alig
#endif
};
ggml_allocator_reset(alloc);
ggml_allocr_reset(alloc);
return alloc;
}
@@ -263,10 +263,10 @@ struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alig
static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
struct ggml_allocator * ggml_allocator_new_measure(size_t alignment) {
struct ggml_allocator * alloc = (struct ggml_allocator *)malloc(sizeof(struct ggml_allocator) /* + n_free_blocks * sizeof(struct free_block) */);
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
*alloc = (struct ggml_allocator){
*alloc = (struct ggml_allocr){
/*.data = */ MEASURE_BASE_ADDR,
/*.size = */ MEASURE_MAX_SIZE,
/*.alignment = */ alignment,
@@ -280,16 +280,16 @@ struct ggml_allocator * ggml_allocator_new_measure(size_t alignment) {
#endif
};
ggml_allocator_reset(alloc);
ggml_allocr_reset(alloc);
return alloc;
}
void ggml_allocator_free(struct ggml_allocator * alloc) {
void ggml_allocr_free(struct ggml_allocr * alloc) {
free(alloc);
}
bool ggml_allocator_is_measure(struct ggml_allocator * alloc) {
bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
return alloc->measure;
}
@@ -364,7 +364,7 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
}
}
static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * node) {
static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
struct hash_node * ht = alloc->hash_table;
if (node->data == NULL) {
if (ggml_is_view(node)) {
@@ -388,41 +388,43 @@ static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * no
}
} else {
// see if we can reuse a parent's buffer (inplace)
for (int i = 0; i < GGML_MAX_SRC; i++) {
struct ggml_tensor * parent = node->src[i];
if (parent == NULL) {
break;
}
struct hash_node * p_hn = hash_get(ht, parent);
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent) && ggml_op_can_inplace(node->op)) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = get_view_source(parent);
struct hash_node * view_src_hn = hash_get(ht, view_src);
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
// the parent's data that it will need later (same layout requirement). the problem is that then
// we cannot free the tensor because the original address of the allocation is lost.
// adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
// for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
node->data = parent->data;
return;
if (ggml_op_can_inplace(node->op)) {
for (int i = 0; i < GGML_MAX_SRC; i++) {
struct ggml_tensor * parent = node->src[i];
if (parent == NULL) {
break;
}
struct hash_node * p_hn = hash_get(ht, parent);
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = get_view_source(parent);
struct hash_node * view_src_hn = hash_get(ht, view_src);
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
// the parent's data that it will need later (same layout requirement). the problem is that then
// we cannot free the tensor because the original address of the allocation is lost.
// adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
// for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
node->data = parent->data;
return;
}
}
else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
node->data = parent->data;
}
return;
}
else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
node->data = parent->data;
}
return;
}
}
ggml_allocator_alloc_tensor(alloc, node);
ggml_allocr_alloc(alloc, node);
}
}
}
static size_t ggml_allocator_alloc_graph_tensors_n(
struct ggml_allocator * alloc,
struct ggml_allocr * alloc,
struct ggml_cgraph ** graphs, int n_graphs,
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -455,7 +457,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
for (int g = 0; g < n_graphs; g++) {
struct ggml_cgraph * gf = graphs[g];
AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
// graph inputs are allocated first to ensure that they are never overwritten
// graph inputs are allocated first to ensure that they are not overwritten by each other
if (inputs != NULL && inputs[g] != NULL) {
for (int i = 0; inputs[g][i] != NULL; i++) {
struct ggml_tensor * input = inputs[g][i];
@@ -534,6 +536,6 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
return alloc->max_size;
}
size_t ggml_allocator_alloc_graph_tensors(struct ggml_allocator * alloc, struct ggml_cgraph * graph) {
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
}

View file

@@ -7,13 +7,14 @@ extern "C" {
#endif
GGML_API struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alignment);
GGML_API struct ggml_allocator * ggml_allocator_new_measure(size_t alignment);
GGML_API void ggml_allocator_free(struct ggml_allocator * alloc);
GGML_API bool ggml_allocator_is_measure(struct ggml_allocator * alloc);
GGML_API void ggml_allocator_reset(struct ggml_allocator * alloc);
GGML_API void ggml_allocator_alloc_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor);
GGML_API size_t ggml_allocator_alloc_graph_tensors(struct ggml_allocator * alloc, struct ggml_cgraph * graph);
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
#ifdef __cplusplus
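
The llama.cpp changes in the next file use one pattern for graph inputs: allocate the tensor through the allocator, then write into it only when the allocator is not in measure mode. A minimal sketch of that pattern, assuming the header above is ggml-alloc.h; the helper name and the int32_t token buffer are placeholders, not part of this commit:

#include <stdint.h>
#include <string.h>
#include "ggml.h"
#include "ggml-alloc.h"   // assumed header name for the declarations above

// hypothetical helper mirroring the inp_tokens handling in llama_build_graph
static struct ggml_tensor * alloc_input_tokens(struct ggml_context * ctx0, struct ggml_allocr * alloc,
                                               const int32_t * tokens, int N) {
    struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    ggml_allocr_alloc(alloc, inp_tokens);       // reserves space, or only records the size in measure mode
    if (!ggml_allocr_is_measure(alloc)) {
        // write through data only when a real buffer backs the allocation
        memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
    }
    return inp_tokens;
}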

View file

@@ -344,7 +344,7 @@ struct llama_context {
#endif
#ifdef LLAMA_USE_ALLOCATOR
if (alloc) {
ggml_allocator_free(alloc);
ggml_allocr_free(alloc);
}
#endif
}
@@ -389,7 +389,7 @@ struct llama_context {
#ifdef LLAMA_USE_ALLOCATOR
llama_ctx_buffer buf_alloc;
ggml_allocator * alloc = NULL;
ggml_allocr * alloc = NULL;
#endif
#ifdef LLAMA_USE_SCRATCH
@@ -1431,10 +1431,6 @@ static struct ggml_cgraph * llama_build_graph(
};
#ifdef LLAMA_USE_ALLOCATOR
# define ggml_rope_custom_inplace ggml_rope_custom
# define ggml_scale_inplace ggml_scale
# define ggml_diag_mask_inf_inplace ggml_diag_mask_inf
# define ggml_soft_max_inplace ggml_soft_max
params.no_alloc = true;
#endif
@@ -1449,8 +1445,8 @@ static struct ggml_cgraph * llama_build_graph(
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocator_alloc_tensor(lctx.alloc, inp_tokens);
if (!ggml_allocator_is_measure(lctx.alloc)) {
ggml_allocr_alloc(lctx.alloc, inp_tokens);
if (!ggml_allocr_is_measure(lctx.alloc)) {
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
}
#else
@@ -1467,8 +1463,8 @@ static struct ggml_cgraph * llama_build_graph(
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocator_alloc_tensor(lctx.alloc, inpL);
if (!ggml_allocator_is_measure(lctx.alloc)) {
ggml_allocr_alloc(lctx.alloc, inpL);
if (!ggml_allocr_is_measure(lctx.alloc)) {
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
}
#else
@@ -1502,8 +1498,8 @@ static struct ggml_cgraph * llama_build_graph(
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocator_alloc_tensor(lctx.alloc, KQ_scale);
if (!ggml_allocator_is_measure(lctx.alloc)) {
ggml_allocr_alloc(lctx.alloc, KQ_scale);
if (!ggml_allocr_is_measure(lctx.alloc)) {
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
}
#else
@@ -1760,13 +1756,6 @@ static struct ggml_cgraph * llama_build_graph(
ggml_free(ctx0);
return gf;
#ifdef LLAMA_USE_ALLOCATOR
# undef ggml_rope_custom
# undef ggml_scale
# undef ggml_diag_mask_inf
# undef ggml_soft_max
#endif
}
// evaluate the transformer
@@ -1808,13 +1797,13 @@
const int64_t n_vocab = hparams.n_vocab;
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocator_reset(lctx.alloc);
ggml_allocr_reset(lctx.alloc);
#endif
ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
#ifdef LLAMA_USE_ALLOCATOR
ggml_allocator_alloc_graph_tensors(lctx.alloc, gf);
ggml_allocr_alloc_graph(lctx.alloc, gf);
#endif
// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -3282,7 +3271,7 @@ struct llama_context * llama_new_context_with_model(
ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
// create measure allocator
ctx->alloc = ggml_allocator_new_measure(tensor_alignment);
ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
// build worst-case graph
int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
@@ -3291,7 +3280,7 @@
ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
// measure memory requirements for the graph
size_t alloc_size = ggml_allocator_alloc_graph_tensors(ctx->alloc, gf) + tensor_alignment;
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
@@ -3303,10 +3292,10 @@ struct llama_context * llama_new_context_with_model(
//fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
// recreate allocator with exact memory requirements
ggml_allocator_free(ctx->alloc);
ggml_allocr_free(ctx->alloc);
ctx->buf_alloc.resize(alloc_size);
ctx->alloc = ggml_allocator_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
}
#else
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
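
Taken together, the llama.cpp changes above drive the renamed allocator through a measure-then-allocate lifecycle. A minimal sketch of that flow, assuming the allocator header is ggml-alloc.h; build_worst_case_graph, build_graph, the malloc'd buffer and the alignment constant are placeholders standing in for llama_build_graph and the context's buf_alloc handling:

#include <stdlib.h>
#include "ggml.h"
#include "ggml-alloc.h"   // assumed header name

// placeholder graph builders; in llama.cpp both roles are played by llama_build_graph
struct ggml_cgraph * build_worst_case_graph(void);
struct ggml_cgraph * build_graph(void);

void example_allocr_lifecycle(void) {
    const size_t tensor_alignment = 32;                        // placeholder alignment value

    // 1. measure: a virtual allocator records the peak size needed by a worst-case graph
    struct ggml_allocr * alloc = ggml_allocr_new_measure(tensor_alignment);
    struct ggml_cgraph * gf = build_worst_case_graph();
    size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf) + tensor_alignment;

    // 2. recreate the allocator on a real buffer of exactly the measured size
    ggml_allocr_free(alloc);
    void * buf = malloc(alloc_size);
    alloc = ggml_allocr_new(buf, alloc_size, tensor_alignment);

    // 3. per evaluation: reset the free list, rebuild the graph, allocate its tensors
    ggml_allocr_reset(alloc);
    gf = build_graph();
    ggml_allocr_alloc_graph(alloc, gf);

    // cleanup
    ggml_allocr_free(alloc);
    free(buf);
}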