ggml : allocate graphs in a context (#2392)

* ggml : graph allocation in contexts

* allocate work buffer as a ggml_object in ggml_graph_compute_with_ctx

* llama.cpp : allocate graph in the context

* add GGML_PAD

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
slaren 2023-07-26 15:56:53 +02:00 committed by GitHub
parent eb542d3932
commit 5488fb789e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 136 additions and 90 deletions

175
ggml.c
View file

@ -4071,8 +4071,8 @@ bool ggml_is_numa(void) {
////////////////////////////////////////////////////////////////////////////////
void ggml_print_object(const struct ggml_object * obj) {
GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n",
obj->offs, obj->size, (const void *) obj->next);
GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
obj->type, obj->offs, obj->size, (const void *) obj->next);
}
void ggml_print_objects(const struct ggml_context * ctx) {
@ -4212,7 +4212,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
}
size_t ggml_tensor_overhead(void) {
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
}
bool ggml_is_transposed(const struct ggml_tensor * tensor) {
@ -4383,7 +4383,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
return NULL;
}
const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1);
const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
*ctx = (struct ggml_context) {
/*.mem_size =*/ mem_size,
@ -4472,12 +4472,14 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
struct ggml_object * obj = ctx->objects_begin;
while (obj != NULL) {
struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
if (obj->type == GGML_OBJECT_TENSOR) {
struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
const size_t size = ggml_nbytes(tensor);
const size_t size = ggml_nbytes(tensor);
if (max_size < size) {
max_size = size;
if (max_size < size) {
max_size = size;
}
}
obj = obj->next;
@ -4509,12 +4511,7 @@ static void ggml_scratch_load(struct ggml_context * ctx) {
////////////////////////////////////////////////////////////////////////////////
static struct ggml_tensor * ggml_new_tensor_impl(
struct ggml_context * ctx,
enum ggml_type type,
int n_dims,
const int64_t* ne,
void* data) {
static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
// always insert objects at the end of the context's memory pool
struct ggml_object * obj_cur = ctx->objects_end;
@ -4522,63 +4519,28 @@ static struct ggml_tensor * ggml_new_tensor_impl(
const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
const size_t cur_end = cur_offs + cur_size;
size_t size_needed = 0;
if (data == NULL && !ctx->no_alloc) {
size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
for (int i = 1; i < n_dims; i++) {
size_needed *= ne[i];
}
// align to GGML_MEM_ALIGN
size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN;
}
// align to GGML_MEM_ALIGN
size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
char * const mem_buffer = ctx->mem_buffer;
struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
if (ctx->scratch.data == NULL || data != NULL) {
size_needed += GGML_TENSOR_SIZE;
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
__func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
assert(false);
return NULL;
}
*obj_new = (struct ggml_object) {
.offs = cur_end + GGML_OBJECT_SIZE,
.size = size_needed,
.next = NULL,
};
} else {
if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
__func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
assert(false);
return NULL;
}
if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
__func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
assert(false);
return NULL;
}
data = (char * const) ctx->scratch.data + ctx->scratch.offs;
*obj_new = (struct ggml_object) {
.offs = cur_end + GGML_OBJECT_SIZE,
.size = GGML_TENSOR_SIZE,
.next = NULL,
};
//printf("scratch offs = %zu, size_needed = %zu\n", ctx->scratch.offs, size_needed);
ctx->scratch.offs += size_needed;
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
__func__, cur_end + size_needed, ctx->mem_size);
assert(false);
return NULL;
}
*obj_new = (struct ggml_object) {
.offs = cur_end + GGML_OBJECT_SIZE,
.size = size_needed,
.next = NULL,
.type = type,
};
ggml_assert_aligned(mem_buffer + obj_new->offs);
if (obj_cur != NULL) {
obj_cur->next = obj_new;
} else {
@ -4590,9 +4552,46 @@ static struct ggml_tensor * ggml_new_tensor_impl(
//printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs);
return obj_new;
}
ggml_assert_aligned(result);
static struct ggml_tensor * ggml_new_tensor_impl(
struct ggml_context * ctx,
enum ggml_type type,
int n_dims,
const int64_t* ne,
void* data) {
size_t data_size = 0;
if (data == NULL && !ctx->no_alloc) {
data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
for (int i = 1; i < n_dims; i++) {
data_size *= ne[i];
}
}
if (ctx->scratch.data != NULL && data == NULL) {
// allocate tensor data in the scratch buffer
if (ctx->scratch.offs + data_size > ctx->scratch.size) {
GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
__func__, ctx->scratch.offs + data_size, ctx->scratch.size);
assert(false);
return NULL;
}
data = (char * const) ctx->scratch.data + ctx->scratch.offs;
ctx->scratch.offs += data_size;
data_size = 0;
}
struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);
// TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
*result = (struct ggml_tensor) {
/*.type =*/ type,
@ -5026,9 +5025,11 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam
char * const mem_buffer = ctx->mem_buffer;
while (obj != NULL) {
struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
if (strcmp(cur->name, name) == 0) {
return cur;
if (obj->type == GGML_OBJECT_TENSOR) {
struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
if (strcmp(cur->name, name) == 0) {
return cur;
}
}
obj = obj->next;
@ -15829,6 +15830,35 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
return result;
}
struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
*cgraph = (struct ggml_cgraph) {
/*.n_nodes =*/ 0,
/*.n_leafs =*/ 0,
/*.nodes =*/ { NULL },
/*.grads =*/ { NULL },
/*.leafs =*/ { NULL },
/*.hash_table =*/ { NULL },
/*.perf_runs =*/ 0,
/*.perf_cycles =*/ 0,
/*.perf_time_us =*/ 0,
};
return cgraph;
}
struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
ggml_build_forward_impl(cgraph, tensor, false);
return cgraph;
}
size_t ggml_graph_overhead(void) {
return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
}
//
// thread data
//
@ -16544,10 +16574,9 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
GGML_ASSERT(buf);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
cplan.work_data = buf->data;
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
ggml_graph_compute(cgraph, &cplan);
}