rename ggml_allocator to ggml_allocr
cleanup ggml-ci
parent cd4a8cd28c
commit 570aa7ceeb
3 changed files with 68 additions and 76 deletions
ggml-alloc.c (90 changed lines)
@@ -58,7 +58,7 @@ struct free_block {
 
 #define MAX_FREE_BLOCKS 128
 
-struct ggml_allocator {
+struct ggml_allocr {
     void * data;
     size_t size;
     size_t alignment;
@@ -97,13 +97,13 @@ static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_t
 #endif
 
 
-static size_t ggml_allocator_get_alloc_size(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     return ggml_nbytes(tensor);
 
     UNUSED(alloc);
 }
 
-void ggml_allocator_alloc_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
@@ -163,7 +163,7 @@ void ggml_allocator_alloc_tensor(struct ggml_allocator * alloc, struct ggml_tens
 }
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_allocator_free_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     void * ptr = tensor->data;
 
     if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
@@ -229,17 +229,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocator * alloc, struct ggm
     alloc->n_free_blocks++;
 }
 
-void ggml_allocator_reset(struct ggml_allocator * alloc) {
+void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
     alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
     alloc->free_blocks[0].size = alloc->size - align_offset;
 }
 
-struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alignment) {
-    struct ggml_allocator * alloc = (struct ggml_allocator *)malloc(sizeof(struct ggml_allocator) /* + n_free_blocks * sizeof(struct free_block) */);
+struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
-    *alloc = (struct ggml_allocator){
+    *alloc = (struct ggml_allocr){
         /*.data      = */ data,
         /*.size      = */ size,
         /*.alignment = */ alignment,
@@ -253,7 +253,7 @@ struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alig
 #endif
     };
 
-    ggml_allocator_reset(alloc);
+    ggml_allocr_reset(alloc);
 
     return alloc;
 }
@@ -263,10 +263,10 @@ struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alig
 static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
 static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
 
-struct ggml_allocator * ggml_allocator_new_measure(size_t alignment) {
-    struct ggml_allocator * alloc = (struct ggml_allocator *)malloc(sizeof(struct ggml_allocator) /* + n_free_blocks * sizeof(struct free_block) */);
+struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
-    *alloc = (struct ggml_allocator){
+    *alloc = (struct ggml_allocr){
         /*.data      = */ MEASURE_BASE_ADDR,
         /*.size      = */ MEASURE_MAX_SIZE,
         /*.alignment = */ alignment,
@@ -280,16 +280,16 @@ struct ggml_allocator * ggml_allocator_new_measure(size_t alignment) {
 #endif
     };
 
-    ggml_allocator_reset(alloc);
+    ggml_allocr_reset(alloc);
 
     return alloc;
 }
 
-void ggml_allocator_free(struct ggml_allocator * alloc) {
+void ggml_allocr_free(struct ggml_allocr * alloc) {
     free(alloc);
 }
 
-bool ggml_allocator_is_measure(struct ggml_allocator * alloc) {
+bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
     return alloc->measure;
 }
 
@@ -364,7 +364,7 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
-static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * node) {
+static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
@@ -388,41 +388,43 @@ static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * no
             }
         } else {
             // see if we can reuse a parent's buffer (inplace)
-            for (int i = 0; i < GGML_MAX_SRC; i++) {
-                struct ggml_tensor * parent = node->src[i];
-                if (parent == NULL) {
-                    break;
-                }
-                struct hash_node * p_hn = hash_get(ht, parent);
-                if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent) && ggml_op_can_inplace(node->op)) {
-                    if (ggml_is_view(parent)) {
-                        struct ggml_tensor * view_src = get_view_source(parent);
-                        struct hash_node * view_src_hn = hash_get(ht, view_src);
-                        if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
-                            // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
-                            // the parent's data that it will need later (same layout requirement). the problem is that then
-                            // we cannot free the tensor because the original address of the allocation is lost.
-                            // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
-                            // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
-                            AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                            node->data = parent->data;
-                            return;
-                        }
-                    }
-                    else {
-                        AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
-                        node->data = parent->data;
-                    }
-                    return;
-                }
-            }
-            ggml_allocator_alloc_tensor(alloc, node);
+            if (ggml_op_can_inplace(node->op)) {
+                for (int i = 0; i < GGML_MAX_SRC; i++) {
+                    struct ggml_tensor * parent = node->src[i];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    struct hash_node * p_hn = hash_get(ht, parent);
+                    if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
+                        if (ggml_is_view(parent)) {
+                            struct ggml_tensor * view_src = get_view_source(parent);
+                            struct hash_node * view_src_hn = hash_get(ht, view_src);
+                            if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
+                                // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
+                                // the parent's data that it will need later (same layout requirement). the problem is that then
+                                // we cannot free the tensor because the original address of the allocation is lost.
+                                // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
+                                // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
+                                AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
+                                node->data = parent->data;
+                                return;
+                            }
+                        }
+                        else {
+                            AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
+                            node->data = parent->data;
+                        }
+                        return;
+                    }
+                }
+            }
+            ggml_allocr_alloc(alloc, node);
         }
     }
 }
 
 static size_t ggml_allocator_alloc_graph_tensors_n(
-    struct ggml_allocator * alloc,
+    struct ggml_allocr * alloc,
     struct ggml_cgraph ** graphs, int n_graphs,
     struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -455,7 +457,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
     for (int g = 0; g < n_graphs; g++) {
         struct ggml_cgraph * gf = graphs[g];
         AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
-        // graph inputs are allocated first to ensure that they are never overwritten
+        // graph inputs are allocated first to ensure that they are not overwritten by each other
         if (inputs != NULL && inputs[g] != NULL) {
            for (int i = 0; inputs[g][i] != NULL; i++) {
                struct ggml_tensor * input = inputs[g][i];
@@ -534,6 +536,6 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
     return alloc->max_size;
 }
 
-size_t ggml_allocator_alloc_graph_tensors(struct ggml_allocator * alloc, struct ggml_cgraph * graph) {
+size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
     return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
 }
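Note on the alignment math used above: aligned_offset(alloc->data, 0, alloc->alignment) in ggml_allocr_reset and aligned_offset(NULL, size, alloc->alignment) in ggml_allocr_alloc both round up to the allocator's alignment. A plausible helper consistent with those call sites follows; it is a sketch, not the body of the real static helper in ggml-alloc.c, which is outside this diff.

#include <stddef.h>
#include <stdint.h>

// sketch: smallest offset >= `offset` such that `buffer + offset` is aligned
// to `alignment` (assumed to be a power of two); matches how the call sites
// above use it, but is not copied from ggml-alloc.c
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
    size_t misalign = ((uintptr_t)buffer + offset) % alignment;
    if (misalign != 0) {
        offset += alignment - misalign;
    }
    return offset;
}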
ggml-alloc.h (15 changed lines)
@@ -7,13 +7,14 @@ extern "C" {
 #endif
 
 
-GGML_API struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alignment);
-GGML_API struct ggml_allocator * ggml_allocator_new_measure(size_t alignment);
-GGML_API void ggml_allocator_free(struct ggml_allocator * alloc);
-GGML_API bool ggml_allocator_is_measure(struct ggml_allocator * alloc);
-GGML_API void ggml_allocator_reset(struct ggml_allocator * alloc);
-GGML_API void ggml_allocator_alloc_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor);
-GGML_API size_t ggml_allocator_alloc_graph_tensors(struct ggml_allocator * alloc, struct ggml_cgraph * graph);
+GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
+GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
+
+GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
+GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
+GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
+GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
+GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
 
 
 #ifdef __cplusplus
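For context, a minimal usage sketch of the renamed API exactly as declared in the header above. The buffer size, alignment value, and tensor shape are arbitrary placeholders; only the ggml_allocr_* names come from this commit.

#include "ggml.h"
#include "ggml-alloc.h"
#include <stdlib.h>

int main(void) {
    // context with no_alloc so tensor data is placed by the allocator, not by ggml
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    // backing buffer managed by the allocator (size and alignment are arbitrary here)
    size_t buf_size = 1024*1024;
    void * buf = malloc(buf_size);
    struct ggml_allocr * alloc = ggml_allocr_new(buf, buf_size, /*alignment =*/ 32);

    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    ggml_allocr_alloc(alloc, t); // t->data now points into buf

    // ... use t ...

    ggml_allocr_free(alloc);
    ggml_free(ctx);
    free(buf);
    return 0;
}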
llama.cpp (39 changed lines)
@@ -344,7 +344,7 @@ struct llama_context {
 #endif
 #ifdef LLAMA_USE_ALLOCATOR
         if (alloc) {
-            ggml_allocator_free(alloc);
+            ggml_allocr_free(alloc);
         }
 #endif
     }
@@ -389,7 +389,7 @@ struct llama_context {
 
 #ifdef LLAMA_USE_ALLOCATOR
     llama_ctx_buffer buf_alloc;
-    ggml_allocator * alloc = NULL;
+    ggml_allocr * alloc = NULL;
 #endif
 
 #ifdef LLAMA_USE_SCRATCH
@@ -1431,10 +1431,6 @@ static struct ggml_cgraph * llama_build_graph(
     };
 
 #ifdef LLAMA_USE_ALLOCATOR
-# define ggml_rope_custom_inplace ggml_rope_custom
-# define ggml_scale_inplace ggml_scale
-# define ggml_diag_mask_inf_inplace ggml_diag_mask_inf
-# define ggml_soft_max_inplace ggml_soft_max
     params.no_alloc = true;
 #endif
 
@@ -1449,8 +1445,8 @@ static struct ggml_cgraph * llama_build_graph(
         struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
 
 #ifdef LLAMA_USE_ALLOCATOR
-        ggml_allocator_alloc_tensor(lctx.alloc, inp_tokens);
-        if (!ggml_allocator_is_measure(lctx.alloc)) {
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
             memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
         }
 #else
@@ -1467,8 +1463,8 @@ static struct ggml_cgraph * llama_build_graph(
         inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
 
 #ifdef LLAMA_USE_ALLOCATOR
-        ggml_allocator_alloc_tensor(lctx.alloc, inpL);
-        if (!ggml_allocator_is_measure(lctx.alloc)) {
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
             memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
         }
 #else
@@ -1502,8 +1498,8 @@ static struct ggml_cgraph * llama_build_graph(
 
     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
 #ifdef LLAMA_USE_ALLOCATOR
-    ggml_allocator_alloc_tensor(lctx.alloc, KQ_scale);
-    if (!ggml_allocator_is_measure(lctx.alloc)) {
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
         ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
     }
 #else
@@ -1760,13 +1756,6 @@ static struct ggml_cgraph * llama_build_graph(
     ggml_free(ctx0);
 
     return gf;
-
-#ifdef LLAMA_USE_ALLOCATOR
-# undef ggml_rope_custom
-# undef ggml_scale
-# undef ggml_diag_mask_inf
-# undef ggml_soft_max
-#endif
 }
 
 // evaluate the transformer
@@ -1808,13 +1797,13 @@
     const int64_t n_vocab = hparams.n_vocab;
 
 #ifdef LLAMA_USE_ALLOCATOR
-    ggml_allocator_reset(lctx.alloc);
+    ggml_allocr_reset(lctx.alloc);
 #endif
 
     ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
 
 #ifdef LLAMA_USE_ALLOCATOR
-    ggml_allocator_alloc_graph_tensors(lctx.alloc, gf);
+    ggml_allocr_alloc_graph(lctx.alloc, gf);
 #endif
 
     // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -3282,7 +3271,7 @@ struct llama_context * llama_new_context_with_model(
         ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
 
         // create measure allocator
-        ctx->alloc = ggml_allocator_new_measure(tensor_alignment);
+        ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
 
         // build worst-case graph
         int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
@@ -3291,7 +3280,7 @@ struct llama_context * llama_new_context_with_model(
         ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
 
         // measure memory requirements for the graph
-        size_t alloc_size = ggml_allocator_alloc_graph_tensors(ctx->alloc, gf) + tensor_alignment;
+        size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
         fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
 
@@ -3303,10 +3292,10 @@ struct llama_context * llama_new_context_with_model(
         //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
 
         // recreate allocator with exact memory requirements
-        ggml_allocator_free(ctx->alloc);
+        ggml_allocr_free(ctx->alloc);
 
         ctx->buf_alloc.resize(alloc_size);
-        ctx->alloc = ggml_allocator_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+        ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
     }
 #else
     ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
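The llama.cpp changes above follow a two-pass pattern: measure first, then allocate. A condensed sketch of that flow with the renamed calls follows; build_worst_case_graph() stands in for llama_build_graph(...) and the tensor_alignment value is illustrative, while every ggml_allocr_* call mirrors the code in this commit.

#include "ggml.h"
#include "ggml-alloc.h"
#include <stdlib.h>

// stand-in for llama_build_graph(...); assumed to build the worst-case graph
// in a ggml context that was initialized with no_alloc = true
extern struct ggml_cgraph * build_worst_case_graph(void);

static const size_t tensor_alignment = 32; // illustrative value

void compute_buffer_example(void) {
    // pass 1: the measure allocator only tracks offsets and sizes, no memory is written
    struct ggml_allocr * alloc = ggml_allocr_new_measure(tensor_alignment);
    struct ggml_cgraph * gf = build_worst_case_graph();
    size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf) + tensor_alignment;
    ggml_allocr_free(alloc);

    // pass 2: recreate the allocator over a real buffer of the measured size
    void * buf = malloc(alloc_size);
    alloc = ggml_allocr_new(buf, alloc_size, tensor_alignment);

    // per evaluation: reset, rebuild the graph, then place all graph tensors
    ggml_allocr_reset(alloc);
    gf = build_worst_case_graph();
    ggml_allocr_alloc_graph(alloc, gf);

    // ... the graph would be computed here ...

    ggml_allocr_free(alloc);
    free(buf);
}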