replace n_views and n_children in ggml_tensor with a hash table in the allocator
This commit is contained in:
parent
af7bd42b2a
commit
e39e62ba4a
4 changed files with 51 additions and 32 deletions
73
ggml-alloc.c
73
ggml-alloc.c
|
@ -14,6 +14,35 @@
|
|||
//#define AT_PRINTF printf
|
||||
#define AT_PRINTF(...) ((void)0)
|
||||
|
||||
struct hash_node {
|
||||
struct ggml_tensor * t;
|
||||
int n_children;
|
||||
int n_views;
|
||||
};
|
||||
|
||||
static size_t hash(void * p) {
|
||||
return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
|
||||
}
|
||||
|
||||
static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) {
|
||||
size_t h = hash(t);
|
||||
|
||||
// linear probing
|
||||
size_t i = h;
|
||||
while (hash_table[i].t != NULL) {
|
||||
if (hash_table[i].t == t) {
|
||||
return &hash_table[i];
|
||||
}
|
||||
i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
|
||||
if (i == h) {
|
||||
// hash table is full
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
hash_table[i].t = t;
|
||||
return &hash_table[i];
|
||||
}
|
||||
|
||||
// TODO: GGML_PAD ?
|
||||
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
|
||||
|
@ -35,6 +64,7 @@ struct ggml_allocator {
|
|||
size_t alignment;
|
||||
int n_free_blocks;
|
||||
struct free_block free_blocks[MAX_FREE_BLOCKS];
|
||||
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
||||
size_t max_size;
|
||||
bool measure;
|
||||
|
||||
|
@ -215,6 +245,7 @@ struct ggml_allocator * ggml_allocator_new(void * data, size_t size, size_t alig
|
|||
/*.alignment = */ alignment,
|
||||
/*.n_free_blocks = */ 0,
|
||||
/*.free_blocks = */ {{0}},
|
||||
/*.hash_table = */ {{0}},
|
||||
/*.max_size = */ 0,
|
||||
/*.measure = */ false,
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
|
@ -241,6 +272,7 @@ struct ggml_allocator * ggml_allocator_new_measure(size_t alignment) {
|
|||
/*.alignment = */ alignment,
|
||||
/*.n_free_blocks = */ 0,
|
||||
/*.free_blocks = */ {{0}},
|
||||
/*.hash_table = */ {{0}},
|
||||
/*.max_size = */ 0,
|
||||
/*.measure = */ true,
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
|
@ -305,7 +337,7 @@ static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
|
|||
return parent;
|
||||
}
|
||||
|
||||
bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
switch (op) {
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_DIAG_MASK_ZERO:
|
||||
|
@ -333,6 +365,7 @@ bool ggml_op_can_inplace(enum ggml_op op) {
|
|||
}
|
||||
|
||||
static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * node) {
|
||||
struct hash_node * ht = alloc->hash_table;
|
||||
if (node->data == NULL) {
|
||||
if (ggml_is_view(node)) {
|
||||
size_t offset;
|
||||
|
@ -360,10 +393,12 @@ static void allocate_node(struct ggml_allocator * alloc, struct ggml_tensor * no
|
|||
if (parent == NULL) {
|
||||
break;
|
||||
}
|
||||
if (parent->data != NULL && parent->n_children == 1 && parent->n_views == 0 && ggml_are_same_layout(node, parent) && ggml_op_can_inplace(node->op)) {
|
||||
struct hash_node * p_hn = hash_get(ht, parent);
|
||||
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent) && ggml_op_can_inplace(node->op)) {
|
||||
if (ggml_is_view(parent)) {
|
||||
struct ggml_tensor * view_src = get_view_source(parent);
|
||||
if (view_src->n_views == 1 && view_src->n_children == 0 && view_src->data == parent->data) {
|
||||
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
||||
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
|
||||
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
|
||||
// the parent's data that it will need later (same layout requirement). the problem is that then
|
||||
// we cannot free the tensor because the original address of the allocation is lost.
|
||||
|
@ -391,21 +426,9 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
|||
struct ggml_cgraph ** graphs, int n_graphs,
|
||||
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
|
||||
|
||||
// reset counters
|
||||
for (int g = 0; g < n_graphs; g++) {
|
||||
struct ggml_cgraph * gf = graphs[g];
|
||||
for (int i = 0; i < gf->n_nodes; i++) {
|
||||
struct ggml_tensor * node = gf->nodes[i];
|
||||
node->n_children = 0;
|
||||
node->n_views = 0;
|
||||
}
|
||||
|
||||
for (int i = 0; i < gf->n_leafs; i++) {
|
||||
struct ggml_tensor * leaf = gf->leafs[i];
|
||||
leaf->n_children = 0;
|
||||
leaf->n_views = 0;
|
||||
}
|
||||
}
|
||||
// reset hash table
|
||||
struct hash_node * ht = alloc->hash_table;
|
||||
memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE);
|
||||
|
||||
// count number of children and views
|
||||
for (int g = 0; g < n_graphs; g++) {
|
||||
|
@ -415,7 +438,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
|||
|
||||
if (ggml_is_view(node)) {
|
||||
struct ggml_tensor * view_src = get_view_source(node);
|
||||
view_src->n_views += 1;
|
||||
hash_get(ht, view_src)->n_views += 1;
|
||||
}
|
||||
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
|
@ -423,7 +446,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
|||
if (parent == NULL) {
|
||||
break;
|
||||
}
|
||||
parent->n_children += 1;
|
||||
hash_get(ht, parent)->n_children += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -474,16 +497,18 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
|||
if (parent == NULL) {
|
||||
break;
|
||||
}
|
||||
parent->n_children -= 1;
|
||||
struct hash_node * p_hn = hash_get(ht, parent);
|
||||
p_hn->n_children -= 1;
|
||||
|
||||
//AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
|
||||
|
||||
if (parent->n_children == 0 && parent->n_views == 0) {
|
||||
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
|
||||
if (ggml_is_view(parent)) {
|
||||
struct ggml_tensor * view_src = get_view_source(parent);
|
||||
view_src->n_views -= 1;
|
||||
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
||||
view_src_hn->n_views -= 1;
|
||||
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
|
||||
if (view_src->n_views == 0 && view_src->n_children == 0 && view_src->data != node->data) {
|
||||
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
|
||||
ggml_allocator_free_tensor(alloc, view_src);
|
||||
}
|
||||
}
|
||||
|
|
2
ggml.c
2
ggml.c
|
@ -4612,8 +4612,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
|||
/*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
|
||||
/*.name =*/ { 0 },
|
||||
/*.extra =*/ NULL,
|
||||
/*.n_children =*/ 0,
|
||||
/*.n_views =*/ 0,
|
||||
/*.padding =*/ { 0 },
|
||||
};
|
||||
|
||||
|
|
6
ggml.h
6
ggml.h
|
@ -451,11 +451,7 @@ extern "C" {
|
|||
|
||||
void * extra; // extra things e.g. for ggml-cuda.cu
|
||||
|
||||
// temp - used by allocator
|
||||
int n_children;
|
||||
int n_views;
|
||||
|
||||
char padding[16];
|
||||
char padding[4];
|
||||
};
|
||||
|
||||
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
||||
|
|
|
@ -1813,7 +1813,7 @@ static bool llama_eval_internal(
|
|||
ggml_allocator_alloc_graph_tensors(lctx.alloc, gf);
|
||||
#endif
|
||||
|
||||
// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
|
||||
// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
||||
|
||||
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
||||
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue