ggml : use a hash table instead of a per-tensor `visited` flag to deduplicate nodes when building computation graphs
This commit is contained in:
parent
e371b716ca
commit
6542a035f9
4 changed files with 50 additions and 35 deletions
|
@ -1342,10 +1342,17 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
|
||||||
// expand the graph nodes without creating leafs.
|
// expand the graph nodes without creating leafs.
|
||||||
struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) {
|
struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) {
|
||||||
// check if already visited
|
// check if already visited
|
||||||
if (t->visited) {
|
for (int i = 0; i < g->n_nodes; i++) {
|
||||||
|
if (g->nodes[i] == t) {
|
||||||
return t;
|
return t;
|
||||||
}
|
}
|
||||||
t->visited = true;
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < g->n_leafs; i++) {
|
||||||
|
if (g->leafs[i] == t) {
|
||||||
|
return t;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < GGML_MAX_SRC; ++i) {
|
for (int i = 0; i < GGML_MAX_SRC; ++i) {
|
||||||
if (t->src[i]) {
|
if (t->src[i]) {
|
||||||
|
|
51
ggml.c
51
ggml.c
|
@ -4592,7 +4592,6 @@ struct ggml_tensor * ggml_new_tensor_impl(
|
||||||
/*.op =*/ GGML_OP_NONE,
|
/*.op =*/ GGML_OP_NONE,
|
||||||
/*.op_params =*/ {0},
|
/*.op_params =*/ {0},
|
||||||
/*.is_param =*/ false,
|
/*.is_param =*/ false,
|
||||||
/*.visited =*/ false,
|
|
||||||
/*.grad =*/ NULL,
|
/*.grad =*/ NULL,
|
||||||
/*.src =*/ { NULL },
|
/*.src =*/ { NULL },
|
||||||
/*.perf_runs =*/ 0,
|
/*.perf_runs =*/ 0,
|
||||||
|
@ -15743,6 +15742,34 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small");
|
||||||
|
|
||||||
|
static size_t hash(void * p) {
|
||||||
|
return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool hash_insert(void * hash_table[], void * p) {
|
||||||
|
size_t h = hash(p);
|
||||||
|
|
||||||
|
// linear probing
|
||||||
|
size_t i = h;
|
||||||
|
while (hash_table[i] != NULL && hash_table[i] != p) {
|
||||||
|
i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
|
||||||
|
if (i == h) {
|
||||||
|
// hash table is full
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (hash_table[i] == p) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// insert
|
||||||
|
hash_table[i] = p;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
|
static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
|
||||||
if (node->grad == NULL) {
|
if (node->grad == NULL) {
|
||||||
// this usually happens when we generate intermediate nodes from constants in the backward pass
|
// this usually happens when we generate intermediate nodes from constants in the backward pass
|
||||||
|
@ -15753,11 +15780,9 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
|
||||||
}
|
}
|
||||||
|
|
||||||
// check if already visited
|
// check if already visited
|
||||||
if (node->visited) {
|
if (hash_insert(cgraph->visited_hash_table, node)) {
|
||||||
GGML_ASSERT(cgraph->n_nodes > 0 || cgraph->n_leafs > 0); // to fix this, call ggml_graph_close() after building the graph
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
node->visited = true;
|
|
||||||
|
|
||||||
for (int i = 0; i < GGML_MAX_SRC; ++i) {
|
for (int i = 0; i < GGML_MAX_SRC; ++i) {
|
||||||
if (node->src[i]) {
|
if (node->src[i]) {
|
||||||
|
@ -15809,31 +15834,17 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
|
void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
|
||||||
GGML_ASSERT(!cgraph->closed && "graph is closed");
|
|
||||||
ggml_build_forward_impl(cgraph, tensor, true);
|
ggml_build_forward_impl(cgraph, tensor, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_graph_close(struct ggml_cgraph * cgraph) {
|
|
||||||
if (cgraph->closed) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
for (int i = 0; i < cgraph->n_nodes; ++i) {
|
|
||||||
cgraph->nodes[i]->visited = false;
|
|
||||||
}
|
|
||||||
for (int i = 0; i < cgraph->n_leafs; ++i) {
|
|
||||||
cgraph->leafs[i]->visited = false;
|
|
||||||
}
|
|
||||||
cgraph->closed = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
|
struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
|
||||||
struct ggml_cgraph result = {
|
struct ggml_cgraph result = {
|
||||||
/*.n_nodes =*/ 0,
|
/*.n_nodes =*/ 0,
|
||||||
/*.n_leafs =*/ 0,
|
/*.n_leafs =*/ 0,
|
||||||
/*.closed =*/ false,
|
|
||||||
/*.nodes =*/ { NULL },
|
/*.nodes =*/ { NULL },
|
||||||
/*.grads =*/ { NULL },
|
/*.grads =*/ { NULL },
|
||||||
/*.leafs =*/ { NULL },
|
/*.leafs =*/ { NULL },
|
||||||
|
/*.hash_table =*/ { NULL },
|
||||||
/*.perf_runs =*/ 0,
|
/*.perf_runs =*/ 0,
|
||||||
/*.perf_cycles =*/ 0,
|
/*.perf_cycles =*/ 0,
|
||||||
/*.perf_time_us =*/ 0,
|
/*.perf_time_us =*/ 0,
|
||||||
|
@ -16145,8 +16156,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
||||||
ggml_graph_close(cgraph);
|
|
||||||
|
|
||||||
if (n_threads <= 0) {
|
if (n_threads <= 0) {
|
||||||
n_threads = GGML_DEFAULT_N_THREADS;
|
n_threads = GGML_DEFAULT_N_THREADS;
|
||||||
}
|
}
|
||||||
|
|
16
ggml.h
16
ggml.h
|
@ -422,8 +422,7 @@ extern "C" {
|
||||||
// op params - allocated as int32_t for alignment
|
// op params - allocated as int32_t for alignment
|
||||||
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)];
|
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)];
|
||||||
|
|
||||||
uint32_t is_param:1;
|
bool is_param;
|
||||||
uint32_t visited:1; // used to build graphs
|
|
||||||
|
|
||||||
struct ggml_tensor * grad;
|
struct ggml_tensor * grad;
|
||||||
struct ggml_tensor * src[GGML_MAX_SRC];
|
struct ggml_tensor * src[GGML_MAX_SRC];
|
||||||
|
@ -460,16 +459,22 @@ extern "C" {
|
||||||
void * abort_callback_data;
|
void * abort_callback_data;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// next prime after GGML_MAX_NODES
|
||||||
|
// #define GGML_GRAPH_HASHTABLE_SIZE 4099
|
||||||
|
// next prime after GGML_MAX_NODES * 2 (nodes + leafs)
|
||||||
|
#define GGML_GRAPH_HASHTABLE_SIZE 8273
|
||||||
|
|
||||||
// computation graph
|
// computation graph
|
||||||
struct ggml_cgraph {
|
struct ggml_cgraph {
|
||||||
int n_nodes;
|
int n_nodes;
|
||||||
int n_leafs;
|
int n_leafs;
|
||||||
bool closed;
|
|
||||||
|
|
||||||
struct ggml_tensor * nodes[GGML_MAX_NODES];
|
struct ggml_tensor * nodes[GGML_MAX_NODES];
|
||||||
struct ggml_tensor * grads[GGML_MAX_NODES];
|
struct ggml_tensor * grads[GGML_MAX_NODES];
|
||||||
struct ggml_tensor * leafs[GGML_MAX_NODES];
|
struct ggml_tensor * leafs[GGML_MAX_NODES];
|
||||||
|
|
||||||
|
void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
||||||
|
|
||||||
// performance
|
// performance
|
||||||
int perf_runs;
|
int perf_runs;
|
||||||
int64_t perf_cycles;
|
int64_t perf_cycles;
|
||||||
|
@ -1351,11 +1356,6 @@ extern "C" {
|
||||||
|
|
||||||
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||||
|
|
||||||
// resets the visited flag for all the tensors in the graph
|
|
||||||
// called by ggml_graph_plan()
|
|
||||||
// shouldn't be necessary to call manually except building when building multiple graphs without computing them
|
|
||||||
GGML_API void ggml_graph_close(struct ggml_cgraph * cgraph);
|
|
||||||
|
|
||||||
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
||||||
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
||||||
|
|
||||||
|
|
|
@ -1701,18 +1701,17 @@ static bool llama_eval_internal(
|
||||||
// logits -> probs
|
// logits -> probs
|
||||||
//cur = ggml_soft_max_inplace(ctx0, cur);
|
//cur = ggml_soft_max_inplace(ctx0, cur);
|
||||||
|
|
||||||
//fprintf(stderr, "graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
|
|
||||||
|
|
||||||
// run the computation
|
// run the computation
|
||||||
ggml_build_forward_expand(&gf, cur);
|
ggml_build_forward_expand(&gf, cur);
|
||||||
|
|
||||||
|
// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
|
||||||
|
|
||||||
#if GGML_USE_MPI
|
#if GGML_USE_MPI
|
||||||
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
|
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
if (lctx.ctx_metal && N == 1) {
|
if (lctx.ctx_metal && N == 1) {
|
||||||
ggml_graph_close(&gf); // should only be required for the Metal backend, as ggml_graph_plan() does this automatically
|
|
||||||
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
|
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
|
||||||
ggml_metal_graph_compute(lctx.ctx_metal, &gf);
|
ggml_metal_graph_compute(lctx.ctx_metal, &gf);
|
||||||
ggml_metal_get_tensor (lctx.ctx_metal, cur);
|
ggml_metal_get_tensor (lctx.ctx_metal, cur);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue