avoid creating unnecessary grad tensors
previously we needed to create grads for the model parameters so that expand(..) correctly populated cgraph->leafs & cgraph->grads. this wasted memory, because an unnecessary grad was automatically created for each op: we later set the grad manually (e.g. t35->grad = expand(gb, ...) ), which discarded the automatically generated grad and left its memory wasted. improved this by changing expand(..) to not use ggml_build_forward_expand. expand now sets cgraph->nodes but not the leafs. cgraph->leafs & cgraph->grads are set in another pass after the last expand call.
This commit is contained in:
parent
59544f0cdf
commit
7be3222b64
1 changed files with 96 additions and 28 deletions
|
@ -1337,6 +1337,82 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
|
||||||
return inpL;
|
return inpL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// expand the graph nodes without creating leafs.
|
||||||
|
struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) {
|
||||||
|
// check if already visited
|
||||||
|
for (int i = 0; i < g->n_nodes; i++) {
|
||||||
|
if (g->nodes[i] == t) {
|
||||||
|
return t;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < g->n_leafs; i++) {
|
||||||
|
if (g->leafs[i] == t) {
|
||||||
|
return t;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (t->src0) {
|
||||||
|
expand(g, t->src0);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (t->src1) {
|
||||||
|
expand(g, t->src1);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < GGML_MAX_OPT; ++i) {
|
||||||
|
if (t->opt[i]) {
|
||||||
|
expand(g, t->opt[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_ASSERT(g->n_nodes < GGML_MAX_NODES);
|
||||||
|
|
||||||
|
if (strlen(t->name) == 0) {
|
||||||
|
snprintf(t->name, sizeof(t->name), "node_%d", g->n_nodes);
|
||||||
|
}
|
||||||
|
|
||||||
|
g->nodes[g->n_nodes] = t;
|
||||||
|
g->grads[g->n_nodes] = t->grad;
|
||||||
|
g->n_nodes++;
|
||||||
|
return t;
|
||||||
|
}
|
||||||
|
|
||||||
|
void graph_set_leafs_grads(struct ggml_cgraph * g) {
|
||||||
|
// moves leaf nodes to g->leafs.
|
||||||
|
// i.e. g->n_nodes might change.
|
||||||
|
int n_nodes = 0;
|
||||||
|
for (int i = 0; i < g->n_nodes; ++i) {
|
||||||
|
struct ggml_tensor * node = g->nodes[i];
|
||||||
|
const bool is_leaf = node->op == GGML_OP_NONE && node->grad == NULL;
|
||||||
|
if (is_leaf) {
|
||||||
|
GGML_ASSERT(g->n_leafs < GGML_MAX_NODES);
|
||||||
|
|
||||||
|
if (strlen(node->name) == 0) {
|
||||||
|
snprintf(node->name, sizeof(node->name), "leaf_%d", g->n_leafs);
|
||||||
|
}
|
||||||
|
|
||||||
|
g->leafs[g->n_leafs] = node;
|
||||||
|
g->n_leafs++;
|
||||||
|
} else {
|
||||||
|
GGML_ASSERT(n_nodes < GGML_MAX_NODES);
|
||||||
|
|
||||||
|
if (strlen(node->name) == 0) {
|
||||||
|
snprintf(node->name, sizeof(node->name), "node_%d", n_nodes);
|
||||||
|
}
|
||||||
|
|
||||||
|
g->nodes[n_nodes] = node;
|
||||||
|
g->grads[n_nodes] = node->grad;
|
||||||
|
n_nodes++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int i=n_nodes; i < g->n_nodes; ++i) {
|
||||||
|
g->nodes[n_nodes] = NULL;
|
||||||
|
g->grads[n_nodes] = NULL;
|
||||||
|
}
|
||||||
|
g->n_nodes = n_nodes;
|
||||||
|
}
|
||||||
|
|
||||||
struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
|
struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
|
||||||
struct my_llama_model * model,
|
struct my_llama_model * model,
|
||||||
struct ggml_context * ctx0,
|
struct ggml_context * ctx0,
|
||||||
|
@ -1375,11 +1451,6 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
|
||||||
const int n_ff = get_n_ff(&hparams);
|
const int n_ff = get_n_ff(&hparams);
|
||||||
const int rope_mode = 0;
|
const int rope_mode = 0;
|
||||||
|
|
||||||
auto expand = [] (struct ggml_cgraph * g, struct ggml_tensor * t) -> struct ggml_tensor * {
|
|
||||||
ggml_build_forward_expand(g, t);
|
|
||||||
return t;
|
|
||||||
};
|
|
||||||
|
|
||||||
int last_buf = -1;
|
int last_buf = -1;
|
||||||
size_t buf_offs[2] = { 0, 0 };
|
size_t buf_offs[2] = { 0, 0 };
|
||||||
size_t buf_size[2] = { size_buf_0,
|
size_t buf_size[2] = { size_buf_0,
|
||||||
|
@ -1423,6 +1494,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * {
|
auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * {
|
||||||
int64_t ne0 = n_embd/n_head;
|
int64_t ne0 = n_embd/n_head;
|
||||||
int64_t ne1 = N;
|
int64_t ne1 = N;
|
||||||
|
@ -1472,28 +1544,21 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
|
||||||
|
|
||||||
use_buf(-1);
|
use_buf(-1);
|
||||||
|
|
||||||
// need to create grads for model parameters, so that expand(..) correctly populates cgraph->leafs & cgraph->grads
|
model->tok_embeddings->grad = NULL;
|
||||||
// this wastes memory, because unnecessary grad for each op is automatically created:
|
model->norm->grad = NULL;
|
||||||
// the automatically generated grad is unnecessary because we later manually set the grad (e.g. t35->grad = expand(gb, ...) ).
|
model->output->grad = NULL;
|
||||||
// this discards the automatically generated grad resulting in wasted memory.
|
|
||||||
// TODO: improve this, possibly by changing expand(..) to not use ggml_build_forward_expand.
|
|
||||||
// expand should correctly set cgraph->nodes.
|
|
||||||
// cgraph->leafs & cgraph->grads could be set in another pass after the last expand call.
|
|
||||||
model->tok_embeddings->grad = ggml_dup_tensor(ctx0, model->tok_embeddings->grad);
|
|
||||||
model->norm->grad = ggml_dup_tensor(ctx0, model->norm->grad);
|
|
||||||
model->output->grad = ggml_dup_tensor(ctx0, model->output->grad);
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
struct my_llama_layer & layer = model->layers[il];
|
struct my_llama_layer & layer = model->layers[il];
|
||||||
layer.attention_norm->grad = ggml_dup_tensor(ctx0, layer.attention_norm->grad);
|
layer.attention_norm->grad = NULL;
|
||||||
layer.wq->grad = ggml_dup_tensor(ctx0, layer.wq->grad);
|
layer.wq->grad = NULL;
|
||||||
layer.wk->grad = ggml_dup_tensor(ctx0, layer.wk->grad);
|
layer.wk->grad = NULL;
|
||||||
layer.wv->grad = ggml_dup_tensor(ctx0, layer.wv->grad);
|
layer.wv->grad = NULL;
|
||||||
layer.wo->grad = ggml_dup_tensor(ctx0, layer.wo->grad);
|
layer.wo->grad = NULL;
|
||||||
layer.ffn_norm->grad = ggml_dup_tensor(ctx0, layer.ffn_norm->grad);
|
layer.ffn_norm->grad = NULL;
|
||||||
layer.w1->grad = ggml_dup_tensor(ctx0, layer.w1->grad);
|
layer.w1->grad = NULL;
|
||||||
layer.w2->grad = ggml_dup_tensor(ctx0, layer.w2->grad);
|
layer.w2->grad = NULL;
|
||||||
layer.w3->grad = ggml_dup_tensor(ctx0, layer.w3->grad);
|
layer.w3->grad = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
clr_buf(0);
|
clr_buf(0);
|
||||||
|
@ -1717,10 +1782,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
|
||||||
*gb = *gf;
|
*gb = *gf;
|
||||||
|
|
||||||
// t36->grad gets set to one by optimizer, so we need the tensor.
|
// t36->grad gets set to one by optimizer, so we need the tensor.
|
||||||
GGML_ASSERT(t36->grad != NULL);
|
|
||||||
// initialize it with 1.0f to make sure.
|
// initialize it with 1.0f to make sure.
|
||||||
// use_buf(-1);
|
use_buf(-1);
|
||||||
// t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f));
|
t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f));
|
||||||
|
|
||||||
use_buf(0);
|
use_buf(0);
|
||||||
t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch);
|
t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch);
|
||||||
|
@ -1839,7 +1903,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
|
||||||
use_buf(0);
|
use_buf(0);
|
||||||
t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch);
|
t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch);
|
||||||
use_buf(-1);
|
use_buf(-1);
|
||||||
model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab);
|
model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab);
|
||||||
// clr_buf(1);
|
// clr_buf(1);
|
||||||
// clr_buf(0);
|
// clr_buf(0);
|
||||||
|
|
||||||
|
@ -1850,6 +1914,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
|
||||||
printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]);
|
printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// now that all grads are created, set the graph leafs and grads
|
||||||
|
graph_set_leafs_grads(gf);
|
||||||
|
graph_set_leafs_grads(gb);
|
||||||
|
|
||||||
return t36;
|
return t36;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue