ggml : change ggml_scale to take a float instead of tensor (#4573)
* ggml : change ggml_scale to take a float instead of tensor

* ggml : fix CPU implementation

* tests : fix test-grad0

ggml-ci
parent 769a7bc85e
commit afefa319f1
12 changed files with 82 additions and 205 deletions
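The pattern repeated at every call site below: the scale factor was previously passed as a 1-element F32 tensor created with ggml_new_f32, and is now passed as a plain float. A minimal sketch of the change at a typical call site (the helper name scale_attn_scores and the head_dim parameter are illustrative, not part of the patch; the ggml.h prototype itself is not shown in this excerpt):

#include "ggml.h"
#include <math.h>

// scale attention scores by 1/sqrt(head_dim)
static struct ggml_tensor * scale_attn_scores(struct ggml_context * ctx,
                                              struct ggml_tensor  * kq,
                                              int                   head_dim) {
    // old API (before this commit): the constant had to live in a tensor
    //   return ggml_scale(ctx, kq, ggml_new_f32(ctx, 1.0f/sqrtf((float)head_dim)));
    // new API: the constant is an ordinary float argument
    return ggml_scale(ctx, kq, 1.0f/sqrtf((float)head_dim));
}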
@@ -575,10 +575,7 @@ static struct ggml_tensor * forward(
 
         // KQ_scaled = KQ / sqrt(n_embd/n_head)
         // KQ_scaled shape [n_past + N, N, n_head, 1]
-        struct ggml_tensor * KQ_scaled =
-            ggml_scale(ctx0,
-                    KQ,
-                    ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
+        struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
 
         // KQ_masked = mask_past(KQ_scaled)
         // KQ_masked shape [n_past + N, N, n_head, 1]
@@ -844,10 +841,7 @@ static struct ggml_tensor * forward_batch(
 
         // KQ_scaled = KQ / sqrt(n_embd/n_head)
         // KQ_scaled shape [n_past + N, N, n_head, n_batch]
-        struct ggml_tensor * KQ_scaled =
-            ggml_scale(ctx0,
-                    KQ,
-                    ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
+        struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
         assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch);
 
         // KQ_masked = mask_past(KQ_scaled)
@@ -1131,10 +1125,7 @@ static struct ggml_tensor * forward_lora(
 
         // KQ_scaled = KQ / sqrt(n_embd/n_head)
         // KQ_scaled shape [n_past + N, N, n_head, 1]
-        struct ggml_tensor * KQ_scaled =
-            ggml_scale(ctx0,
-                    KQ,
-                    ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
+        struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
 
         // KQ_masked = mask_past(KQ_scaled)
         // KQ_masked shape [n_past + N, N, n_head, 1]
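For a sense of the value being passed in the hunks above: with, say, n_embd = 4096 and n_head = 32 (hypothetical sizes, not taken from this diff), the per-head dimension is 4096/32 = 128 and the scale is 1/sqrt(128) ≈ 0.0884. A standalone check:

#include <math.h>
#include <stdio.h>

int main(void) {
    // hypothetical model sizes, only to illustrate the float now passed to ggml_scale
    const int   n_embd = 4096;
    const int   n_head = 32;
    const float scale  = 1.0f/sqrtf((float)n_embd/n_head); // 1/sqrt(128) ~= 0.0884
    printf("attention scale: %.4f\n", scale);
    return 0;
}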
@@ -309,7 +309,7 @@ static struct ggml_cgraph * build_graph_lora(
 ) {
     struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
     if (scaling != 1.0f) {
-        ab = ggml_scale(ctx, ab, ggml_new_f32(ctx, scaling));
+        ab = ggml_scale(ctx, ab, scaling);
     }
     struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
 
@@ -269,7 +269,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h
     float rope_freq_scale = 1.0f;
     GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
     GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-    GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
+    GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
     if (rope_freq_scale != 1.0f) {
         hparams->rope_freq_scale = 1.0f / rope_freq_scale;
     }
@@ -612,6 +612,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     const int n_rot       = hparams.n_embd_head();
+    const int n_embd_head = hparams.n_embd_head();
     const int n_embd_gqa  = hparams.n_embd_gqa();
 
     const float rms_norm_eps    = hparams.f_norm_rms_eps;
     const float rope_freq_base  = hparams.rope_freq_base;
     const float rope_freq_scale = hparams.rope_freq_scale;
@@ -680,10 +681,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
         checkpoints.push_back(t01);
     }
 
-    struct ggml_tensor * kv_scale = NULL;
-    if (!enable_flash_attn) {
-        kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head));
-    }
+    const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head);
 
     for (int il = 0; il < n_layer; ++il) {
         struct my_llama_layer & layer = model->layers[il];
@@ -781,32 +779,32 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     // make sure some tensors are not reallocated by inserting new temporary nodes depending on them
     int n_leafs_before = gb->n_leafs;
     int n_nodes_before = gb->n_nodes;
-    struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f);
+
     // output tensors
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f));
     // input gradient
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
     GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
     ggml_allocr_alloc(alloc, t36->grad);
     // KQ_pos
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
 
     // make sure base model tensors data cannot be used in viewable operations
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, one));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, 1.0f));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, 1.0f));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, 1.0f));
     for (int il = 0; il < n_layer; ++il) {
         struct my_llama_layer & layer = model->layers[il];
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, one));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, one));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, one));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, one));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, one));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, one));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, one));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, one));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, 1.0f));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, 1.0f));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, 1.0f));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, 1.0f));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, 1.0f));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, 1.0f));
     }
 
     // allocating checkpoints in one block to reduce memory fragmentation
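The scale-by-1.0f nodes in the finetune graph builder above do no arithmetic work: as the surrounding comments say, they only add graph nodes that depend on the named tensors so the allocator will not reallocate or reuse them. Before this change that required materializing a separate `one` tensor; now the constant travels with the op. A hypothetical wrapper (not part of the patch) capturing the idiom:

// keep a tensor's allocation alive by adding a dependent no-op node to the graph
static void keep_alive(struct ggml_context * ctx, struct ggml_cgraph * gb, struct ggml_tensor * t) {
    // scaling in place by 1.0f leaves the data untouched but makes t reachable from gb
    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t, 1.0f));
}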
@@ -330,12 +330,6 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
                               ggml_repeat(ctx0, model.pre_ln_b, embeddings));
     }
 
-    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-    ggml_allocr_alloc(ctx->alloc, KQ_scale);
-    if (!ggml_allocr_is_measure(ctx->alloc)) {
-        ggml_set_f32(KQ_scale, 1.0f / sqrt((float)d_head));
-    }
-
     // loop over layers
     for (int il = 0; il < n_layer - 1; il++) {
         struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
@@ -356,7 +350,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
             struct ggml_tensor * Q =
                 ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), ggml_mul_mat(ctx0, model.layers[il].q_w, cur));
 
-            Q = ggml_scale_inplace(ctx0, Q, KQ_scale);
+            Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
             Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
             Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
             Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
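In the CLIP graph builder the change removes more than a call site: the 1-element KQ_scale tensor previously had to be allocated through ggml_allocr and could only be filled outside of measure mode, whereas a float argument (presumably stored in the op's parameters rather than in tensor data) needs no allocation or initialization at all. A condensed before/after sketch, reusing the names from the hunks above:

// before: allocate the constant, then fill it only when not measuring
//   struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
//   ggml_allocr_alloc(ctx->alloc, KQ_scale);
//   if (!ggml_allocr_is_measure(ctx->alloc)) {
//       ggml_set_f32(KQ_scale, 1.0f / sqrt((float)d_head));
//   }
//   Q = ggml_scale_inplace(ctx0, Q, KQ_scale);

// after: the same graph-building code works for both the measure pass and the real pass
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));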
@@ -369,10 +369,7 @@ static struct ggml_tensor * llama_build_train_graphs(
     checkpoints.push_back(t00);
     checkpoints.push_back(t01);
 
-    struct ggml_tensor * kv_scale = NULL;
-    if (!enable_flash_attn) {
-        kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head));
-    }
+    const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head);
 
     for (int il = 0; il < n_layer; ++il) {
         struct my_llama_layer & layer = model->layers[il];
@@ -444,14 +441,13 @@ static struct ggml_tensor * llama_build_train_graphs(
     // make sure some tensors are not reallocated by inserting new temporary nodes depending on them
     int n_leafs_before = gb->n_leafs;
     int n_nodes_before = gb->n_nodes;
-    struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f);
     // output tensors
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f));
     // input gradient
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
     // KQ_pos
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
     GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
 
     ggml_allocr_alloc(alloc, t36->grad);