make rms_norm_eps a parameter (#2374)
* make rms_norm_eps a parameter
* add rms_norm_eps to command line
* fix baby llama, test-grad0
* use scientific notation for eps param in the help

ggml-ci
This commit is contained in:
parent
b3f138d058
commit
41c674161f
11 changed files with 89 additions and 56 deletions
|
@ -16,6 +16,8 @@
|
|||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
static const float rms_norm_eps = 1e-6f;
|
||||
|
||||
struct random_normal_distribution {
|
||||
std::mt19937 gen;
|
||||
std::normal_distribution<float> rd;
|
||||
|
@ -439,7 +441,7 @@ struct ggml_tensor * forward(
|
|||
// norm
|
||||
{
|
||||
// cur shape [n_embd,N,1,1]
|
||||
cur = ggml_rms_norm(ctx0, inpL);
|
||||
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
||||
|
||||
// cur = attention_norm*cur
|
||||
cur = ggml_mul(ctx0,
|
||||
|
@ -562,7 +564,7 @@ struct ggml_tensor * forward(
|
|||
// norm
|
||||
{
|
||||
// cur shape [n_embd,N,1,1]
|
||||
cur = ggml_rms_norm(ctx0, inpFF);
|
||||
cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
|
||||
|
||||
// cur = ffn_norm*cur
|
||||
// cur shape [n_embd,N,1,1]
|
||||
|
@ -606,7 +608,7 @@ struct ggml_tensor * forward(
|
|||
{
|
||||
|
||||
// inpL shape [n_embd,N,1,1]
|
||||
inpL = ggml_rms_norm(ctx0, inpL);
|
||||
inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
||||
|
||||
// inpL = norm*inpL
|
||||
// inpL shape [n_embd,N,1,1]
|
||||
|
@ -694,7 +696,7 @@ struct ggml_tensor * forward_batch(
|
|||
// norm
|
||||
{
|
||||
// cur shape [n_embd,N*n_batch,1,1]
|
||||
cur = ggml_rms_norm(ctx0, inpL);
|
||||
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
||||
assert_shape_2d(cur, n_embd, N*n_batch);
|
||||
|
||||
// cur = attention_norm*cur
|
||||
|
@ -857,7 +859,7 @@ struct ggml_tensor * forward_batch(
|
|||
// norm
|
||||
{
|
||||
// cur shape [n_embd,N*n_batch,1,1]
|
||||
cur = ggml_rms_norm(ctx0, inpFF);
|
||||
cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
|
||||
assert_shape_2d(cur, n_embd, N*n_batch);
|
||||
|
||||
// cur = ffn_norm*cur
|
||||
|
@ -910,7 +912,7 @@ struct ggml_tensor * forward_batch(
|
|||
{
|
||||
|
||||
// inpL shape [n_embd,N*n_batch,1,1]
|
||||
inpL = ggml_rms_norm(ctx0, inpL);
|
||||
inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
||||
assert_shape_2d(inpL, n_embd, N*n_batch);
|
||||
|
||||
// inpL = norm*inpL
|
||||
|
@ -979,7 +981,7 @@ struct ggml_tensor * forward_batch_wo_cache(
|
|||
// norm
|
||||
{
|
||||
// cur shape [n_embd,N*n_batch,1,1]
|
||||
cur = ggml_rms_norm(ctx0, inpL);
|
||||
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
||||
assert_shape_2d(cur, n_embd, N*n_batch);
|
||||
|
||||
// cur = attention_norm*cur
|
||||
|
@ -1085,7 +1087,7 @@ struct ggml_tensor * forward_batch_wo_cache(
|
|||
// norm
|
||||
{
|
||||
// cur shape [n_embd,N*n_batch,1,1]
|
||||
cur = ggml_rms_norm(ctx0, inpFF);
|
||||
cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
|
||||
assert_shape_2d(cur, n_embd, N*n_batch);
|
||||
|
||||
// cur = ffn_norm*cur
|
||||
|
@ -1138,7 +1140,7 @@ struct ggml_tensor * forward_batch_wo_cache(
|
|||
{
|
||||
|
||||
// inpL shape [n_embd,N*n_batch,1,1]
|
||||
inpL = ggml_rms_norm(ctx0, inpL);
|
||||
inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
||||
assert_shape_2d(inpL, n_embd, N*n_batch);
|
||||
|
||||
// inpL = norm*inpL
|
||||
|
@ -1203,7 +1205,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
|
|||
|
||||
// norm
|
||||
{
|
||||
cur = ggml_rms_norm(ctx0, inpL);
|
||||
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
||||
assert_shape_2d(cur, n_embd, N*n_batch);
|
||||
|
||||
// cur = attention_norm*cur
|
||||
|
@ -1267,7 +1269,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
|
|||
{
|
||||
// norm
|
||||
{
|
||||
cur = ggml_rms_norm(ctx0, inpFF);
|
||||
cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
|
||||
assert_shape_2d(cur, n_embd, N*n_batch);
|
||||
|
||||
// cur = ffn_norm*cur
|
||||
|
@ -1311,7 +1313,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
|
|||
// norm
|
||||
{
|
||||
|
||||
inpL = ggml_rms_norm(ctx0, inpL);
|
||||
inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
||||
assert_shape_2d(inpL, n_embd, N*n_batch);
|
||||
|
||||
// inpL = norm*inpL
|
||||
|
@ -1603,7 +1605,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
|
|||
struct my_llama_layer & layer = model->layers[il];
|
||||
// tensors with values necessary for backward pass are in persistent buf(-1)
|
||||
// other tensors with buf(0) and buf(1) are only temporary needed, and their memory reused after layer is completed.
|
||||
use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t02, n_embd, N*n_batch);
|
||||
use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t02, n_embd, N*n_batch);
|
||||
use_buf( 0); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch);
|
||||
use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch);
|
||||
use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch);
|
||||
|
@ -1623,7 +1625,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
|
|||
use_buf(-1); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch);
|
||||
use_buf( 0); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch);
|
||||
use_buf(-1); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch);
|
||||
use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21)); assert_shape_2d(t22, n_embd, N*n_batch);
|
||||
use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21, rms_norm_eps)); assert_shape_2d(t22, n_embd, N*n_batch);
|
||||
use_buf( 0); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch);
|
||||
use_buf(-1); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch);
|
||||
use_buf(-1); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch);
|
||||
|
@ -1666,7 +1668,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
|
|||
}
|
||||
clr_buf(0);
|
||||
use_buf(0);
|
||||
struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t31, n_embd, N*n_batch);
|
||||
struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t31, n_embd, N*n_batch);
|
||||
struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch);
|
||||
struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch);
|
||||
use_buf(-1);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue