diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 4965881ec..f9dc0aaa6 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -8,6 +8,8 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +static const float rms_norm_eps = 1e-6f; + float frand() { return (float)rand()/(float)RAND_MAX; } @@ -562,7 +564,7 @@ struct ggml_tensor * forward( // norm { // cur shape [n_embd,N,1,1] - cur = ggml_rms_norm(ctx0, inpL); + cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); // cur = attention_norm*cur cur = ggml_mul(ctx0, @@ -685,7 +687,7 @@ struct ggml_tensor * forward( // norm { // cur shape [n_embd,N,1,1] - cur = ggml_rms_norm(ctx0, inpFF); + cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); // cur = ffn_norm*cur // cur shape [n_embd,N,1,1] @@ -729,7 +731,7 @@ struct ggml_tensor * forward( { // inpL shape [n_embd,N,1,1] - inpL = ggml_rms_norm(ctx0, inpL); + inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); // inpL = norm*inpL // inpL shape [n_embd,N,1,1] @@ -817,7 +819,7 @@ struct ggml_tensor * forward_batch( // norm { // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpL); + cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); assert_shape_2d(cur, n_embd, N*n_batch); // cur = attention_norm*cur @@ -981,7 +983,7 @@ struct ggml_tensor * forward_batch( // norm { // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpFF); + cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); assert_shape_2d(cur, n_embd, N*n_batch); // cur = ffn_norm*cur @@ -1034,7 +1036,7 @@ struct ggml_tensor * forward_batch( { // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_rms_norm(ctx0, inpL); + inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); assert_shape_2d(inpL, n_embd, N*n_batch); // inpL = norm*inpL @@ -1104,7 +1106,7 @@ struct ggml_tensor * forward_lora( // norm { // cur shape [n_embd,N,1,1] - cur = ggml_rms_norm(ctx0, inpL); + cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); // cur = attention_norm*cur cur = ggml_mul(ctx0, @@ -1251,7 +1253,7 @@ struct ggml_tensor * forward_lora( // norm { // cur shape [n_embd,N,1,1] - cur = ggml_rms_norm(ctx0, inpFF); + cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); // cur = ffn_norm*cur // cur shape [n_embd,N,1,1] @@ -1295,7 +1297,7 @@ struct ggml_tensor * forward_lora( { // inpL shape [n_embd,N,1,1] - inpL = ggml_rms_norm(ctx0, inpL); + inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); // inpL = norm*inpL // inpL shape [n_embd,N,1,1] diff --git a/tests/test-grad0.c b/tests/test-grad0.c index ef20bce51..6d312216d 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -850,7 +850,7 @@ int main(int argc, const char ** argv) { ggml_set_param(ctx0, x[i]); } - struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0])); + struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f)); check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY); }