metal : fix GELU kernel numerical stability by using precise::tanh

2023-08-23 15:04:53 +03:00 · 2023-08-23 15:04:53 +03:00 · 0a85ae7397
commit 0a85ae7397
parent b693000c2e
2 changed files with 8 additions and 3 deletions
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -540,7 +540,7 @@ void ggml_metal_graph_compute(
            id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];

            const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
-            const int node_end   = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb;
+            const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);

            for (int ind = node_start; ind < node_end; ++ind) {
                const int i = has_concur ? ctx->concur_list[ind] : ind;
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -87,7 +87,12 @@ kernel void kernel_gelu(
    device       float * dst,
    uint tpig[[thread_position_in_grid]]) {
    float x = src0[tpig];
-    dst[tpig] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+
+    // BEWARE !!!
+    // Simply using "tanh" instead of "precise::tanh" will sometimes result in NaNs!
+    // This was observed with Falcon 7B and 40B models
+    //
+    dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
 }

 kernel void kernel_soft_max(