diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m
index 2de105640..85003ebdd 100644
--- a/examples/mtl/mtl.m
+++ b/examples/mtl/mtl.m
@@ -782,49 +782,49 @@ int llama_mtl_eval(
     // TODO
     const float * logits = ctx->out.contents;
 
-    {
-        struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check");
-        if (t->type == GGML_TYPE_F32) {
-            const const float * data = (float *) ctx->out.contents;
-            printf("data: ");
-            for (int i = 0; i < (int) t->ne[0]; i++) {
-                printf("%f ", data[i]);
-            }
-            printf("\n");
-            double sum = 0.0;
-            for (int i = 0; i < ggml_nelements(t); i++) {
-                double cur = data[i];
-                if (isinf(cur)) continue;
-                sum += cur;
-            }
-            printf("sum: %f\n", sum);
-        } else if (t->type == GGML_TYPE_F16) {
-            ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents;
-            printf("data: ");
-            for (int i = 0; i < (int) t->ne[0]; i++) {
-                printf("%f ", ggml_fp16_to_fp32(data[i]));
-            }
-            printf("\n");
-            double sum = 0.0;
-            printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
-            for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
-                for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
-                    for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
-                        for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
-                            const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
-                            const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
-                            const float curf = ggml_fp16_to_fp32(cur);
-                            if (isinf(curf)) continue;
-                            sum += curf;
-                        }
-                    }
-                }
-            }
-            printf("sum: %f\n", sum);
-        } else {
-            GGML_ASSERT(false && "not implemented");
-        }
-    }
+    //{
+    //    struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check");
+    //    if (t->type == GGML_TYPE_F32) {
+    //        const float * data = (const float *) ctx->out.contents;
+    //        printf("data: ");
+    //        for (int i = 0; i < (int) t->ne[0]; i++) {
+    //            printf("%f ", data[i]);
+    //        }
+    //        printf("\n");
+    //        double sum = 0.0;
+    //        for (int i = 0; i < ggml_nelements(t); i++) {
+    //            double cur = data[i];
+    //            if (isinf(cur)) continue;
+    //            sum += cur;
+    //        }
+    //        printf("sum: %f\n", sum);
+    //    } else if (t->type == GGML_TYPE_F16) {
+    //        const ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents;
+    //        printf("data: ");
+    //        for (int i = 0; i < (int) t->ne[0]; i++) {
+    //            printf("%f ", ggml_fp16_to_fp32(data[i]));
+    //        }
+    //        printf("\n");
+    //        double sum = 0.0;
+    //        printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
+    //        for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
+    //            for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
+    //                for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
+    //                    for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
+    //                        const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
+    //                        const ggml_fp16_t cur = *((const ggml_fp16_t *)((const char *) data + offs));
+    //                        const float curf = ggml_fp16_to_fp32(cur);
+    //                        if (isinf(curf)) continue;
+    //                        sum += curf;
+    //                    }
+    //                }
+    //            }
+    //        }
+    //        printf("sum: %f\n", sum);
+    //    } else {
+    //        GGML_ASSERT(false && "not implemented");
+    //    }
+    //}
 
     return 0;
 }
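The block commented out above was a debug dump: it fetched the tensor tagged "mtl-check" from the Metal output buffer and printed its first row plus a checksum. The F16 branch walks the tensor by its byte strides nb[] rather than by flat index, so permuted or otherwise non-contiguous views are summed correctly. Below is a minimal standalone sketch of that traversal pattern; the helper name is hypothetical, only the ggml.h types and ggml_fp16_to_fp32() already used above are assumed, and it reads from t->data rather than the Metal output buffer:

    #include <math.h>

    #include "ggml.h"

    // Hypothetical helper: checksum all F16 elements of a (possibly
    // non-contiguous) tensor. Infinities are skipped, as in the dump above,
    // so e.g. -INF entries from a masked attention matrix do not poison the sum.
    static double tensor_sum_f16(const struct ggml_tensor * t) {
        double sum = 0.0;
        for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
            for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
                for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
                    for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
                        // nb[] holds strides in bytes, hence the char * arithmetic
                        const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
                        const float cur = ggml_fp16_to_fp32(*(const ggml_fp16_t *)((const char *) t->data + offs));
                        if (isinf(cur)) continue;
                        sum += cur;
                    }
                }
            }
        }
        return sum;
    }
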
diff --git a/llama.cpp b/llama.cpp
index 81d998c18..e0fbc6f73 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1243,10 +1243,6 @@ static bool llama_eval_internal(
     ggml_cgraph gf = {};
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
-    // TODO: TMP !!!
-    ggml_cgraph gf_export = {};
-    gf_export.n_threads = 1;
-
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1299,12 +1295,6 @@
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
             //ggml_build_forward_expand(&gf, t);
-
-            // TODO: TMP !!!!!!!!!!
-            if (il == 0) {
-                ggml_build_forward_expand(&gf_export, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf_export, ggml_cpy(ctx0, Vcur, v));
-            }
         }
 
         struct ggml_tensor * Q =
@@ -1404,11 +1394,6 @@
         cur = ggml_mul(ctx0, cur, tmp);
 
-        // TODO: TMP !!!!
-        if (il == 0) {
-            ggml_set_name(cur, "mtl-check");
-        }
-
         cur = ggml_mul_mat(ctx0,
                 model.layers[il].w2,
                 cur);
 
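The hunks above drop the temporary gf_export graph and the "mtl-check" tag that fed it. The underlying debugging trick remains available in ggml: name a node with ggml_set_name() while the graph is built, then retrieve it by name from the same context with ggml_get_tensor(), exactly as the removed code and the mtl.m example did. A minimal sketch under that assumption; the function and the "my-check" tag are placeholders, not part of this change:

    #include <stdio.h>

    #include "ggml.h"

    // Hypothetical demo of the tag-and-fetch pattern used by the removed code.
    static void tag_and_fetch(struct ggml_context * ctx) {
        struct ggml_tensor * inp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor * cur = ggml_sqr(ctx, inp);

        // tag the intermediate node while constructing the graph
        ggml_set_name(cur, "my-check");

        // ... graph construction and computation continue ...

        // later: look the node up by name from the same context
        struct ggml_tensor * t = ggml_get_tensor(ctx, "my-check");
        if (!t) {
            fprintf(stderr, "%s: failed to find tensor 'my-check'\n", __func__);
            return;
        }
    }
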
@@ -1444,84 +1429,83 @@
     // logits -> probs
     //inpL = ggml_soft_max_inplace(ctx0, inpL);
 
-    // TODO: TMP !!!!!!!!!!!!!!!!!!!!
     // run the computation
-    //ggml_build_forward_expand(&gf, inpL);
-    //ggml_graph_compute       (ctx0, &gf);
+    ggml_build_forward_expand(&gf, inpL);
+    ggml_graph_compute       (ctx0, &gf);
 
-    // lets export a smaller graph to get things rolling -- baby steps first
-    {
-        struct ggml_tensor * t = ggml_get_tensor(ctx0, "mtl-check");
-        if (!t) {
-            fprintf(stderr, "%s: failed to find tensor 'mtl-check'\n", __func__);
-            exit(1);
-        }
-        ggml_build_forward_expand(&gf_export, t);
-    }
+    // TODO: not needed anymore, keeping for a bit
+    //// lets export a smaller graph to get things rolling -- baby steps first
+    //{
+    //    struct ggml_tensor * t = ggml_get_tensor(ctx0, "mtl-check");
+    //    if (!t) {
+    //        fprintf(stderr, "%s: failed to find tensor 'mtl-check'\n", __func__);
+    //        exit(1);
+    //    }
+    //    ggml_build_forward_expand(&gf, t);
+    //}
 
     // print
-    {
-        auto print_t_f32 = [&](struct ggml_tensor * t) {
-            float * data = (float *)t->data;
-            printf("data: ");
-            for (int i = 0; i < (int) t->ne[0]; i++) {
-                printf("%f ", data[i]);
-            }
-            printf("\n");
-            double sum = 0.0;
-            for (int i = 0; i < ggml_nelements(t); i++) {
-                double cur = data[i];
-                if (isinf(cur)) continue;
-                sum += data[i];
-            }
-            printf("sum: %f\n", sum);
-        };
-        auto print_t_f16 = [&](struct ggml_tensor * t) {
-            ggml_fp16_t * data = (ggml_fp16_t *)t->data;
-            printf("data: ");
-            for (int i = 0; i < (int) t->ne[0]; i++) {
-                printf("%f ", ggml_fp16_to_fp32(data[i]));
-            }
-            printf("\n");
-            double sum = 0.0;
-            printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
-            for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
-                for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
-                    for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
-                        for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
-                            const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
-                            const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
-                            const float curf = ggml_fp16_to_fp32(cur);
-                            if (isinf(curf)) continue;
-                            sum += curf;
-                        }
-                    }
-                }
-            }
-            printf("sum: %f\n", sum);
-        };
+    //{
+    //    auto print_t_f32 = [&](struct ggml_tensor * t) {
+    //        float * data = (float *)t->data;
+    //        printf("data: ");
+    //        for (int i = 0; i < (int) t->ne[0]; i++) {
+    //            printf("%f ", data[i]);
+    //        }
+    //        printf("\n");
+    //        double sum = 0.0;
+    //        for (int i = 0; i < ggml_nelements(t); i++) {
+    //            double cur = data[i];
+    //            if (isinf(cur)) continue;
+    //            sum += data[i];
+    //        }
+    //        printf("sum: %f\n", sum);
+    //    };
+    //    auto print_t_f16 = [&](struct ggml_tensor * t) {
+    //        ggml_fp16_t * data = (ggml_fp16_t *)t->data;
+    //        printf("data: ");
+    //        for (int i = 0; i < (int) t->ne[0]; i++) {
+    //            printf("%f ", ggml_fp16_to_fp32(data[i]));
+    //        }
+    //        printf("\n");
+    //        double sum = 0.0;
+    //        printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
+    //        for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
+    //            for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
+    //                for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
+    //                    for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
+    //                        const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
+    //                        const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
+    //                        const float curf = ggml_fp16_to_fp32(cur);
+    //                        if (isinf(curf)) continue;
+    //                        sum += curf;
+    //                    }
+    //                }
+    //            }
+    //        }
+    //        printf("sum: %f\n", sum);
+    //    };
 
-        ggml_graph_compute(ctx0, &gf_export);
+    //    ggml_graph_compute(ctx0, &gf);
 
-        {
-            auto * t = ggml_get_tensor(ctx0, "mtl-check");
-            switch (t->type) {
-                case GGML_TYPE_F32:
-                    print_t_f32(t);
-                    break;
-                case GGML_TYPE_F16:
-                    print_t_f16(t);
-                    break;
-                default:
-                    fprintf(stderr, "%s: unsupported type\n", __func__);
-                    exit(1);
-            }
-        }
-    }
+    //    {
+    //        auto * t = ggml_get_tensor(ctx0, "mtl-check");
+    //        switch (t->type) {
+    //            case GGML_TYPE_F32:
+    //                print_t_f32(t);
+    //                break;
+    //            case GGML_TYPE_F16:
+    //                print_t_f16(t);
+    //                break;
+    //            default:
+    //                fprintf(stderr, "%s: unsupported type\n", __func__);
+    //                exit(1);
+    //        }
+    //    }
+    //}
 
     if (cgraph_fname) {
-        //ggml_graph_export(&gf, cgraph_fname);
-        ggml_graph_export(&gf_export, cgraph_fname);
+        ggml_graph_export(&gf, cgraph_fname);
    }
 
 #ifdef GGML_PERF
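
With the temporary export graph gone, llama_eval_internal() is back to a single cgraph that is built, computed, and, when cgraph_fname is set, serialized with ggml_graph_export(). A condensed sketch of that flow, assuming the same ggml API used in the diff; the wrapper function is hypothetical and the parameter names simply mirror the code above:

    #include "ggml.h"

    // Hypothetical wrapper restating the flow after this change: the graph
    // that is exported is exactly the graph that was computed.
    static void eval_and_maybe_export(struct ggml_context * ctx0,
                                      struct ggml_tensor  * inpL,
                                      int                   n_threads,
                                      const char          * cgraph_fname) {
        struct ggml_cgraph gf = {};
        gf.n_threads = n_threads;

        ggml_build_forward_expand(&gf, inpL); // inpL (the logits) becomes the graph output
        ggml_graph_compute(ctx0, &gf);        // run the full computation

        if (cgraph_fname) {
            ggml_graph_export(&gf, cgraph_fname); // export what was actually run
        }
    }

Exporting the full computed graph, rather than the hand-picked "mtl-check" subgraph, keeps the Metal example working against the same graph the CPU path evaluates.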