diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m
index 58f1f0371..47bbdb4ad 100644
--- a/examples/mtl/mtl.m
+++ b/examples/mtl/mtl.m
@@ -117,15 +117,15 @@ struct ggml_mtl_context * llama_mtl_init(
 
         ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"];
         ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil];
-        fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, ctx->pipeline_add);
+        fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, (void *) ctx->pipeline_add);
 
         ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"];
         ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil];
-        fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, ctx->pipeline_relu);
+        fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, (void *) ctx->pipeline_relu);
 
         ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max" constantValues:constants error:nil];
         ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil];
-        fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, ctx->pipeline_soft_max);
+        fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, (void *) ctx->pipeline_soft_max);
     }
 
     // MTLBuffer approach
@@ -217,11 +217,11 @@ int llama_mtl_eval(
 
     // copy the input data to the GPU
     {
-        struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "input");
+        struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
 
-        id<MTLBuffer> id_dst = llama_mtl_get_buffer(ctx, inp, &offs_src0);
+        id<MTLBuffer> id_dst = llama_mtl_get_buffer(ctx, embd, &offs_src0);
 
-        memcpy(id_dst.contents + offs_src0, inp->data, ggml_nbytes(inp));
+        memcpy((char *) id_dst.contents + offs_src0, embd->data, ggml_nbytes(embd));
     }
 
     for (int i = 0; i < gf->n_nodes; ++i) {
diff --git a/llama.cpp b/llama.cpp
index 9dccf0ed1..e6d544615 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1243,6 +1243,10 @@ static bool llama_eval_internal(
     ggml_cgraph gf = {};
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
+    // TODO: TMP !!!
+    ggml_cgraph gf_export = {};
+    gf_export.n_threads = 1;
+
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1264,6 +1268,11 @@ static bool llama_eval_internal(
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
         }
 
+        // TODO: TMP !!!!
+        if (il == 0) {
+            ggml_set_name(cur, "mtl-check");
+        }
+
         // self-attention
         {
             // compute Q and K and RoPE them
@@ -1420,12 +1429,17 @@ static bool llama_eval_internal(
     // logits -> probs
     //inpL = ggml_soft_max_inplace(ctx0, inpL);
 
+    // TODO: TMP !!!!!!!!!!!!!!!!!!!!
     // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute       (ctx0, &gf);
+    //ggml_build_forward_expand(&gf, inpL);
+    //ggml_graph_compute       (ctx0, &gf);
+
+    // lets export a smaller graph to get things rolling -- baby steps first
+    ggml_build_forward_expand(&gf_export, ggml_get_tensor(ctx0, "mtl-check"));
 
     if (cgraph_fname) {
-        ggml_graph_export(&gf, cgraph_fname);
+        //ggml_graph_export(&gf, cgraph_fname);
+        ggml_graph_export(&gf_export, cgraph_fname);
     }
 
 #ifdef GGML_PERF