diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m
index 6d509a2ab..1327de0b4 100644
--- a/examples/mtl/mtl.m
+++ b/examples/mtl/mtl.m
@@ -267,6 +267,7 @@ int llama_mtl_eval(
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_TRANSPOSE:
+        case GGML_OP_PERMUTE:
             {
                 // noop
             } break;
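Note on the new noop case: ggml_permute moves no data, it only reorders the ne[]/nb[] metadata of a view over its source tensor, so the Metal backend has nothing to encode for it (the strides are consumed later by whichever kernel reads the view). A minimal C sketch of that invariant, using only public ggml calls; the function name and tensor sizes are illustrative, not part of the patch:

// Illustrative check (not from the patch): a permuted tensor aliases its
// source's data; only the shape and byte strides are reordered.
#include "ggml.h"
#include <assert.h>

static void check_permute_is_a_view(struct ggml_context * ctx) {
    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 4, 3, 2, 1);
    struct ggml_tensor * p = ggml_permute(ctx, a, 1, 0, 2, 3); // swap dims 0 and 1

    assert(p->data  == a->data);  // no copy - same backing memory
    assert(p->ne[0] == a->ne[1]); // shape is reordered...
    assert(p->nb[0] == a->nb[1]); // ...and so are the byte strides
}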
@@ -344,81 +345,101 @@ int llama_mtl_eval(
                 [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
             } break;
         case GGML_OP_MUL_MAT:
-            if (gf->nodes[i]->src0->type == GGML_TYPE_F32) {
-                // for F32 x F32 we use MPS
-
-                if (encoder != nil) {
-                    [encoder endEncoding];
-                    encoder = nil;
-                }
-
-                // use MPSMatrixMultiplication
-                id<MTLBuffer> id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0);
-                id<MTLBuffer> id_src1 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1);
-                id<MTLBuffer> id_dst  = llama_mtl_get_buffer(ctx, gf->nodes[i],       &offs_dst);
-
-                const int64_t ncols0 = gf->nodes[i]->src0->ne[0];
-                const int64_t nrows0 = gf->nodes[i]->src0->ne[1];
-
-                const int64_t ncols1 = gf->nodes[i]->src1->ne[0];
-                const int64_t nrows1 = gf->nodes[i]->src1->ne[1];
-
-                const int64_t ncols2 = gf->nodes[i]->ne[0];
-                const int64_t nrows2 = gf->nodes[i]->ne[1];
-
-                GGML_ASSERT(ncols0 == ncols1);
-
-                MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor
-                    matrixDescriptorWithRows:nrows0 columns:ncols0 rowBytes:gf->nodes[i]->src0->nb[1] dataType:MPSDataTypeFloat32];
-                MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor
-                    matrixDescriptorWithRows:nrows1 columns:ncols1 rowBytes:gf->nodes[i]->src1->nb[1] dataType:MPSDataTypeFloat32];
-                MPSMatrixDescriptor * desc2 = [MPSMatrixDescriptor
-                    matrixDescriptorWithRows:nrows2 columns:ncols2 rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32];
-
-                MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0 descriptor:desc0];
-                MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1 descriptor:desc1];
-                MPSMatrix * mat_dst  = [[MPSMatrix alloc] initWithBuffer:id_dst  offset:offs_dst  descriptor:desc2];
-
-                MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc] initWithDevice:ctx->device
-                    transposeLeft:false transposeRight:true resultRows:nrows1 resultColumns:nrows0 interiorColumns:ncols0 alpha:1.0 beta:0.0];
-
-                [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst];
-            } else {
-                // for Q4 x F32 we use custom kernel
-
-                if (encoder == nil) {
-                    encoder = [command_buffer computeCommandEncoder];
-                }
-
-                GGML_ASSERT(gf->nodes[i]->src0->ne[2] == 1);
-                GGML_ASSERT(gf->nodes[i]->src1->ne[2] == 1);
-
+            {
                 id<MTLBuffer> id_src0 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0);
                 id<MTLBuffer> id_src1 = llama_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1);
                 id<MTLBuffer> id_dst  = llama_mtl_get_buffer(ctx, gf->nodes[i],       &offs_dst);
 
                 const int64_t ne00 = gf->nodes[i]->src0->ne[0];
                 const int64_t ne01 = gf->nodes[i]->src0->ne[1];
+                const int64_t ne02 = gf->nodes[i]->src0->ne[2];
+
+                //const uint64_t nb00 = gf->nodes[i]->src0->nb[0];
+                //const uint64_t nb01 = gf->nodes[i]->src0->nb[1];
+                const uint64_t nb02 = gf->nodes[i]->src0->nb[2];
+
                 const int64_t ne10 = gf->nodes[i]->src1->ne[0];
                 const int64_t ne11 = gf->nodes[i]->src1->ne[1];
+                const int64_t ne12 = gf->nodes[i]->src1->ne[2];
+
+                //const uint64_t nb10 = gf->nodes[i]->src1->nb[0];
+                //const uint64_t nb11 = gf->nodes[i]->src1->nb[1];
+                const uint64_t nb12 = gf->nodes[i]->src1->nb[2];
+
                 const int64_t ne0 = gf->nodes[i]->ne[0];
                 const int64_t ne1 = gf->nodes[i]->ne[1];
+                const int64_t ne2 = gf->nodes[i]->ne[2];
 
-                [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0];
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:5];
-                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:6];
-                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:7];
-                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:8];
-                [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
+                //const uint64_t nb0 = gf->nodes[i]->nb[0];
+                //const uint64_t nb1 = gf->nodes[i]->nb[1];
+                const uint64_t nb2 = gf->nodes[i]->nb[2];
 
-                printf("mul_mat: %lldx%lld * %lldx%lld -> %lldx%lld\n", ne00, ne01, ne10, ne11, ne0, ne1);
+                const enum ggml_type src0t = gf->nodes[i]->src0->type;
+                const enum ggml_type src1t = gf->nodes[i]->src1->type;
+                const enum ggml_type dstt  = gf->nodes[i]->type;
 
-                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
+                printf("mul_mat: src0 - %s[%lld, %lld, %lld]\n", ggml_type_name(src0t), ne00, ne01, ne02);
+                printf("mul_mat: src1 - %s[%lld, %lld, %lld]\n", ggml_type_name(src1t), ne10, ne11, ne12);
+                printf("mul_mat: dst  - %s[%lld, %lld, %lld]\n", ggml_type_name(dstt),  ne0,  ne1,  ne2);
+                printf("mul_mat: %s * %s -> %s\n", ggml_type_name(src0t), ggml_type_name(src1t), ggml_type_name(dstt));
+
+                GGML_ASSERT(ne00 == ne10);
+                GGML_ASSERT(ne02 == ne12);
+
+                if (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) {
+                    if (encoder != nil) {
+                        [encoder endEncoding];
+                        encoder = nil;
+                    }
+
+                    MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
+                    MPSDataType src1dt = src1t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
+
+                    // for F32 x F32 we use MPS
+                    MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor
+                        matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:gf->nodes[i]->src0->nb[1] dataType:src0dt];
+
+                    MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor
+                        matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:gf->nodes[i]->src1->nb[1] dataType:src1dt];
+
+                    MPSMatrixDescriptor * desc = [MPSMatrixDescriptor
+                        matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32];
+
+                    MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc]
+                        initWithDevice:ctx->device transposeLeft:false transposeRight:true
+                        resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
+
+                    for (int64_t i02 = 0; i02 < ne02; ++i02) {
+                        size_t offs_src0_cur = offs_src0 + i02*nb02;
+                        size_t offs_src1_cur = offs_src1 + i02*nb12;
+                        size_t offs_dst_cur  = offs_dst  + i02*nb2;
+
+                        MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0];
+                        MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1];
+                        MPSMatrix * mat_dst  = [[MPSMatrix alloc] initWithBuffer:id_dst  offset:offs_dst_cur  descriptor:desc ];
+
+                        [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst];
+                    }
+                } else {
+                    if (encoder == nil) {
+                        encoder = [command_buffer computeCommandEncoder];
+                    }
+
+                    // for Q4 x F32 we use custom kernel
+                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                    [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                    [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                    [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:5];
+                    [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:6];
+                    [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:7];
+                    [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:8];
+                    [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
 
+                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
+                }
             } break;
         case GGML_OP_GET_ROWS:
             {
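Aside on the new MPS path above: MPSMatrixMultiplication encodes a single 2D GEMM, so the patch handles the third dimension by creating one MPSMatrix view per slice, advanced by the slice's byte stride (nb02/nb12/nb2), and encoding the same kernel once per slice. A standalone sketch of that pattern, assuming tightly packed row-major F32 data; the helper name mps_mul_mat_batched and its parameters are illustrative, not ggml or patch API:

#import <Metal/Metal.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>

// Hypothetical helper showing the per-slice MPS encoding pattern:
// dst[b] = src1[b] * src0[b]^T for each of the nbatch 2D slices.
// transposeRight:true with left = src1, right = src0 reproduces ggml's
// mul_mat convention, as in the patch.
static void mps_mul_mat_batched(
        id<MTLDevice> device, id<MTLCommandBuffer> cmd,
        id<MTLBuffer> src0, id<MTLBuffer> src1, id<MTLBuffer> dst,
        int64_t nrows0, int64_t nrows1, int64_t ncols, int64_t nbatch) {
    MPSMatrixDescriptor * d0 = [MPSMatrixDescriptor
        matrixDescriptorWithRows:nrows0 columns:ncols rowBytes:ncols*sizeof(float) dataType:MPSDataTypeFloat32];
    MPSMatrixDescriptor * d1 = [MPSMatrixDescriptor
        matrixDescriptorWithRows:nrows1 columns:ncols rowBytes:ncols*sizeof(float) dataType:MPSDataTypeFloat32];
    MPSMatrixDescriptor * dd = [MPSMatrixDescriptor
        matrixDescriptorWithRows:nrows1 columns:nrows0 rowBytes:nrows0*sizeof(float) dataType:MPSDataTypeFloat32];

    // one kernel object, reused for every slice
    MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc]
        initWithDevice:device transposeLeft:false transposeRight:true
        resultRows:nrows1 resultColumns:nrows0 interiorColumns:ncols alpha:1.0 beta:0.0];

    for (int64_t b = 0; b < nbatch; ++b) {
        // one MPSMatrix view per 2D slice, offset by the slice stride in bytes
        MPSMatrix * m0 = [[MPSMatrix alloc] initWithBuffer:src0
            offset:b*nrows0*ncols*sizeof(float) descriptor:d0];
        MPSMatrix * m1 = [[MPSMatrix alloc] initWithBuffer:src1
            offset:b*nrows1*ncols*sizeof(float) descriptor:d1];
        MPSMatrix * md = [[MPSMatrix alloc] initWithBuffer:dst
            offset:b*nrows1*nrows0*sizeof(float) descriptor:dd];

        [mul encodeToCommandBuffer:cmd leftMatrix:m1 rightMatrix:m0 resultMatrix:md];
    }
}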
diff --git a/ggml.c b/ggml.c
index 7a3f74771..114136122 100644
--- a/ggml.c
+++ b/ggml.c
@@ -14613,7 +14613,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
     const int64_t * ne = tensor->ne;
     const size_t  * nb = tensor->nb;
 
-    fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n",
+    fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %32s\n",
             ggml_type_name(tensor->type),
             ggml_op_name (tensor->op),
             tensor->n_dims,
@@ -14627,7 +14627,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
     const int64_t * ne = tensor->ne;
     const size_t  * nb = tensor->nb;
 
-    fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n",
+    fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %32s\n",
             arg,
             ggml_type_name(tensor->type),
             ggml_op_name (tensor->op),
@@ -15067,6 +15067,10 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
                     {
                         tensor = ggml_transpose(*ctx_eval, args[0]);
                     } break;
+                case GGML_OP_PERMUTE:
+                    {
+                        tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
+                    } break;
                 default:
                     {
                         tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
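Two notes on the ggml.c side. The %16s -> %32s change is cosmetic: printf's %Ns is a minimum field width (it never truncates), so this just widens the name column of the exported graph listing to keep rows with longer names such as "mtl-check" aligned. The new GGML_OP_PERMUTE import case rebuilds the node as a ggml_view_4d over its source with the recorded shape; the stride and offset arguments are 0 here, which only yields a usable view if the recorded nb[] values are restored on the result afterwards - an assumption, since that fix-up is not visible in this hunk. In sketch form (illustrative, not the patch itself):

// ggml_view_4d(ctx, a, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset) aliases a's
// data with the given shape, byte strides nb1..nb3, and byte offset.
// Passing 0s leaves the view degenerate until the saved nb[] is patched in
// (assumed, not shown in the hunk above).
static struct ggml_tensor * import_permute_stub(struct ggml_context * ctx,
        struct ggml_tensor * src, const int64_t ne[4]) {
    return ggml_view_4d(ctx, src, ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
}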
diff --git a/llama.cpp b/llama.cpp
index 5e7c3db86..f6d93bd93 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1289,16 +1289,22 @@ static bool llama_eval_internal(
                         (   n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
 
-            struct ggml_tensor * t = ggml_cpy(ctx0, Kcur, k);
-            // TODO: TMP !!!!
-            if (il == 0) {
-                ggml_set_name(t, "mtl-check");
-            }
+            //struct ggml_tensor * t = ggml_cpy(ctx0, Vcur, v);
+            //// TODO: TMP !!!!
+            //if (il == 0) {
+            //    ggml_set_name(t, "mtl-check");
+            //}
 
             // important: storing RoPE-ed version of K in the KV cache!
-            //ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-            ggml_build_forward_expand(&gf, t);
+            ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+            //ggml_build_forward_expand(&gf, t);
+
+            // TODO: TMP !!!!!!!!!!
+            if (il == 0) {
+                ggml_build_forward_expand(&gf_export, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(&gf_export, ggml_cpy(ctx0, Vcur, v));
+            }
         }
 
         struct ggml_tensor * Q =
@@ -1318,6 +1324,10 @@ static bool llama_eval_internal(
         // K * Q
         struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
         ggml_set_name(KQ, "KQ");
+        // TODO: TMP !!!!
+        if (il == 0) {
+            ggml_set_name(KQ, "mtl-check");
+        }
 
         // KQ_scaled = KQ / sqrt(n_embd/n_head)
         struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
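Net effect of the llama.cpp changes: the K cache copy is built directly into the main graph again, the temporary "mtl-check" debug name moves from the K cache copy to the first layer's KQ node, and the layer-0 KV cache copies are additionally recorded into the export graph gf_export (presumably declared elsewhere on this branch; it is not shown in the diff). The point of the fixed name is to anchor a CPU-vs-Metal comparison. A sketch of such a comparison, assuming ggml_graph_get_tensor is available; the helper name and the tolerance are illustrative, not part of the patch:

// Illustrative helper (not from the patch): locate the tagged node in the
// CPU graph and compare its contents against the Metal-computed buffer.
#include "ggml.h"
#include <math.h>
#include <stdio.h>

static int check_mtl_result(struct ggml_cgraph * gf_cpu, const float * mtl_data) {
    struct ggml_tensor * t = ggml_graph_get_tensor(gf_cpu, "mtl-check");
    if (t == NULL) {
        fprintf(stderr, "no tensor named 'mtl-check' in the graph\n");
        return 1;
    }

    const float * cpu_data = (const float *) t->data;

    for (int64_t i = 0; i < ggml_nelements(t); ++i) {
        if (fabsf(cpu_data[i] - mtl_data[i]) > 1e-4f) { // arbitrary tolerance
            fprintf(stderr, "mismatch at %d: cpu %f vs mtl %f\n", (int) i, cpu_data[i], mtl_data[i]);
            return 1;
        }
    }

    return 0; // results match within tolerance
}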