diff --git a/.gitignore b/.gitignore
index d231f3ff8..edcb6b144 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,6 +17,7 @@ build-release/
 build-static/
 build-cublas/
 build-opencl/
+build-mtl/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bc23c2c5b..62f1467aa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,13 +64,14 @@ if (NOT MSVC)
 endif()

 # 3rd party libs
-option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
-option(LLAMA_BLAS "llama: use BLAS" OFF)
+option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
+option(LLAMA_BLAS "llama: use BLAS" OFF)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
-set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
-set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
-option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
+option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
+set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
+set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
+option(LLAMA_METAL "llama: use Metal" OFF)

 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -183,7 +184,7 @@ if (LLAMA_CUBLAS)

         enable_language(CUDA)

-        set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
+        set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)

         add_compile_definitions(GGML_USE_CUBLAS)
         add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
@@ -200,12 +201,37 @@ if (LLAMA_CUBLAS)
     endif()
 endif()

+if (LLAMA_METAL)
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+    find_library(METAL_FRAMEWORK Metal REQUIRED)
+    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
+
+    set(GGML_SOURCES_METAL ggml-mtl.m ggml-mtl.h)
+
+    add_compile_definitions(GGML_USE_METAL)
+    add_compile_definitions(GGML_METAL_NDEBUG)
+
+    # get full path to the file
+    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
+
+    # copy ggml-mtl.metal to bin directory
+    configure_file(ggml-mtl.metal bin/ggml-mtl.metal COPYONLY)
+
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
+        ${FOUNDATION_LIBRARY}
+        ${METAL_FRAMEWORK}
+        ${METALKIT_FRAMEWORK}
+        ${METALPERFORMANCE_FRAMEWORK}
+        )
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
         message(STATUS "CLBlast found")

-        set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
+        set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)

         add_compile_definitions(GGML_USE_CLBLAST)
@@ -370,8 +396,10 @@ endif()
 add_library(ggml OBJECT
             ggml.c
             ggml.h
-            ${GGML_CUDA_SOURCES}
-            ${GGML_OPENCL_SOURCES})
+            ${GGML_SOURCES_CUDA}
+            ${GGML_SOURCES_OPENCL}
+            ${GGML_SOURCES_METAL}
+            )

 target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
@@ -385,8 +413,6 @@ add_library(llama
             llama.cpp
             llama.h
            llama-util.h
-            examples/mtl/mtl.h # TODO: METAL TMP
-            examples/mtl/mtl.m # TODO: METAL TMP
             )

 target_include_directories(llama PUBLIC .)
@@ -394,22 +420,17 @@ target_compile_features(llama PUBLIC cxx_std_11) # don't bump
 target_link_libraries(llama PRIVATE
     ggml
     ${LLAMA_EXTRA_LIBS}
-    ${FOUNDATION_LIBRARY} # TODO: METAL TMP
-    ${METAL_FRAMEWORK} # TODO: METAL TMP
-    ${METALKIT_FRAMEWORK} # TODO: METAL TMP
-    ${METALPERFORMANCE_FRAMEWORK} # TODO: METAL TMP
     )
-target_compile_definitions(llama PRIVATE LLAMA_MTL_NDEBUG) # TODO: METAL TMP

 if (BUILD_SHARED_LIBS)
     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
 endif()

-if (GGML_CUDA_SOURCES)
+if (GGML_SOURCES_CUDA)
     message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
-    set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
-    set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
+    set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
+    set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
     set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF)
 endif()
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 97a3ffd1b..e23bf1cb3 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -37,8 +37,10 @@ else()
     add_subdirectory(save-load-state)
     add_subdirectory(benchmark)
     add_subdirectory(baby-llama)
-    add_subdirectory(mtl)
-    if(LLAMA_BUILD_SERVER)
+    if (LLAMA_METAL)
+        add_subdirectory(mtl)
+    endif()
+    if (LLAMA_BUILD_SERVER)
         add_subdirectory(server)
     endif()
 endif()
diff --git a/examples/common.cpp b/examples/common.cpp
index 53e9200fa..b5810f28f 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -301,8 +301,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.mem_test = true;
         } else if (arg == "--export") {
             params.export_cgraph = true;
-        } else if (arg == "--import") {
-            params.import_cgraph = true;
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -443,7 +441,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif
     fprintf(stderr, " --mtest compute maximum memory usage\n");
     fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
-    fprintf(stderr, " --import import a computation graph from 'llama.ggml'\n");
     fprintf(stderr, " --verbose-prompt print prompt before generation\n");
     fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
     fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
@@ -493,7 +490,6 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     lparams.use_mlock = params.use_mlock;
     lparams.logits_all = params.perplexity;
     lparams.embedding = params.embedding;
-    lparams.cgraph = params.import_cgraph;

     llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
diff --git a/examples/common.h b/examples/common.h
index c7d4d6e0e..66bdeb5e9 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -72,7 +72,6 @@ struct gpt_params {
     bool use_mlock = false; // use mlock to keep model in memory
     bool mem_test = false; // compute maximum memory usage
     bool export_cgraph = false; // export the computation graph
-    bool import_cgraph = false; // import a computation graph
     bool verbose_prompt = false; // print prompt tokens before generation
 };
diff --git a/examples/mtl/CMakeLists.txt b/examples/mtl/CMakeLists.txt
index 1de83a1b6..0fe3a7197 100644
--- a/examples/mtl/CMakeLists.txt
+++ b/examples/mtl/CMakeLists.txt
@@ -1,33 +1,6 @@
 if (APPLE)
-    #
-    # mtl
-
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK Metal REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
-    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
-
     set(TEST_TARGET mtl)
-    add_executable(${TEST_TARGET} mtl.cpp mtl.h mtl.m)
-    target_link_libraries(${TEST_TARGET} PRIVATE
-        ggml
-        ${FOUNDATION_LIBRARY}
-        ${METAL_FRAMEWORK}
-        ${METALKIT_FRAMEWORK}
-        ${METALPERFORMANCE_FRAMEWORK}
-        )
-
-    # TODO: temporary until the kernels are ready
-    # custom command to build mtl.metal into a library
-    # depends on the mtl.metal file
-    add_custom_target(mtl.metallib-tmp ALL DEPENDS ${CMAKE_BINARY_DIR}/mtl.metallib)
-
-    add_custom_command(
-        OUTPUT ${CMAKE_BINARY_DIR}/mtl.metallib
-        COMMAND xcrun -sdk macosx metal -c ${CMAKE_CURRENT_SOURCE_DIR}/mtl.metal -o ${CMAKE_BINARY_DIR}/mtl.air
-        COMMAND xcrun -sdk macosx metallib ${CMAKE_BINARY_DIR}/mtl.air -o ${CMAKE_BINARY_DIR}/mtl.metallib
-        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/mtl.metal
-        COMMENT "Building mtl.metallib"
-        )
+    add_executable(${TEST_TARGET} mtl.cpp)
+    target_link_libraries(${TEST_TARGET} PRIVATE ggml)
 endif()
diff --git a/examples/mtl/mtl.cpp b/examples/mtl/mtl.cpp
index 7411ea932..382976667 100644
--- a/examples/mtl/mtl.cpp
+++ b/examples/mtl/mtl.cpp
@@ -1,5 +1,5 @@
 #include "ggml.h"
-#include "mtl.h"
+#include "ggml-mtl.h"

 #include
 #include
@@ -51,7 +51,7 @@ int main(int argc, char ** argv) {
     }

     // this allocates all Metal resources and memory buffers
-    auto * ctx_mtl = llama_mtl_init(
+    auto * ctx_mtl = ggml_mtl_init(
             ggml_get_mem_buffer(ctx_data),
             ggml_get_mem_size (ctx_data),
             ggml_get_mem_buffer(ctx_eval),
@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
     const std::vector<int> tmp(n_batch, 1); // BOS

     // warmup
-    llama_mtl_eval(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
+    ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);

     const int n_iter = 16;
@@ -75,7 +75,7 @@

     // the actual inference happens here
     for (int i = 0; i < n_iter; ++i) {
-        llama_mtl_eval(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
+        ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
     }

     const int64_t t1 = ggml_time_us();
@@ -83,7 +83,7 @@
         printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter);
     }

-    llama_mtl_free(ctx_mtl);
+    ggml_mtl_free(ctx_mtl);

     ggml_free(ctx_data);
     ggml_free(ctx_eval);
diff --git a/examples/mtl/mtl.h b/ggml-mtl.h
similarity index 77%
rename from examples/mtl/mtl.h
rename to ggml-mtl.h
index ff92a6a7b..15256b27d 100644
--- a/examples/mtl/mtl.h
+++ b/ggml-mtl.h
@@ -11,7 +11,7 @@ extern "C" {

 struct ggml_mtl_context;

-struct ggml_mtl_context * llama_mtl_init(
+struct ggml_mtl_context * ggml_mtl_init(
         void * data_buf,
         size_t data_size,
         void * eval_buf,
@@ -20,18 +20,16 @@ struct ggml_mtl_context * ggml_mtl_init(
         size_t cach_size,
         size_t outp_size);

-void llama_mtl_free(struct ggml_mtl_context * ctx);
+void ggml_mtl_free(struct ggml_mtl_context * ctx);

 // return 0 on success
-int llama_mtl_eval(
+int ggml_mtl_graph_compute(
         struct ggml_mtl_context * ctx,
         struct ggml_cgraph * gf,
         const int * tokens,
         int n_tokens,
         int n_past);

-float * llama_mtl_get_logits(struct ggml_mtl_context * ctx);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/examples/mtl/mtl.m b/ggml-mtl.m
similarity index 92%
rename from examples/mtl/mtl.m
rename to ggml-mtl.m
index b8fd1c144..ecbb1a188 100644
--- a/examples/mtl/mtl.m
+++ b/ggml-mtl.m
@@ -1,4 +1,4 @@
-#import "mtl.h"
+#import "ggml-mtl.h"

 #import "ggml.h"
@@ -6,7 +6,7 @@
 #import
 #import

-#ifdef LLAMA_MTL_NDEBUG
+#ifdef GGML_METAL_NDEBUG
 #define mtl_printf(...)
 #else
 #define mtl_printf(...) fprintf(stderr, __VA_ARGS__)
@@ -85,9 +85,9 @@ struct ggml_mtl_context {
 // MSL code
 // TODO: move the contents here when ready
 //       for now it is easier to work in a separate file
-NSString * const msl_library_llama = @"see mtl.metal";
+NSString * const msl_library_source = @"see mtl.metal";

-struct ggml_mtl_context * llama_mtl_init(
+struct ggml_mtl_context * ggml_mtl_init(
         void * data_buf,
         size_t data_size,
         void * eval_buf,
@@ -122,7 +122,7 @@
     {
         NSError * error = nil;

-        ctx->library = [ctx->device newLibraryWithSource:msl_library_llama options:nil error:&error];
+        ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
         if (error) {
             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
             exit(1);
@@ -133,7 +133,10 @@
     {
         NSError * error = nil;

-        NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/mtl/mtl" ofType:@"metal"];
+        //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/mtl/mtl" ofType:@"metal"];
+        NSString * path = [[NSBundle mainBundle] pathForResource:@"ggml-mtl" ofType:@"metal"];
+        fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]);
+
         NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
         if (error) {
             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
@@ -220,7 +223,7 @@
     // TODO: how to use MTLStorageModeManaged?
     // TODO: see if we can avoid this copy somehow
     {
-        void * mem_buffer = data_buf;
+        const void * mem_buffer = data_buf;
         const size_t mem_size = data_size;

         //ctx->buffer_data = [ctx->device newBufferWithBytesNoCopy:mem_buffer length:mem_size options:MTLResourceStorageModeShared deallocator:nil];
@@ -261,18 +264,20 @@
     return ctx;
 }

-void llama_mtl_free(struct ggml_mtl_context * ctx) {
+void ggml_mtl_free(struct ggml_mtl_context * ctx) {
     fprintf(stderr, "%s: deallocating\n", __func__);

     free(ctx);
 }

 // get data / eval buffer + offset
-id<MTLBuffer> llama_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
+id<MTLBuffer> ggml_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
     const int64_t offs_data = (int64_t) t->data - (int64_t) ctx->data_buf;
     const int64_t offs_eval = (int64_t) t->data - (int64_t) ctx->eval_buf;
     const int64_t offs_cach = (int64_t) t->data - (int64_t) ctx->cach_buf;

+    //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
+
     //const size_t t_size = ggml_nbytes(t);

     id<MTLBuffer> result;
@@ -317,7 +322,7 @@ id<MTLBuffer> llama_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_te
     return result;
 }

-int llama_mtl_eval(
+int ggml_mtl_graph_compute(
         struct ggml_mtl_context * ctx,
         struct ggml_cgraph * gf,
         const int * tokens,
@@ -336,7 +341,7 @@
     {
         struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");

-        id<MTLBuffer> id_dst = llama_mtl_get_buffer(ctx, embd, &offs_src0);
+        id<MTLBuffer> id_dst = ggml_mtl_get_buffer(ctx, embd, &offs_src0);

         memcpy((char *) id_dst.contents + offs_src0, embd->data, ggml_nbytes(embd));
     }
@@ -385,9 +390,9 @@
         const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
         const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;

-        id<MTLBuffer> id_src0 = src0 ? llama_mtl_get_buffer(ctx, src0, &offs_src0) : nil;
-        id<MTLBuffer> id_src1 = src1 ? llama_mtl_get_buffer(ctx, src1, &offs_src1) : nil;
-        id<MTLBuffer> id_dst = dst ? llama_mtl_get_buffer(ctx, dst, &offs_dst) : nil;
+        id<MTLBuffer> id_src0 = src0 ? ggml_mtl_get_buffer(ctx, src0, &offs_src0) : nil;
+        id<MTLBuffer> id_src1 = src1 ? ggml_mtl_get_buffer(ctx, src1, &offs_src1) : nil;
+        id<MTLBuffer> id_dst = dst ? ggml_mtl_get_buffer(ctx, dst, &offs_dst) : nil;

         //mtl_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
         //if (src0) {
@@ -775,7 +780,7 @@

     struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1];

-    id<MTLBuffer> id_src = llama_mtl_get_buffer(ctx, out, &offs_src0);
+    id<MTLBuffer> id_src = ggml_mtl_get_buffer(ctx, out, &offs_src0);
     id<MTLBuffer> id_dst = ctx->out;

     id<MTLBlitCommandEncoder> encoder_blit = [command_buffer blitCommandEncoder];
@@ -817,53 +822,5 @@
     mtl_printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
 #endif

-    //{
-    //    struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check");
-    //    if (t->type == GGML_TYPE_F32) {
-    //        const const float * data = (float *) ctx->out.contents;
-    //        printf("data: ");
-    //        for (int i = 0; i < (int) t->ne[0]; i++) {
-    //            printf("%f ", data[i]);
-    //        }
-    //        printf("\n");
-    //        double sum = 0.0;
-    //        for (int i = 0; i < ggml_nelements(t); i++) {
-    //            double cur = data[i];
-    //            if (isinf(cur)) continue;
-    //            sum += cur;
-    //        }
-    //        printf("sum: %f\n", sum);
-    //    } else if (t->type == GGML_TYPE_F16) {
-    //        ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents;
-    //        printf("data: ");
-    //        for (int i = 0; i < (int) t->ne[0]; i++) {
-    //            printf("%f ", ggml_fp16_to_fp32(data[i]));
-    //        }
-    //        printf("\n");
-    //        double sum = 0.0;
-    //        printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
-    //        for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
-    //            for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
-    //                for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
-    //                    for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
-    //                        const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
-    //                        const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
-    //                        const float curf = ggml_fp16_to_fp32(cur);
-    //                        if (isinf(curf)) continue;
-    //                        sum += curf;
-    //                    }
-    //                }
-    //            }
-    //        }
-    //        printf("sum: %f\n", sum);
-    //    } else {
-    //        GGML_ASSERT(false && "not implemented");
-    //    }
-    //}
-
     return 0;
 }
-
-float * llama_mtl_get_logits(struct ggml_mtl_context * ctx) {
-    return ctx->logits;
-}
diff --git a/examples/mtl/mtl.metal b/ggml-mtl.metal
similarity index 100%
rename from examples/mtl/mtl.metal
rename to ggml-mtl.metal
diff --git a/ggml.h b/ggml.h
index 7f821cf32..2ea87ce9a 100644
--- a/ggml.h
+++ b/ggml.h
@@ -451,7 +451,7 @@ extern "C" {

     // main

     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
-    GGML_API void ggml_free(struct ggml_context * ctx);
+    GGML_API void ggml_free(struct ggml_context * ctx);

     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
diff --git a/llama.cpp b/llama.cpp
index 24b9d633b..5cd39b612 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9,9 +9,6 @@
 #include "llama-util.h"
 #include "llama.h"

-// METAL
-#include "examples/mtl/mtl.h"
-
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
@@ -19,6 +16,10 @@
 #include "ggml-opencl.h"
 #endif

+#ifdef GGML_USE_METAL
+#include "ggml-mtl.h"
+#endif
+
 #include
 #include
 #include
@@ -241,8 +242,9 @@ struct llama_context {
     llama_ctx_buffer buf_compute;
     llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

-    // METAL
+#ifdef GGML_USE_METAL
     ggml_mtl_context * mtl_ctx = NULL;
+#endif

     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -842,7 +844,6 @@ struct llama_context_params llama_context_default_params() {
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
         /*.embedding =*/ false,
-        /*.cgraph =*/ false,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
     };
@@ -1442,12 +1443,15 @@ static bool llama_eval_internal(
     // run the computation
     ggml_build_forward_expand(&gf, inpL);

-    // METAL
+#ifdef GGML_USE_METAL
     if (lctx.mtl_ctx) {
-        llama_mtl_eval(lctx.mtl_ctx, &gf, tokens, n_tokens, n_past);
+        ggml_mtl_graph_compute(lctx.mtl_ctx, &gf, tokens, n_tokens, n_past);
     } else {
-        ggml_graph_compute (ctx0, &gf);
+        ggml_graph_compute(ctx0, &gf);
     }
+#else
+    ggml_graph_compute(ctx0, &gf);
+#endif

     if (cgraph_fname) {
         // TODO: tmp add the vocabulary as a leaf to the computation graph, until better approach is found
@@ -2376,11 +2380,10 @@ struct llama_context * llama_init_from_file(
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }

-    // METAL
-    if (params.cgraph) {
+#ifdef GGML_USE_METAL
+    if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        //ctx->mtl_ctx = llama_mtl_init(ctx_data, ctx_eval, &gf);
-        ctx->mtl_ctx = llama_mtl_init(
+        ctx->mtl_ctx = ggml_mtl_init(
                 ggml_get_mem_buffer(ctx->model.ctx),
                 ggml_get_mem_size (ctx->model.ctx),
                 ctx->buf_compute.addr,
@@ -2389,6 +2392,7 @@ struct llama_context * llama_init_from_file(
                 ctx->model.kv_self.buf.size,
                 32*ctx->model.hparams.n_vocab*sizeof(float));
     }
+#endif

     return ctx;
 }
diff --git a/llama.h b/llama.h
index faaca2637..a650ddf45 100644
--- a/llama.h
+++ b/llama.h
@@ -31,7 +31,7 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1

-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
@@ -75,7 +75,6 @@ extern "C" {
         bool use_mmap; // use mmap if possible
         bool use_mlock; // force system to keep model in RAM
         bool embedding; // embedding mode only
-        bool cgraph; // try to load computation graph from "llama.ggml" (METAL)

         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;