mtl : preparing for merge

Georgi Gerganov 2023-06-04 09:27:27 +03:00
parent 4df2ef3161
commit 18e482a89c
13 changed files with 94 additions and 144 deletions

.gitignore

@@ -17,6 +17,7 @@ build-release/
 build-static/
 build-cublas/
 build-opencl/
+build-mtl/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/

CMakeLists.txt

@@ -71,6 +71,7 @@ option(LLAMA_CUBLAS "llama: use cuBLAS"
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
+option(LLAMA_METAL "llama: use Metal" OFF)
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -183,7 +184,7 @@ if (LLAMA_CUBLAS)
     enable_language(CUDA)
-    set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
+    set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
     add_compile_definitions(GGML_USE_CUBLAS)
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
@@ -200,12 +201,37 @@ if (LLAMA_CUBLAS)
     endif()
 endif()

+if (LLAMA_METAL)
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+    find_library(METAL_FRAMEWORK Metal REQUIRED)
+    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
+
+    set(GGML_SOURCES_METAL ggml-mtl.m ggml-mtl.h)
+
+    add_compile_definitions(GGML_USE_METAL)
+    add_compile_definitions(GGML_METAL_NDEBUG)
+
+    # get full path to the file
+    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
+
+    # copy ggml-mtl.metal to bin directory
+    configure_file(ggml-mtl.metal bin/ggml-mtl.metal COPYONLY)
+
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
+        ${FOUNDATION_LIBRARY}
+        ${METAL_FRAMEWORK}
+        ${METALKIT_FRAMEWORK}
+        ${METALPERFORMANCE_FRAMEWORK}
+        )
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
         message(STATUS "CLBlast found")
-        set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
+        set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)
         add_compile_definitions(GGML_USE_CLBLAST)
@@ -370,8 +396,10 @@ endif()
 add_library(ggml OBJECT
             ggml.c
             ggml.h
-            ${GGML_CUDA_SOURCES}
-            ${GGML_OPENCL_SOURCES})
+            ${GGML_SOURCES_CUDA}
+            ${GGML_SOURCES_OPENCL}
+            ${GGML_SOURCES_METAL}
+            )

 target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
@@ -385,8 +413,6 @@ add_library(llama
             llama.cpp
             llama.h
             llama-util.h
-            examples/mtl/mtl.h # TODO: METAL TMP
-            examples/mtl/mtl.m # TODO: METAL TMP
             )

 target_include_directories(llama PUBLIC .)
@@ -394,19 +420,14 @@ target_compile_features(llama PUBLIC cxx_std_11) # don't bump
 target_link_libraries(llama PRIVATE
     ggml
     ${LLAMA_EXTRA_LIBS}
-    ${FOUNDATION_LIBRARY} # TODO: METAL TMP
-    ${METAL_FRAMEWORK} # TODO: METAL TMP
-    ${METALKIT_FRAMEWORK} # TODO: METAL TMP
-    ${METALPERFORMANCE_FRAMEWORK} # TODO: METAL TMP
     )

-target_compile_definitions(llama PRIVATE LLAMA_MTL_NDEBUG) # TODO: METAL TMP
-
 if (BUILD_SHARED_LIBS)
     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
 endif()

-if (GGML_CUDA_SOURCES)
+if (GGML_SOURCES_CUDA)
     message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
     set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
     set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
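A side note on the two compile definitions introduced above: GGML_USE_METAL is what the llama.cpp changes further down key their #ifdef blocks on, while GGML_METAL_NDEBUG silences the mtl_printf logging macro defined in ggml-mtl.m. A minimal sketch of that pattern (the main() here is illustrative only, not part of the commit):

    // Mirrors the mtl_printf convention from ggml-mtl.m: with GGML_METAL_NDEBUG
    // defined, the logging calls compile to nothing.
    #include <cstdio>

    #ifdef GGML_METAL_NDEBUG
    #define mtl_printf(...)
    #else
    #define mtl_printf(...) fprintf(stderr, __VA_ARGS__)
    #endif

    int main() {
        mtl_printf("%s: Metal debug logging is enabled\n", __func__);
        return 0;
    }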

examples/CMakeLists.txt

@@ -37,8 +37,10 @@ else()
     add_subdirectory(save-load-state)
     add_subdirectory(benchmark)
     add_subdirectory(baby-llama)
-    add_subdirectory(mtl)
-    if(LLAMA_BUILD_SERVER)
+    if (LLAMA_METAL)
+        add_subdirectory(mtl)
+    endif()
+    if (LLAMA_BUILD_SERVER)
         add_subdirectory(server)
     endif()
 endif()

examples/common.cpp

@@ -301,8 +301,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.mem_test = true;
         } else if (arg == "--export") {
             params.export_cgraph = true;
-        } else if (arg == "--import") {
-            params.import_cgraph = true;
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -443,7 +441,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif
     fprintf(stderr, " --mtest compute maximum memory usage\n");
     fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
-    fprintf(stderr, " --import import a computation graph from 'llama.ggml'\n");
     fprintf(stderr, " --verbose-prompt print prompt before generation\n");
     fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
     fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
@@ -493,7 +490,6 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     lparams.use_mlock = params.use_mlock;
     lparams.logits_all = params.perplexity;
     lparams.embedding = params.embedding;
-    lparams.cgraph = params.import_cgraph;

     llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);

examples/common.h

@@ -72,7 +72,6 @@ struct gpt_params {
     bool use_mlock = false; // use mlock to keep model in memory
     bool mem_test = false; // compute maximum memory usage
     bool export_cgraph = false; // export the computation graph
-    bool import_cgraph = false; // import a computation graph
     bool verbose_prompt = false; // print prompt tokens before generation
 };

examples/mtl/CMakeLists.txt

@@ -1,33 +1,6 @@
 if (APPLE)
-    #
-    # mtl
-
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK Metal REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
-    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
-
     set(TEST_TARGET mtl)
-    add_executable(${TEST_TARGET} mtl.cpp mtl.h mtl.m)
-    target_link_libraries(${TEST_TARGET} PRIVATE
-        ggml
-        ${FOUNDATION_LIBRARY}
-        ${METAL_FRAMEWORK}
-        ${METALKIT_FRAMEWORK}
-        ${METALPERFORMANCE_FRAMEWORK}
-        )
-
-    # TODO: temporary until the kernels are ready
-    # custom command to build mtl.metal into a library
-    # depends on the mtl.metal file
-    add_custom_target(mtl.metallib-tmp ALL DEPENDS ${CMAKE_BINARY_DIR}/mtl.metallib)
-    add_custom_command(
-        OUTPUT ${CMAKE_BINARY_DIR}/mtl.metallib
-        COMMAND xcrun -sdk macosx metal -c ${CMAKE_CURRENT_SOURCE_DIR}/mtl.metal -o ${CMAKE_BINARY_DIR}/mtl.air
-        COMMAND xcrun -sdk macosx metallib ${CMAKE_BINARY_DIR}/mtl.air -o ${CMAKE_BINARY_DIR}/mtl.metallib
-        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/mtl.metal
-        COMMENT "Building mtl.metallib"
-        )
+    add_executable(${TEST_TARGET} mtl.cpp)
+    target_link_libraries(${TEST_TARGET} PRIVATE ggml)
 endif()

examples/mtl/mtl.cpp

@@ -1,5 +1,5 @@
 #include "ggml.h"
-#include "mtl.h"
+#include "ggml-mtl.h"

 #include <cstdio>
 #include <cstring>
@@ -51,7 +51,7 @@ int main(int argc, char ** argv) {
     }

     // this allocates all Metal resources and memory buffers
-    auto * ctx_mtl = llama_mtl_init(
+    auto * ctx_mtl = ggml_mtl_init(
             ggml_get_mem_buffer(ctx_data),
             ggml_get_mem_size (ctx_data),
             ggml_get_mem_buffer(ctx_eval),
@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
     const std::vector<int> tmp(n_batch, 1); // BOS

     // warmup
-    llama_mtl_eval(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
+    ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);

     const int n_iter = 16;
@@ -75,7 +75,7 @@ int main(int argc, char ** argv) {
     // the actual inference happens here
     for (int i = 0; i < n_iter; ++i) {
-        llama_mtl_eval(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
+        ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
     }

     const int64_t t1 = ggml_time_us();
@@ -83,7 +83,7 @@ int main(int argc, char ** argv) {
         printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter);
     }

-    llama_mtl_free(ctx_mtl);
+    ggml_mtl_free(ctx_mtl);

     ggml_free(ctx_data);
     ggml_free(ctx_eval);

ggml-mtl.h (renamed from examples/mtl/mtl.h)

@@ -11,7 +11,7 @@ extern "C" {
 struct ggml_mtl_context;

-struct ggml_mtl_context * llama_mtl_init(
+struct ggml_mtl_context * ggml_mtl_init(
         void * data_buf,
         size_t data_size,
         void * eval_buf,
@@ -20,18 +20,16 @@ struct ggml_mtl_context * llama_mtl_init(
         size_t cach_size,
         size_t outp_size);

-void llama_mtl_free(struct ggml_mtl_context * ctx);
+void ggml_mtl_free(struct ggml_mtl_context * ctx);

 // return 0 on success
-int llama_mtl_eval(
+int ggml_mtl_graph_compute(
         struct ggml_mtl_context * ctx,
         struct ggml_cgraph * gf,
         const int * tokens,
         int n_tokens,
         int n_past);

-float * llama_mtl_get_logits(struct ggml_mtl_context * ctx);
-
 #ifdef __cplusplus
 }
 #endif
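For orientation, the renamed interface above is exactly what examples/mtl/mtl.cpp exercises. A minimal sketch of a caller, assuming a ggml_mtl_context created earlier via ggml_mtl_init and an exported graph gf; the helper name mtl_warmup is hypothetical, not part of the commit:

    #include "ggml.h"
    #include "ggml-mtl.h"

    #include <cstdio>
    #include <vector>

    // Run the exported graph once on the Metal context, mirroring the warmup
    // call in examples/mtl/mtl.cpp; returns 0 on success like the C API.
    static int mtl_warmup(struct ggml_mtl_context * ctx_mtl, struct ggml_cgraph * gf,
                          int n_batch, int n_past) {
        const std::vector<int> tmp(n_batch, 1); // BOS tokens, as in the example

        const int ret = ggml_mtl_graph_compute(ctx_mtl, gf, tmp.data(), (int) tmp.size(), n_past);
        if (ret != 0) {
            fprintf(stderr, "%s: ggml_mtl_graph_compute failed\n", __func__);
        }

        return ret;
    }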

ggml-mtl.m (renamed from examples/mtl/mtl.m)

@@ -1,4 +1,4 @@
-#import "mtl.h"
+#import "ggml-mtl.h"

 #import "ggml.h"
@@ -6,7 +6,7 @@
 #import <Metal/Metal.h>
 #import <MetalPerformanceShaders/MetalPerformanceShaders.h>

-#ifdef LLAMA_MTL_NDEBUG
+#ifdef GGML_METAL_NDEBUG
 #define mtl_printf(...)
 #else
 #define mtl_printf(...) fprintf(stderr, __VA_ARGS__)
@@ -85,9 +85,9 @@ struct ggml_mtl_context {
 // MSL code
 // TODO: move the contents here when ready
 //       for now it is easier to work in a separate file
-NSString * const msl_library_llama = @"see mtl.metal";
+NSString * const msl_library_source = @"see mtl.metal";

-struct ggml_mtl_context * llama_mtl_init(
+struct ggml_mtl_context * ggml_mtl_init(
         void * data_buf,
         size_t data_size,
         void * eval_buf,
@@ -122,7 +122,7 @@ struct ggml_mtl_context * llama_mtl_init(
     {
         NSError * error = nil;

-        ctx->library = [ctx->device newLibraryWithSource:msl_library_llama options:nil error:&error];
+        ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
         if (error) {
             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
             exit(1);
@@ -133,7 +133,10 @@ struct ggml_mtl_context * llama_mtl_init(
     {
         NSError * error = nil;

-        NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/mtl/mtl" ofType:@"metal"];
+        //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/mtl/mtl" ofType:@"metal"];
+        NSString * path = [[NSBundle mainBundle] pathForResource:@"ggml-mtl" ofType:@"metal"];
+        fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]);
+
         NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
         if (error) {
             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
@@ -220,7 +223,7 @@ struct ggml_mtl_context * llama_mtl_init(
     // TODO: how to use MTLStorageModeManaged?
     // TODO: see if we can avoid this copy somehow
     {
-        void * mem_buffer = data_buf;
+        const void * mem_buffer = data_buf;
         const size_t mem_size = data_size;

         //ctx->buffer_data = [ctx->device newBufferWithBytesNoCopy:mem_buffer length:mem_size options:MTLResourceStorageModeShared deallocator:nil];
@@ -261,18 +264,20 @@ struct ggml_mtl_context * llama_mtl_init(
     return ctx;
 }

-void llama_mtl_free(struct ggml_mtl_context * ctx) {
+void ggml_mtl_free(struct ggml_mtl_context * ctx) {
     fprintf(stderr, "%s: deallocating\n", __func__);

     free(ctx);
 }

 // get data / eval buffer + offset
-id<MTLBuffer> llama_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
+id<MTLBuffer> ggml_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
     const int64_t offs_data = (int64_t) t->data - (int64_t) ctx->data_buf;
     const int64_t offs_eval = (int64_t) t->data - (int64_t) ctx->eval_buf;
     const int64_t offs_cach = (int64_t) t->data - (int64_t) ctx->cach_buf;

+    //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
+
     //const size_t t_size = ggml_nbytes(t);

     id<MTLBuffer> result;
@@ -317,7 +322,7 @@ id<MTLBuffer> llama_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_te
     return result;
 }

-int llama_mtl_eval(
+int ggml_mtl_graph_compute(
         struct ggml_mtl_context * ctx,
         struct ggml_cgraph * gf,
         const int * tokens,
@@ -336,7 +341,7 @@ int llama_mtl_eval(
     {
         struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");

-        id<MTLBuffer> id_dst = llama_mtl_get_buffer(ctx, embd, &offs_src0);
+        id<MTLBuffer> id_dst = ggml_mtl_get_buffer(ctx, embd, &offs_src0);

         memcpy((char *) id_dst.contents + offs_src0, embd->data, ggml_nbytes(embd));
     }
@@ -385,9 +390,9 @@ int llama_mtl_eval(
         const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
         const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;

-        id<MTLBuffer> id_src0 = src0 ? llama_mtl_get_buffer(ctx, src0, &offs_src0) : nil;
-        id<MTLBuffer> id_src1 = src1 ? llama_mtl_get_buffer(ctx, src1, &offs_src1) : nil;
-        id<MTLBuffer> id_dst = dst ? llama_mtl_get_buffer(ctx, dst, &offs_dst) : nil;
+        id<MTLBuffer> id_src0 = src0 ? ggml_mtl_get_buffer(ctx, src0, &offs_src0) : nil;
+        id<MTLBuffer> id_src1 = src1 ? ggml_mtl_get_buffer(ctx, src1, &offs_src1) : nil;
+        id<MTLBuffer> id_dst = dst ? ggml_mtl_get_buffer(ctx, dst, &offs_dst) : nil;

         //mtl_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
         //if (src0) {
@@ -775,7 +780,7 @@ int llama_mtl_eval(
         struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1];

-        id<MTLBuffer> id_src = llama_mtl_get_buffer(ctx, out, &offs_src0);
+        id<MTLBuffer> id_src = ggml_mtl_get_buffer(ctx, out, &offs_src0);
         id<MTLBuffer> id_dst = ctx->out;

         id<MTLBlitCommandEncoder> encoder_blit = [command_buffer blitCommandEncoder];
@@ -817,53 +822,5 @@ int llama_mtl_eval(
     mtl_printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
 #endif

-    //{
-    //    struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check");
-    //    if (t->type == GGML_TYPE_F32) {
-    //        const const float * data = (float *) ctx->out.contents;
-    //        printf("data: ");
-    //        for (int i = 0; i < (int) t->ne[0]; i++) {
-    //            printf("%f ", data[i]);
-    //        }
-    //        printf("\n");
-    //        double sum = 0.0;
-    //        for (int i = 0; i < ggml_nelements(t); i++) {
-    //            double cur = data[i];
-    //            if (isinf(cur)) continue;
-    //            sum += cur;
-    //        }
-    //        printf("sum: %f\n", sum);
-    //    } else if (t->type == GGML_TYPE_F16) {
-    //        ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents;
-    //        printf("data: ");
-    //        for (int i = 0; i < (int) t->ne[0]; i++) {
-    //            printf("%f ", ggml_fp16_to_fp32(data[i]));
-    //        }
-    //        printf("\n");
-    //        double sum = 0.0;
-    //        printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
-    //        for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
-    //            for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
-    //                for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
-    //                    for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
-    //                        const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
-    //                        const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
-    //                        const float curf = ggml_fp16_to_fp32(cur);
-    //                        if (isinf(curf)) continue;
-    //                        sum += curf;
-    //                    }
-    //                }
-    //            }
-    //        }
-    //        printf("sum: %f\n", sum);
-    //    } else {
-    //        GGML_ASSERT(false && "not implemented");
-    //    }
-    //}
-
     return 0;
 }
-
-float * llama_mtl_get_logits(struct ggml_mtl_context * ctx) {
-    return ctx->logits;
-}

llama.cpp

@@ -9,9 +9,6 @@
 #include "llama-util.h"
 #include "llama.h"

-// METAL
-#include "examples/mtl/mtl.h"
-
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
@@ -19,6 +16,10 @@
 #include "ggml-opencl.h"
 #endif

+#ifdef GGML_USE_METAL
+#include "ggml-mtl.h"
+#endif
+
 #include <array>
 #include <ctime>
 #include <cinttypes>
@@ -241,8 +242,9 @@ struct llama_context {
     llama_ctx_buffer buf_compute;
     llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

-    // METAL
+#ifdef GGML_USE_METAL
     ggml_mtl_context * mtl_ctx = NULL;
+#endif

     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -842,7 +844,6 @@ struct llama_context_params llama_context_default_params() {
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
         /*.embedding =*/ false,
-        /*.cgraph =*/ false,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
     };
@@ -1442,12 +1443,15 @@ static bool llama_eval_internal(
     // run the computation
     ggml_build_forward_expand(&gf, inpL);

-    // METAL
+#ifdef GGML_USE_METAL
     if (lctx.mtl_ctx) {
-        llama_mtl_eval(lctx.mtl_ctx, &gf, tokens, n_tokens, n_past);
+        ggml_mtl_graph_compute(lctx.mtl_ctx, &gf, tokens, n_tokens, n_past);
     } else {
-        ggml_graph_compute (ctx0, &gf);
+        ggml_graph_compute(ctx0, &gf);
     }
+#else
+    ggml_graph_compute(ctx0, &gf);
+#endif

     if (cgraph_fname) {
         // TODO: tmp add the vocabulary as a leaf to the computation graph, until better approach is found
@@ -2376,11 +2380,10 @@ struct llama_context * llama_init_from_file(
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }

-    // METAL
-    if (params.cgraph) {
+#ifdef GGML_USE_METAL
+    if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        //ctx->mtl_ctx = llama_mtl_init(ctx_data, ctx_eval, &gf);
-        ctx->mtl_ctx = llama_mtl_init(
+        ctx->mtl_ctx = ggml_mtl_init(
                 ggml_get_mem_buffer(ctx->model.ctx),
                 ggml_get_mem_size (ctx->model.ctx),
                 ctx->buf_compute.addr,
@@ -2389,6 +2392,7 @@ struct llama_context * llama_init_from_file(
                 ctx->model.kv_self.buf.size,
                 32*ctx->model.hparams.n_vocab*sizeof(float));
     }
+#endif

     return ctx;
 }
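Worth noting from the llama.cpp hunks above: the temporary params.cgraph flag is gone, and in a GGML_USE_METAL build the Metal context is now created whenever n_gpu_layers is non-zero. A minimal sketch of what a caller sets, with a placeholder model path that is not part of the commit:

    #include "llama.h"

    int main() {
        llama_context_params params = llama_context_default_params();
        params.n_gpu_layers = 1; // any value > 0 triggers ggml_mtl_init inside llama_init_from_file

        // placeholder path for illustration
        llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
        if (ctx == NULL) {
            return 1;
        }

        llama_free(ctx);
        return 0;
    }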

llama.h

@@ -31,7 +31,7 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1

-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
@@ -75,7 +75,6 @@ extern "C" {
         bool use_mmap; // use mmap if possible
         bool use_mlock; // force system to keep model in RAM
         bool embedding; // embedding mode only
-        bool cgraph; // try to load computation graph from "llama.ggml" (METAL)

         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;