diff --git a/.gitignore b/.gitignore
index d231f3ff8..edcb6b144 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,6 +17,7 @@ build-release/
 build-static/
 build-cublas/
 build-opencl/
+build-mtl/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bc23c2c5b..62f1467aa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,13 +64,14 @@ if (NOT MSVC)
 endif()

 # 3rd party libs
-option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
-option(LLAMA_BLAS "llama: use BLAS" OFF)
+option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
+option(LLAMA_BLAS "llama: use BLAS" OFF)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
-set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
-set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
-option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
+option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
+set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
+set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
+option(LLAMA_METAL "llama: use Metal" OFF)

 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -183,7 +184,7 @@ if (LLAMA_CUBLAS)

         enable_language(CUDA)

-        set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
+        set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)

         add_compile_definitions(GGML_USE_CUBLAS)
         add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
@@ -200,12 +201,37 @@ if (LLAMA_CUBLAS)
     endif()
 endif()

+if (LLAMA_METAL)
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+    find_library(METAL_FRAMEWORK Metal REQUIRED)
+    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
+
+    set(GGML_SOURCES_METAL ggml-mtl.m ggml-mtl.h)
+
+    add_compile_definitions(GGML_USE_METAL)
+    add_compile_definitions(GGML_METAL_NDEBUG)
+
+    # get full path to the file
+    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
+
+    # copy ggml-mtl.metal to bin directory
+    configure_file(ggml-mtl.metal bin/ggml-mtl.metal COPYONLY)
+
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
+        ${FOUNDATION_LIBRARY}
+        ${METAL_FRAMEWORK}
+        ${METALKIT_FRAMEWORK}
+        ${METALPERFORMANCE_FRAMEWORK}
+        )
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
         message(STATUS "CLBlast found")

-        set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
+        set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)

         add_compile_definitions(GGML_USE_CLBLAST)
@@ -370,8 +396,10 @@ endif()
 add_library(ggml OBJECT
             ggml.c
             ggml.h
-            ${GGML_CUDA_SOURCES}
-            ${GGML_OPENCL_SOURCES})
+            ${GGML_SOURCES_CUDA}
+            ${GGML_SOURCES_OPENCL}
+            ${GGML_SOURCES_METAL}
+            )

 target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
@@ -385,8 +413,6 @@ add_library(llama
             llama.cpp
             llama.h
            llama-util.h
-            examples/mtl/mtl.h # TODO: METAL TMP
-            examples/mtl/mtl.m # TODO: METAL TMP
             )

 target_include_directories(llama PUBLIC .)
@@ -394,22 +420,17 @@ target_compile_features(llama PUBLIC cxx_std_11) # don't bump
 target_link_libraries(llama PRIVATE
     ggml
     ${LLAMA_EXTRA_LIBS}
-    ${FOUNDATION_LIBRARY} # TODO: METAL TMP
-    ${METAL_FRAMEWORK} # TODO: METAL TMP
-    ${METALKIT_FRAMEWORK} # TODO: METAL TMP
-    ${METALPERFORMANCE_FRAMEWORK} # TODO: METAL TMP
     )
-target_compile_definitions(llama PRIVATE LLAMA_MTL_NDEBUG) # TODO: METAL TMP

 if (BUILD_SHARED_LIBS)
     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
 endif()

-if (GGML_CUDA_SOURCES)
+if (GGML_SOURCES_CUDA)
     message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
-    set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
-    set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
+    set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
+    set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
     set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF)
 endif()
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 97a3ffd1b..e23bf1cb3 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -37,8 +37,10 @@ else()
     add_subdirectory(save-load-state)
     add_subdirectory(benchmark)
     add_subdirectory(baby-llama)
-    add_subdirectory(mtl)
-    if(LLAMA_BUILD_SERVER)
+    if (LLAMA_METAL)
+        add_subdirectory(mtl)
+    endif()
+    if (LLAMA_BUILD_SERVER)
         add_subdirectory(server)
     endif()
 endif()
diff --git a/examples/common.cpp b/examples/common.cpp
index 53e9200fa..b5810f28f 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -301,8 +301,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.mem_test = true;
         } else if (arg == "--export") {
             params.export_cgraph = true;
-        } else if (arg == "--import") {
-            params.import_cgraph = true;
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -443,7 +441,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif
     fprintf(stderr, " --mtest compute maximum memory usage\n");
     fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
-    fprintf(stderr, " --import import a computation graph from 'llama.ggml'\n");
     fprintf(stderr, " --verbose-prompt print prompt before generation\n");
     fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
     fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
@@ -493,7 +490,6 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     lparams.use_mlock = params.use_mlock;
     lparams.logits_all = params.perplexity;
     lparams.embedding = params.embedding;
-    lparams.cgraph = params.import_cgraph;

     llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
diff --git a/examples/common.h b/examples/common.h
index c7d4d6e0e..66bdeb5e9 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -72,7 +72,6 @@ struct gpt_params {
     bool use_mlock = false; // use mlock to keep model in memory
     bool mem_test = false; // compute maximum memory usage
     bool export_cgraph = false; // export the computation graph
-    bool import_cgraph = false; // import a computation graph
     bool verbose_prompt = false; // print prompt tokens before generation
 };
diff --git a/examples/mtl/CMakeLists.txt b/examples/mtl/CMakeLists.txt
index 1de83a1b6..0fe3a7197 100644
--- a/examples/mtl/CMakeLists.txt
+++ b/examples/mtl/CMakeLists.txt
@@ -1,33 +1,6 @@
 if (APPLE)
-    #
-    # mtl
-
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK Metal REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
-    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
-
     set(TEST_TARGET mtl)
-    add_executable(${TEST_TARGET} mtl.cpp mtl.h mtl.m)
-    target_link_libraries(${TEST_TARGET} PRIVATE
-        ggml
-        ${FOUNDATION_LIBRARY}
-        ${METAL_FRAMEWORK}
-        ${METALKIT_FRAMEWORK}
-        ${METALPERFORMANCE_FRAMEWORK}
-        )
-
-    # TODO: temporary until the kernels are ready
-    # custom command to build mtl.metal into a library
-    # depends on the mtl.metal file
-    add_custom_target(mtl.metallib-tmp ALL DEPENDS ${CMAKE_BINARY_DIR}/mtl.metallib)
-
-    add_custom_command(
-        OUTPUT ${CMAKE_BINARY_DIR}/mtl.metallib
-        COMMAND xcrun -sdk macosx metal -c ${CMAKE_CURRENT_SOURCE_DIR}/mtl.metal -o ${CMAKE_BINARY_DIR}/mtl.air
-        COMMAND xcrun -sdk macosx metallib ${CMAKE_BINARY_DIR}/mtl.air -o ${CMAKE_BINARY_DIR}/mtl.metallib
-        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/mtl.metal
-        COMMENT "Building mtl.metallib"
-        )
+    add_executable(${TEST_TARGET} mtl.cpp)
+    target_link_libraries(${TEST_TARGET} PRIVATE ggml)
 endif()
diff --git a/examples/mtl/mtl.cpp b/examples/mtl/mtl.cpp
index 7411ea932..382976667 100644
--- a/examples/mtl/mtl.cpp
+++ b/examples/mtl/mtl.cpp
@@ -1,5 +1,5 @@
 #include "ggml.h"
-#include "mtl.h"
+#include "ggml-mtl.h"

 #include
 #include
@@ -51,7 +51,7 @@ int main(int argc, char ** argv) {
     }

     // this allocates all Metal resources and memory buffers
-    auto * ctx_mtl = llama_mtl_init(
+    auto * ctx_mtl = ggml_mtl_init(
             ggml_get_mem_buffer(ctx_data),
             ggml_get_mem_size (ctx_data),
             ggml_get_mem_buffer(ctx_eval),
@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
     const std::vector<int> tmp(n_batch, 1); // BOS

     // warmup
-    llama_mtl_eval(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
+    ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);

     const int n_iter = 16;
@@ -75,7 +75,7 @@

     // the actual inference happens here
     for (int i = 0; i < n_iter; ++i) {
-        llama_mtl_eval(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
+        ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
     }

     const int64_t t1 = ggml_time_us();
@@ -83,7 +83,7 @@
         printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter);
     }

-    llama_mtl_free(ctx_mtl);
+    ggml_mtl_free(ctx_mtl);

     ggml_free(ctx_data);
     ggml_free(ctx_eval);
diff --git a/examples/mtl/mtl.h b/ggml-mtl.h
similarity index 77%
rename from examples/mtl/mtl.h
rename to ggml-mtl.h
index ff92a6a7b..15256b27d 100644
--- a/examples/mtl/mtl.h
+++ b/ggml-mtl.h
@@ -11,7 +11,7 @@ extern "C" {

 struct ggml_mtl_context;

-struct ggml_mtl_context * llama_mtl_init(
+struct ggml_mtl_context * ggml_mtl_init(
         void * data_buf,
         size_t data_size,
         void * eval_buf,
@@ -20,18 +20,16 @@ struct ggml_mtl_context * ggml_mtl_init(
         size_t cach_size,
         size_t outp_size);

-void llama_mtl_free(struct ggml_mtl_context * ctx);
+void ggml_mtl_free(struct ggml_mtl_context * ctx);

 // return 0 on success
-int llama_mtl_eval(
+int ggml_mtl_graph_compute(
         struct ggml_mtl_context * ctx,
         struct ggml_cgraph * gf,
         const int * tokens,
         int n_tokens,
         int n_past);

-float * llama_mtl_get_logits(struct ggml_mtl_context * ctx);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/examples/mtl/mtl.m b/ggml-mtl.m
similarity index 92%
rename from examples/mtl/mtl.m
rename to ggml-mtl.m
index b8fd1c144..ecbb1a188 100644
--- a/examples/mtl/mtl.m
+++ b/ggml-mtl.m
@@ -1,4 +1,4 @@
-#import "mtl.h"
+#import "ggml-mtl.h"

 #import "ggml.h"
@@ -6,7 +6,7 @@
 #import
 #import

-#ifdef LLAMA_MTL_NDEBUG
+#ifdef GGML_METAL_NDEBUG
 #define mtl_printf(...)
 #else
 #define mtl_printf(...) fprintf(stderr, __VA_ARGS__)
@@ -85,9 +85,9 @@ struct ggml_mtl_context {
 // MSL code
 // TODO: move the contents here when ready
 //       for now it is easier to work in a separate file
-NSString * const msl_library_llama = @"see mtl.metal";
+NSString * const msl_library_source = @"see mtl.metal";

-struct ggml_mtl_context * llama_mtl_init(
+struct ggml_mtl_context * ggml_mtl_init(
         void * data_buf,
         size_t data_size,
         void * eval_buf,
@@ -122,7 +122,7 @@
     {
         NSError * error = nil;

-        ctx->library = [ctx->device newLibraryWithSource:msl_library_llama options:nil error:&error];
+        ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
         if (error) {
             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
             exit(1);
@@ -133,7 +133,10 @@
     {
         NSError * error = nil;

-        NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/mtl/mtl" ofType:@"metal"];
+        //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/mtl/mtl" ofType:@"metal"];
+        NSString * path = [[NSBundle mainBundle] pathForResource:@"ggml-mtl" ofType:@"metal"];
+        fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]);
+
         NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
         if (error) {
             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
@@ -220,7 +223,7 @@
     // TODO: how to use MTLStorageModeManaged?
     // TODO: see if we can avoid this copy somehow
     {
-        void * mem_buffer = data_buf;
+        const void * mem_buffer = data_buf;
         const size_t mem_size = data_size;

         //ctx->buffer_data = [ctx->device newBufferWithBytesNoCopy:mem_buffer length:mem_size options:MTLResourceStorageModeShared deallocator:nil];
@@ -261,18 +264,20 @@
     return ctx;
 }

-void llama_mtl_free(struct ggml_mtl_context * ctx) {
+void ggml_mtl_free(struct ggml_mtl_context * ctx) {
     fprintf(stderr, "%s: deallocating\n", __func__);

     free(ctx);
 }

 // get data / eval buffer + offset
-id<MTLBuffer> llama_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
+id<MTLBuffer> ggml_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
     const int64_t offs_data = (int64_t) t->data - (int64_t) ctx->data_buf;
     const int64_t offs_eval = (int64_t) t->data - (int64_t) ctx->eval_buf;
     const int64_t offs_cach = (int64_t) t->data - (int64_t) ctx->cach_buf;

+    //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
+
     //const size_t t_size = ggml_nbytes(t);

     id<MTLBuffer> result;
@@ -317,7 +322,7 @@ id<MTLBuffer> llama_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_te
     return result;
 }

-int llama_mtl_eval(
+int ggml_mtl_graph_compute(
         struct ggml_mtl_context * ctx,
         struct ggml_cgraph * gf,
         const int * tokens,
@@ -336,7 +341,7 @@
     {
         struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");

-        id<MTLBuffer> id_dst = llama_mtl_get_buffer(ctx, embd, &offs_src0);
+        id<MTLBuffer> id_dst = ggml_mtl_get_buffer(ctx, embd, &offs_src0);

         memcpy((char *) id_dst.contents + offs_src0, embd->data, ggml_nbytes(embd));
     }
@@ -385,9 +390,9 @@
         const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
         const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;

-        id<MTLBuffer> id_src0 = src0 ? llama_mtl_get_buffer(ctx, src0, &offs_src0) : nil;
-        id<MTLBuffer> id_src1 = src1 ? llama_mtl_get_buffer(ctx, src1, &offs_src1) : nil;
-        id<MTLBuffer> id_dst = dst ? llama_mtl_get_buffer(ctx, dst, &offs_dst) : nil;
+        id<MTLBuffer> id_src0 = src0 ? ggml_mtl_get_buffer(ctx, src0, &offs_src0) : nil;
+        id<MTLBuffer> id_src1 = src1 ? ggml_mtl_get_buffer(ctx, src1, &offs_src1) : nil;
+        id<MTLBuffer> id_dst = dst ? ggml_mtl_get_buffer(ctx, dst, &offs_dst) : nil;

         //mtl_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
         //if (src0) {
@@ -775,7 +780,7 @@

     struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1];

-    id<MTLBuffer> id_src = llama_mtl_get_buffer(ctx, out, &offs_src0);
+    id<MTLBuffer> id_src = ggml_mtl_get_buffer(ctx, out, &offs_src0);
     id<MTLBuffer> id_dst = ctx->out;

     id<MTLBlitCommandEncoder> encoder_blit = [command_buffer blitCommandEncoder];
@@ -817,53 +822,5 @@
     mtl_printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
 #endif

-    //{
-    //    struct ggml_tensor * t = ggml_get_tensor(ctx->ctx_eval, "mtl-check");
-    //    if (t->type == GGML_TYPE_F32) {
-    //        const const float * data = (float *) ctx->out.contents;
-    //        printf("data: ");
-    //        for (int i = 0; i < (int) t->ne[0]; i++) {
-    //            printf("%f ", data[i]);
-    //        }
-    //        printf("\n");
-    //        double sum = 0.0;
-    //        for (int i = 0; i < ggml_nelements(t); i++) {
-    //            double cur = data[i];
-    //            if (isinf(cur)) continue;
-    //            sum += cur;
-    //        }
-    //        printf("sum: %f\n", sum);
-    //    } else if (t->type == GGML_TYPE_F16) {
-    //        ggml_fp16_t * data = (const ggml_fp16_t *) ctx->out.contents;
-    //        printf("data: ");
-    //        for (int i = 0; i < (int) t->ne[0]; i++) {
-    //            printf("%f ", ggml_fp16_to_fp32(data[i]));
-    //        }
-    //        printf("\n");
-    //        double sum = 0.0;
-    //        printf("nb: %lld %lld %lld %lld\n", t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
-    //        for (int64_t i3 = 0; i3 < t->ne[3]; ++i3) {
-    //            for (int64_t i2 = 0; i2 < t->ne[2]; ++i2) {
-    //                for (int64_t i1 = 0; i1 < t->ne[1]; ++i1) {
-    //                    for (int64_t i0 = 0; i0 < t->ne[0]; ++i0) {
-    //                        const size_t offs = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
-    //                        const ggml_fp16_t cur = *((ggml_fp16_t *)((char *) data + offs));
-    //                        const float curf = ggml_fp16_to_fp32(cur);
-    //                        if (isinf(curf)) continue;
-    //                        sum += curf;
-    //                    }
-    //                }
-    //            }
-    //        }
-    //        printf("sum: %f\n", sum);
-    //    } else {
-    //        GGML_ASSERT(false && "not implemented");
-    //    }
-    //}
-
     return 0;
 }
-
-float * llama_mtl_get_logits(struct ggml_mtl_context * ctx) {
-    return ctx->logits;
-}
diff --git a/examples/mtl/mtl.metal b/ggml-mtl.metal
similarity index 100%
rename from examples/mtl/mtl.metal
rename to ggml-mtl.metal
diff --git a/ggml.h b/ggml.h
index 7f821cf32..2ea87ce9a 100644
--- a/ggml.h
+++ b/ggml.h
@@ -451,7 +451,7 @@ extern "C" {

     // main

     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
-    GGML_API void ggml_free(struct ggml_context * ctx);
+    GGML_API void ggml_free(struct ggml_context * ctx);

     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
diff --git a/llama.cpp b/llama.cpp
index 24b9d633b..5cd39b612 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9,9 +9,6 @@
 #include "llama-util.h"
 #include "llama.h"

-// METAL
-#include "examples/mtl/mtl.h"
-
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
@@ -19,6 +16,10 @@
 #include "ggml-opencl.h"
 #endif

+#ifdef GGML_USE_METAL
+#include "ggml-mtl.h"
+#endif
+
 #include
 #include
 #include
@@ -241,8 +242,9 @@ struct llama_context {
     llama_ctx_buffer buf_compute;
     llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

-    // METAL
+#ifdef GGML_USE_METAL
     ggml_mtl_context * mtl_ctx = NULL;
+#endif

     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -842,7 +844,6 @@ struct llama_context_params llama_context_default_params() {
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
         /*.embedding =*/ false,
-        /*.cgraph =*/ false,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
     };
@@ -1442,12 +1443,15 @@ static bool llama_eval_internal(
     // run the computation
     ggml_build_forward_expand(&gf, inpL);

-    // METAL
+#ifdef GGML_USE_METAL
     if (lctx.mtl_ctx) {
-        llama_mtl_eval(lctx.mtl_ctx, &gf, tokens, n_tokens, n_past);
+        ggml_mtl_graph_compute(lctx.mtl_ctx, &gf, tokens, n_tokens, n_past);
     } else {
-        ggml_graph_compute (ctx0, &gf);
+        ggml_graph_compute(ctx0, &gf);
     }
+#else
+    ggml_graph_compute(ctx0, &gf);
+#endif

     if (cgraph_fname) {
         // TODO: tmp add the vocabulary as a leaf to the computation graph, until better approach is found
@@ -2376,11 +2380,10 @@ struct llama_context * llama_init_from_file(
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
     }

-    // METAL
-    if (params.cgraph) {
+#ifdef GGML_USE_METAL
+    if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        //ctx->mtl_ctx = llama_mtl_init(ctx_data, ctx_eval, &gf);
-        ctx->mtl_ctx = llama_mtl_init(
+        ctx->mtl_ctx = ggml_mtl_init(
                 ggml_get_mem_buffer(ctx->model.ctx),
                 ggml_get_mem_size (ctx->model.ctx),
                 ctx->buf_compute.addr,
@@ -2389,6 +2392,7 @@ struct llama_context * llama_init_from_file(
                 ctx->model.kv_self.buf.size,
                 32*ctx->model.hparams.n_vocab*sizeof(float));
     }
+#endif

     return ctx;
 }
diff --git a/llama.h b/llama.h
index faaca2637..a650ddf45 100644
--- a/llama.h
+++ b/llama.h
@@ -31,7 +31,7 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1

-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
@@ -75,7 +75,6 @@ extern "C" {
         bool use_mmap; // use mmap if possible
         bool use_mlock; // force system to keep model in RAM
         bool embedding; // embedding mode only
-        bool cgraph; // try to load computation graph from "llama.ggml" (METAL)

         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;