diff --git a/ggml-metal.h b/ggml-metal.h
index a726ddd1c..6d99d7e5a 100644
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -35,7 +35,7 @@ extern "C" {
 #endif
 
 // GG: maybe return ptr and avoid the "ggml.h" include
-struct ggml_backend ggml_backend_metal_init();
+struct ggml_backend ggml_backend_metal_init(struct ggml_backend * backend_cpu);
 
 //struct ggml_metal_context;
 //
diff --git a/ggml-metal.m b/ggml-metal.m
index d7ff833a4..6d610e678 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -993,30 +993,59 @@ void ggml_metal_graph_compute(
     }
 }
 
+static const char * ggml_backend_metal_name(ggml_backend_context_t ctx) {
+    return "Metal";
+
+    UNUSED(ctx);
+}
+
+static void ggml_backend_metal_graph_compute(ggml_backend_context_t ctx, struct ggml_cgraph * cgraph) {
+    struct ggml_metal_context * ctx_metal = (struct ggml_metal_context *) ctx;
+
+    ggml_metal_graph_compute(ctx_metal, cgraph);
+}
+
 static struct ggml_backend_interface metal_backend_interface = {
-    /* .get_name           = */ //ggml_backend_metal_name,
-    /* .free_context       = */ //ggml_backend_metal_free_context,
-    /* .alloc_buffer       = */ //ggml_backend_metal_alloc_buffer,
-    /* .free_buffer        = */ //ggml_backend_metal_free_buffer,
-    /* .reset_buffer       = */ //ggml_backend_metal_reset_buffer,
-    /* .alloc_tensor       = */ //ggml_backend_metal_alloc_tensor,
-    /* .set_tensor_async   = */ //ggml_backend_metal_set_tensor_async,
-    /* .get_tensor_async   = */ //ggml_backend_metal_get_tensor_async,
-    /* .synchronize        = */ //ggml_backend_metal_synchronize,
-    /* .cpy_tensor_from    = */ //nullptr,
-    /* .cpy_tensor_to      = */ //nullptr,
-    /* .graph_plan_create  = */ //ggml_backend_metal_graph_plan_create,
-    /* .graph_plan_free    = */ //ggml_backend_metal_graph_plan_free,
-    /* .graph_plan_compute = */ //ggml_backend_metal_graph_plan_compute,
-    /* .graph_compute      = */ //ggml_backend_metal_graph_compute
+    /* .get_name           = */ ggml_backend_metal_name,
+    /* .free_context       = */ NULL, //ggml_backend_metal_free_context,
+    /* .alloc_buffer       = */ NULL, //ggml_backend_metal_alloc_buffer,
+    /* .free_buffer        = */ NULL, //ggml_backend_metal_free_buffer,
+    /* .reset_buffer       = */ NULL, //ggml_backend_metal_reset_buffer,
+    /* .alloc_tensor       = */ NULL, //ggml_backend_metal_alloc_tensor,
+    /* .set_tensor_async   = */ NULL, //ggml_backend_metal_set_tensor_async,
+    /* .get_tensor_async   = */ NULL, //ggml_backend_metal_get_tensor_async,
+    /* .synchronize        = */ NULL, //ggml_backend_metal_synchronize,
+    /* .cpy_tensor_from    = */ NULL, //nullptr,
+    /* .cpy_tensor_to      = */ NULL, //nullptr,
+    /* .graph_plan_create  = */ NULL, //ggml_backend_metal_graph_plan_create,
+    /* .graph_plan_free    = */ NULL, //ggml_backend_metal_graph_plan_free,
+    /* .graph_plan_compute = */ NULL, //ggml_backend_metal_graph_plan_compute,
+    /* .graph_compute      = */ ggml_backend_metal_graph_compute,
 };
 
-struct ggml_backend ggml_backend_metal_init(void) {
+struct ggml_backend ggml_backend_metal_init(struct ggml_backend * backend_cpu) {
     struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
 
-    struct ggml_backend metal_backend = {
-        /* .interface = */ &metal_backend_interface,
-        /* .context   = */ ctx
+    struct ggml_backend backend_metal = {
+        /* .interface     = */ &metal_backend_interface,
+        /* .context       = */ ctx,
+        /* .is_ram_shared = */ true,
     };
-    return metal_backend;
+
+    // reuses CPU calls for now
+    backend_metal.interface->free_context       = backend_cpu->interface->free_context;
+    backend_metal.interface->alloc_buffer       = backend_cpu->interface->alloc_buffer;
+    backend_metal.interface->free_buffer        = backend_cpu->interface->free_buffer;
+    backend_metal.interface->reset_buffer       = backend_cpu->interface->reset_buffer;
+    backend_metal.interface->alloc_tensor       = backend_cpu->interface->alloc_tensor;
+    backend_metal.interface->set_tensor_async   = backend_cpu->interface->set_tensor_async;
+    backend_metal.interface->get_tensor_async   = backend_cpu->interface->get_tensor_async;
+    backend_metal.interface->synchronize        = backend_cpu->interface->synchronize;
+    backend_metal.interface->cpy_tensor_from    = backend_cpu->interface->cpy_tensor_from;
+    backend_metal.interface->cpy_tensor_to      = backend_cpu->interface->cpy_tensor_to;
+    backend_metal.interface->graph_plan_create  = backend_cpu->interface->graph_plan_create;
+    backend_metal.interface->graph_plan_free    = backend_cpu->interface->graph_plan_free;
+    backend_metal.interface->graph_plan_compute = backend_cpu->interface->graph_plan_compute;
+
+    return backend_metal;
 }
diff --git a/llama.cpp b/llama.cpp
index 867b3e59f..5039f14e9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -968,7 +968,7 @@
 #endif
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
-        model.backend_metal = ggml_backend_metal_init();
+        model.backend_metal = ggml_backend_metal_init(backend_cpu);
         backend_gpu = &model.backend_metal;
     }
 #endif
@@ -1008,17 +1008,20 @@
     // TODO: generalize support for mmap
     size_t mmap_size = 0;
     if (ml->use_mmap) {
-        mmap_size = ctx_sizes[backend_cpu];
-        ctx_sizes[backend_cpu] = 0;
+        for (auto & it : ctx_sizes) {
+            if (it.first->is_ram_shared) {
+                mmap_size += it.second;
+                ctx_sizes[it.first] = 0;
+            }
+        }
     }
 
     fprintf(stderr, "%s: ggml ctx sizes:\n", __func__);
     for (const auto & it : ctx_sizes) {
-        fprintf(stderr, "%8s = %7.2f MB", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0);
-        if (it.first->is_ram_shared && ml->use_mmap) {
-            fprintf(stderr, " + %7.2f MB (mmap)", mmap_size / 1024.0 / 1024.0);
-        }
-        fprintf(stderr, "\n");
+        fprintf(stderr, "%8s = %7.2f MB\n", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0);
+    }
+    if (mmap_size > 0) {
+        fprintf(stderr, "%8s = %7.2f MB\n", "mmap", mmap_size / 1024.0 / 1024.0);
     }
 
     // create the buffers and contexts
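
Note: the llama.cpp hunk changes the mmap accounting so that every backend flagged as is_ram_shared (not just the CPU backend) contributes its context size to a single mmap total and is then excluded from per-backend allocation. The following is a minimal, self-contained C++ sketch of that sizing logic only, under simplified assumptions: backend_t and the hard-coded names/sizes are illustrative stand-ins, not the real ggml_backend type or actual model figures.

#include <cstddef>
#include <cstdio>
#include <map>
#include <string>

// simplified stand-in for ggml_backend: only the fields this sketch needs
struct backend_t {
    std::string name;
    bool        is_ram_shared; // backend reads host RAM directly (e.g. CPU, Metal on Apple silicon)
};

int main() {
    backend_t cpu   = { "CPU",   true  };
    backend_t metal = { "Metal", true  };
    backend_t cuda  = { "CUDA",  false };

    // per-backend ggml context sizes in bytes (made-up numbers)
    std::map<backend_t *, std::size_t> ctx_sizes = {
        { &cpu,   (std::size_t)  512 * 1024 * 1024 },
        { &metal, (std::size_t) 3200 * 1024 * 1024 },
        { &cuda,  (std::size_t) 1024 * 1024 * 1024 },
    };

    const bool use_mmap = true;

    // fold every RAM-shared backend into one mmap total and zero its
    // per-backend allocation, mirroring the new loop in the patch
    std::size_t mmap_size = 0;
    if (use_mmap) {
        for (auto & it : ctx_sizes) {
            if (it.first->is_ram_shared) {
                mmap_size += it.second;
                it.second  = 0;
            }
        }
    }

    // same style of summary the patched loader prints
    fprintf(stderr, "ggml ctx sizes:\n");
    for (const auto & it : ctx_sizes) {
        fprintf(stderr, "%8s = %7.2f MB\n", it.first->name.c_str(), it.second / 1024.0 / 1024.0);
    }
    if (mmap_size > 0) {
        fprintf(stderr, "%8s = %7.2f MB\n", "mmap", mmap_size / 1024.0 / 1024.0);
    }

    return 0;
}

The ggml-metal.m change follows the same idea: marking the Metal backend as is_ram_shared and borrowing the CPU backend's buffer/tensor callbacks ("reuses CPU calls for now") presumably works because, with unified memory, data allocated and written through the CPU backend is directly visible to Metal, so the mmap'd weights only need to exist once.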