diff --git a/ggml-backend.c b/ggml-backend.c index 5dd61d32d..8e95247a3 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -94,6 +94,7 @@ struct ggml_backend_buffer * ggml_allocator_simple_init(void * data, size_t size *allocator = (struct ggml_backend_buffer){ /* .interface = */ ggml_allocator_simple_interface, /* .context = */ ctx, + /* .backend_size = */ 0, /* .backend_data = */ NULL, }; return allocator; @@ -192,6 +193,7 @@ static struct ggml_backend_buffer * ggml_backend_cpu_alloc_buffer(struct ggml_ba struct ggml_backend_buffer * buffer = ggml_allocator_simple_init(data, size, TENSOR_ALIGNMENT); buffer->interface.free_data = ggml_backend_cpu_free_buffer; + buffer->backend_size = size; buffer->backend_data = data; return buffer; diff --git a/ggml-backend.h b/ggml-backend.h index f29b55591..37a6addb4 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -27,6 +27,7 @@ extern "C" { struct ggml_backend_buffer { struct ggml_backend_buffer_interface interface; ggml_buffer_context_t context; + size_t backend_size; void * backend_data; }; diff --git a/ggml-metal.h b/ggml-metal.h index 89d616bb5..efde14544 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -34,9 +34,17 @@ extern "C" { #endif -// GG: maybe return ptr and avoid the "ggml.h" include struct ggml_backend * ggml_backend_metal_init(struct ggml_backend * backend_cpu); +// TODO: temporary - move to backend interface +bool ggml_backend_metal_map_buffer( + struct ggml_backend * backend, + const char * name, + void * data, + size_t size, + size_t max_size); + + //struct ggml_metal_context; // //// number of command buffers to use diff --git a/ggml-metal.m b/ggml-metal.m index 00a75777e..8c0771b70 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -242,12 +242,13 @@ static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru return nil; } +// TODO: rename to ggml_metal_map_buffer bool ggml_metal_add_buffer( struct ggml_metal_context * ctx, - const char * name, - void * data, - size_t size, - size_t max_size) { 
+ const char * name, + void * data, + size_t size, + size_t max_size) { if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { fprintf(stderr, "%s: too many buffers\n", __func__); return false; @@ -993,38 +994,42 @@ void ggml_metal_graph_compute( } } -static const char * ggml_backend_metal_name(ggml_backend_context_t ctx) { +bool ggml_backend_metal_map_buffer( + struct ggml_backend * backend, + const char * name, + void * data, + size_t size, + size_t max_size) { + return ggml_metal_add_buffer(backend->context, name, data, size, max_size); +} + +static const char * ggml_backend_metal_name(struct ggml_backend * ctx) { return "Metal"; UNUSED(ctx); } -static void ggml_backend_metal_graph_compute(ggml_backend_context_t ctx, struct ggml_cgraph * cgraph) { - struct ggml_metal_context * ctx_metal = (struct ggml_metal_context *) ctx; - - ggml_metal_graph_compute(ctx_metal, cgraph); +static void ggml_backend_metal_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { + ggml_metal_graph_compute(backend->context, cgraph); } static struct ggml_backend_interface metal_backend_interface = { /* .get_name = */ ggml_backend_metal_name, - /* .free_context = */ NULL, //ggml_backend_metal_free_context, - /* .alloc_buffer = */ NULL, //ggml_backend_metal_alloc_buffer, - /* .free_buffer = */ NULL, //ggml_backend_metal_free_buffer, - /* .reset_buffer = */ NULL, //ggml_backend_metal_reset_buffer, - /* .alloc_tensor = */ NULL, //ggml_backend_metal_alloc_tensor, - /* .set_tensor_async = */ NULL, //ggml_backend_metal_set_tensor_async, - /* .get_tensor_async = */ NULL, //ggml_backend_metal_get_tensor_async, - /* .synchronize = */ NULL, //ggml_backend_metal_synchronize, - /* .cpy_tensor_from = */ NULL, //nullptr, - /* .cpy_tensor_to = */ NULL, //nullptr, - /* .graph_plan_create = */ NULL, //ggml_backend_metal_graph_plan_create, - /* .graph_plan_free = */ NULL, //ggml_backend_metal_graph_plan_free, - /* .graph_plan_compute = */ NULL, //ggml_backend_metal_graph_plan_compute, + /* 
.free = */ NULL, //ggml_backend_metal_free, + /* .alloc_buffer = */ NULL, //ggml_backend_metal_alloc_buffer, + /* .set_tensor_async = */ NULL, //ggml_backend_metal_set_tensor_async, + /* .get_tensor_async = */ NULL, //ggml_backend_metal_get_tensor_async, + /* .synchronize = */ NULL, //ggml_backend_metal_synchronize, + /* .cpy_tensor_from = */ NULL, //nullptr, + /* .cpy_tensor_to = */ NULL, //nullptr, + /* .graph_plan_create = */ NULL, //ggml_backend_metal_graph_plan_create, + /* .graph_plan_free = */ NULL, //ggml_backend_metal_graph_plan_free, + /* .graph_plan_compute = */ NULL, //ggml_backend_metal_graph_plan_compute, /* .graph_compute = */ ggml_backend_metal_graph_compute, }; struct ggml_backend * ggml_backend_metal_init(struct ggml_backend * backend_cpu) { - struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + struct ggml_metal_context * ctx = ggml_metal_init(8); struct ggml_backend * backend_metal = malloc(sizeof(struct ggml_backend)); *backend_metal = (struct ggml_backend){ diff --git a/llama.cpp b/llama.cpp index e531d9a64..3bbe73894 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2817,6 +2817,44 @@ struct llama_context * llama_new_context_with_model( } } +#ifdef GGML_USE_METAL + if (params.n_gpu_layers > 0) { + void * data_ptr = NULL; + size_t data_size = 0; + + if (params.use_mmap) { + data_ptr = ctx->model.mapping->addr; + data_size = ctx->model.mapping->size; + } else { + data_ptr = ggml_get_mem_buffer(ctx->model.ctx_metal); + data_size = ggml_get_mem_size (ctx->model.ctx_metal); + } + + const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx_metal); + + printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + +#define LLAMA_METAL_CHECK_BUF(result) \ + if (!(result)) { \ + fprintf(stderr, "%s: failed to add buffer\n", __func__); \ + llama_free(ctx); \ + return NULL; \ + } + + LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "data", data_ptr, data_size, max_size)); + + struct 
ggml_backend_buffer * buf_compute = ctx->buf_compute_metal->backend_buffer; + struct ggml_backend_buffer * buf_kv = ctx->kv_self.buf->backend_buffer; + + LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "eval", buf_compute->backend_data, buf_compute->backend_size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "kv", buf_kv->backend_data, buf_kv->backend_size, 0)); + + //LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0)); + //LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); +#undef LLAMA_METAL_CHECK_BUF + } +#endif + fprintf(stderr, "%s: layer backends: ", __func__); fprintf(stderr, "input: %s, ", ggml_backend_name(ctx->model.backend_inp));