From 4adb6f1441a3e04f1bbf9c1a98485227d73e7aa6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 17 Jun 2023 20:45:31 +0300 Subject: [PATCH] metal : minimize view overlap to try to utilize device memory better --- Makefile | 2 +- ggml-metal.h | 5 ++++- ggml-metal.m | 18 ++++++++++-------- ggml.c | 24 ++++++++++++++++++++++-- ggml.h | 5 +++-- llama.cpp | 26 ++++++++++++++++---------- 6 files changed, 56 insertions(+), 24 deletions(-) diff --git a/Makefile b/Makefile index 72d6ad40c..e7c213f83 100644 --- a/Makefile +++ b/Makefile @@ -252,7 +252,7 @@ $(info ) ggml.o: ggml.c ggml.h ggml-cuda.h $(CC) $(CFLAGS) -c $< -o $@ -llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h +llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h $(CXX) $(CXXFLAGS) -c $< -o $@ common.o: examples/common.cpp examples/common.h diff --git a/ggml-metal.h b/ggml-metal.h index 033c4d86a..b9e50ac74 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -41,12 +41,15 @@ void ggml_metal_free(struct ggml_metal_context * ctx); // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute // - the mapping is used during computation to determine the arguments of the compute kernels // - you don't need to keep the host memory buffer allocated as it is never accessed by Metal +// - max_size specifies the maximum size of a tensor and is used to create shared views such +// that it is guaranteed that the tensor will fit in at least one of the views // bool ggml_metal_add_buffer( struct ggml_metal_context * ctx, const char * name, void * data, - size_t size); + size_t size, + size_t max_size); // set data from host memory into the device void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); diff --git a/ggml-metal.m b/ggml-metal.m index e476fba2b..a7e104dc7 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -240,14 +240,14 @@ bool ggml_metal_add_buffer( if (data) { // verify that the buffer does not overlap with any of the existing buffers - //for (int i = 0; i < ctx->n_buffers; ++i) { - // const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; + for (int i = 0; i < ctx->n_buffers; ++i) { + const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; - // if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { - // fprintf(stderr, "%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); - // return false; - // } - //} + if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { + fprintf(stderr, "%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); + return false; + } + } const size_t size_page = getpagesize(); @@ -273,7 +273,9 @@ bool ggml_metal_add_buffer( ++ctx->n_buffers; } else { - const size_t size_ovlp = (max_size + size_page - 1) / size_page * size_page; + // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into + // one of the views + const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case const size_t size_step = ctx->device.maxBufferLength - size_ovlp; const size_t size_view = ctx->device.maxBufferLength; diff --git a/ggml.c b/ggml.c index 0eda7f338..78c365354 100644 --- a/ggml.c +++ b/ggml.c @@ -4154,14 +4154,34 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) { ctx->no_alloc = no_alloc; } -void * ggml_get_mem_buffer(struct ggml_context * ctx) { +void * ggml_get_mem_buffer(const struct ggml_context * ctx) { return ctx->mem_buffer; } -size_t ggml_get_mem_size(struct ggml_context * ctx) { +size_t ggml_get_mem_size(const struct ggml_context * ctx) { return ctx->mem_size; } +size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) { + size_t max_size = 0; + + struct ggml_object * obj = ctx->objects_begin; + + while (obj != NULL) { + struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs); + + const size_t size = ggml_nbytes(tensor); + + if (max_size < size) { + max_size = size; + } + + obj = obj->next; + } + + return max_size; +} + // IMPORTANT: // when creating "opt" tensors, always save and load the scratch buffer // this is an error prone process, but it is necessary to support inplace diff --git a/ggml.h b/ggml.h index 9b0c846f8..1380c530f 100644 --- a/ggml.h +++ b/ggml.h @@ -500,8 +500,9 @@ extern "C" { GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch); GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); - GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx); - GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx); + GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx); + GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx); + GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx); GGML_API struct ggml_tensor * ggml_new_tensor( struct ggml_context * ctx, diff --git a/llama.cpp b/llama.cpp index a2916b3e8..c165d3239 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2696,16 +2696,21 @@ struct llama_context * llama_init_from_file( // this allocates all Metal resources and memory buffers ctx->ctx_metal = ggml_metal_init(); - void *data_ptr = NULL; + void * data_ptr = NULL; size_t data_size = 0; + if (params.use_mmap) { - data_ptr = ctx->model.mapping->addr; - data_size= ctx->model.mapping->size; + data_ptr = ctx->model.mapping->addr; + data_size = ctx->model.mapping->size; } else { - data_ptr = ggml_get_mem_buffer(ctx->model.ctx); - data_size= ggml_get_mem_size(ctx->model.ctx); + data_ptr = ggml_get_mem_buffer(ctx->model.ctx); + data_size = ggml_get_mem_size (ctx->model.ctx); } + const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); + + printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + #define LLAMA_METAL_CHECK_BUF(result) \ if (!(result)) { \ fprintf(stderr, "%s: failed to add buffer\n", __func__); \ @@ -2713,12 +2718,13 @@ struct llama_context * llama_init_from_file( return NULL; \ } - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0)); + + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); #undef LLAMA_METAL_CHECK_BUF } #endif