From 652c849643a81d0fee3f178b90f093f71d1f49f5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 18 Jul 2023 18:51:02 +0300
Subject: [PATCH] ggml : add is_ram_shared to ggml_backend

Metal can share the RAM memory and can utilize mmap without a temp buffer
---
 ggml-backend.c |  5 ++--
 ggml-backend.h | 14 ++++++++++-
 ggml-cuda.cu   |  5 ++--
 llama.cpp      | 65 +++++++++++++++++++++++++++++++++++++-------------
 4 files changed, 68 insertions(+), 21 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 85a6cac05..bd97a5b49 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -255,8 +255,9 @@ struct ggml_backend ggml_backend_cpu_init(void) {
     ctx->work_size = 0;
 
     struct ggml_backend cpu_backend = {
-        /* .interface = */ &cpu_backend_interface,
-        /* .context   = */ ctx
+        /* .interface     = */ &cpu_backend_interface,
+        /* .context       = */ ctx,
+        /* .is_ram_shared = */ true,
     };
     return cpu_backend;
 }
diff --git a/ggml-backend.h b/ggml-backend.h
index 44b9f785f..635555719 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -61,7 +61,10 @@ extern "C" {
 
     struct ggml_backend {
         struct ggml_backend_interface * interface;
+
         ggml_backend_context_t context;
+
+        bool is_ram_shared;
     };
 
     // backend helper functions
@@ -78,7 +81,16 @@ extern "C" {
     static inline void ggml_backend_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { backend->interface->graph_compute(backend->context, cgraph); }
 
     // buffer and tensor allocation
-    GGML_API struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size_t size, size_t max_tensors); // GG: probably return ptr
+    // TODO:
+    //   - return "struct ggml_buffer *"
+    //   - fix namings:
+    //     - ggml_backend_alloc_buffer -> ggml_backend_buffer_alloc
+    //     - ggml_backend_free_buffer -> ggml_backend_buffer_free
+    //     - ggml_backend_reset_buffer -> ggml_backend_buffer_reset
+    //     - ggml_backend_alloc_tensor -> ggml_backend_tensor_alloc
+    //     - ggml_backend_tensor_cpy -> ggml_backend_tensor_copy
+    //
+    GGML_API struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size_t size, size_t max_tensors);
     GGML_API void ggml_backend_free_buffer(struct ggml_buffer * buffer);
     static inline void ggml_backend_reset_buffer(struct ggml_buffer * buffer) { buffer->backend->interface->reset_buffer(buffer->backend->context, buffer->backend_buffer); }
     static inline void ggml_backend_alloc_tensor(struct ggml_buffer * buffer, struct ggml_tensor * tensor) { buffer->backend->interface->alloc_tensor(buffer->backend->context, buffer->backend_buffer, tensor); }
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 343eda0b2..a2d7c545b 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1834,8 +1834,9 @@ ggml_backend ggml_backend_cuda_init(void) {
     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context;
 
     ggml_backend cuda_backend = {
-        /* .interface = */ &cuda_backend_interface,
-        /* .context   = */ ctx
+        /* .interface     = */ &cuda_backend_interface,
+        /* .context       = */ ctx,
+        /* .is_ram_shared = */ false,
     };
     return cuda_backend;
 }
diff --git a/llama.cpp b/llama.cpp
index e4a566df0..c234cdf3f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -225,6 +225,7 @@ struct llama_model {
     llama_vocab vocab;
 
     // backends
+    // TODO: change to pointers
     ggml_backend backend_cpu;
     ggml_buffer buf_cpu;
     ggml_context * ctx_cpu = NULL;
@@ -298,6 +299,7 @@ struct llama_context {
 
     // memory buffers used to evaluate the model
     ggml_buffer buf_compute_cpu = {};
+
 #ifdef GGML_USE_CUDA
     ggml_buffer buf_compute_cuda = {};
 #endif
@@ -612,7 +614,7 @@ struct llama_model_loader {
         }
     }
 
-    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
+    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
         size_t lock_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
@@ -634,11 +636,11 @@ struct llama_model_loader {
             }
 
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
 
-            bool is_cpu = lt.ggml_tensor->backend == &model->backend_cpu;
+            const bool is_ram_shared = lt.ggml_tensor->backend->is_ram_shared;
             // select buffer to load data into
             if (!use_mmap) {
-                if (is_cpu) {
+                if (is_ram_shared) {
                     lt.data = (uint8_t *) lt.ggml_tensor->data;
                 } else {
                     // read to temporary buffer
@@ -649,7 +651,7 @@ struct llama_model_loader {
 
             load_data_for(lt);
 
-            if (is_cpu) {
+            if (is_ram_shared) {
                 if (use_mmap) {
                     lt.ggml_tensor->data = lt.data;
                     // TODO: this assumes that the data to lock is contiguous, which may not always be the case
@@ -671,7 +673,7 @@ struct llama_model_loader {
         }
     }
 
-    void load_data_for(llama_load_tensor & lt) {
+    void load_data_for(llama_load_tensor & lt) const {
         if (use_mmap) {
             lt.data = (uint8_t *) mapping->addr + lt.file_off;
         } else {
@@ -957,6 +959,7 @@ static void llama_model_load_internal(
 
     ggml_backend * backend_cpu = &model.backend_cpu;
     ggml_backend * backend_gpu = &model.backend_cpu; // hack until we have a proper backend selection
+
 #ifdef GGML_USE_CUDA
     if (n_gpu_layers > 0) {
         model.backend_cuda = ggml_backend_cuda_init();
@@ -965,13 +968,14 @@ static void llama_model_load_internal(
 #endif
 
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
-        model.backend_metal = ggml_backend_cpu_init();
+        model.backend_metal = ggml_backend_metal_init();
         backend_gpu = &model.backend_metal;
     }
 #endif
 
     // assign splits to the backends
     const int i_gpu_start = std::max(0, (int)n_layer - n_gpu_layers);
+
     model.backend_inp = n_gpu_layers > (int)n_layer ? backend_gpu : backend_cpu;
     model.backend_out = n_gpu_layers > 0 ? backend_gpu : backend_cpu;
@@ -1011,7 +1015,7 @@ static void llama_model_load_internal(
     fprintf(stderr, "%s: ggml ctx sizes:\n", __func__);
     for (const auto & it : ctx_sizes) {
         fprintf(stderr, "%8s = %7.2f MB", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0);
-        if (it.first == backend_cpu && ml->use_mmap) {
+        if (it.first->is_ram_shared && ml->use_mmap) {
             fprintf(stderr, " + %7.2f MB (mmap)", mmap_size / 1024.0 / 1024.0);
         }
         fprintf(stderr, "\n");
@@ -1135,12 +1139,10 @@ static void llama_model_load_internal(
             ctx_sum += it.second;
         }
 
-        const size_t mem_required =
-            ctx_sum + MEM_REQ_EVAL().at(model.type);
+        const size_t mem_required = ctx_sum + MEM_REQ_EVAL().at(model.type);
 
         // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
+        const size_t mem_required_state = scale*MEM_REQ_KV_SELF().at(model.type);
 
         fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -1162,6 +1164,7 @@ static void llama_model_load_internal(
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
     model.t_load_us = ggml_time_us() - model.t_start_us;
+
 }
 
 static bool llama_model_load(
@@ -1226,6 +1229,7 @@ static ggml_graph_splits llama_build_graph(
 
     // initialize contexts for every backend
    struct ggml_context * ctx_cpu = nullptr;
+
     if (lctx.buf_compute_cpu.mem_size > 0) {
         struct ggml_init_params params = ggml_init_params_default();
         params.buffer = &lctx.buf_compute_cpu;
@@ -1235,6 +1239,7 @@ static ggml_graph_splits llama_build_graph(
 
 #ifdef GGML_USE_CUDA
     struct ggml_context * ctx_cuda = nullptr;
+
     if (lctx.buf_compute_cuda.mem_size > 0) {
         struct ggml_init_params params = ggml_init_params_default();
         params.buffer = &lctx.buf_compute_cuda;
@@ -1243,30 +1248,54 @@ static ggml_graph_splits llama_build_graph(
     }
 #endif
 
+#ifdef GGML_USE_METAL
+    struct ggml_context * ctx_metal = nullptr;
+
+    if (lctx.buf_compute_metal.mem_size > 0) {
+        struct ggml_init_params params = ggml_init_params_default();
+        params.buffer = &lctx.buf_compute_metal;
+        params.compute_type = compute_type;
+        ctx_metal = ggml_init(params);
+    }
+#endif
+
     // TODO: clean this
     struct ggml_context * ctx_i = nullptr;
-    struct ggml_context * ctx_ls[80] = {nullptr};
     struct ggml_context * ctx_o = nullptr;
     struct ggml_context * ctx_kv = nullptr;
+    struct ggml_context * ctx_ls[80] = {nullptr};
 
     if (lctx.model.backend_inp == &lctx.model.backend_cpu) ctx_i = ctx_cpu;
     if (lctx.model.backend_out == &lctx.model.backend_cpu) ctx_o = ctx_cpu;
+
 #ifdef GGML_USE_CUDA
     if (lctx.model.backend_inp == &lctx.model.backend_cuda) ctx_i = ctx_cuda;
     if (lctx.model.backend_out == &lctx.model.backend_cuda) ctx_o = ctx_cuda;
 #endif
+#ifdef GGML_USE_METAL
+    if (lctx.model.backend_inp == &lctx.model.backend_metal) ctx_i = ctx_metal;
+    if (lctx.model.backend_out == &lctx.model.backend_metal) ctx_o = ctx_metal;
+#endif
 
     for (int il = 0; il < n_layer; il++) {
-        if (lctx.model.backend_layers[il] == &lctx.model.backend_cpu) ctx_ls[il] = ctx_cpu;
+        if (lctx.model.backend_layers[il] == &lctx.model.backend_cpu) ctx_ls[il] = ctx_cpu;
+
 #ifdef GGML_USE_CUDA
         if (lctx.model.backend_layers[il] == &lctx.model.backend_cuda) ctx_ls[il] = ctx_cuda;
+#endif
+#ifdef GGML_USE_METAL
+        if (lctx.model.backend_layers[il] == &lctx.model.backend_metal) ctx_ls[il] = ctx_metal;
 #endif
     }
 
-    if (lctx.backend_kv == &lctx.model.backend_cpu) ctx_kv = ctx_cpu;
+    if (lctx.backend_kv == &lctx.model.backend_cpu) ctx_kv = ctx_cpu;
+
 #ifdef GGML_USE_CUDA
     if (lctx.backend_kv == &lctx.model.backend_cuda) ctx_kv = ctx_cuda;
 #endif
+#ifdef GGML_USE_METAL
+    if (lctx.backend_kv == &lctx.model.backend_metal) ctx_kv = ctx_metal;
+#endif
 
     struct ggml_tensor * inpL;
 
@@ -1522,7 +1551,7 @@ static ggml_graph_splits llama_build_graph(
     //}
 
 #ifdef LLAMA_1L_GRAPH_DUMP
-    if (N==1 && n_past == 0) {
+    if (N == 1 && n_past == 0) {
         ggml_graph_dump_dot(gf, NULL, "llama.dot");
         printf("graph for N=%i, n_past=%i dumped to llama.dot\n", N, n_past);
         exit(0);
@@ -1547,6 +1576,11 @@ static ggml_graph_splits llama_build_graph(
         ggml_free(ctx_cuda);
     }
 #endif
+#ifdef GGML_USE_METAL
+    if (ctx_metal != nullptr) {
+        ggml_free(ctx_metal);
+    }
+#endif
 
     return splits;
 }
@@ -2651,7 +2685,6 @@ struct llama_context * llama_new_context_with_model(
 
     ctx->rng = std::mt19937(params.seed);
     ctx->logits_all = params.logits_all;
 
-    // TODO: choose backend depending on n_layers/low_vram
 #ifdef GGML_USE_CUDA
     if ((uint32_t)params.n_gpu_layers >= model->hparams.n_layer/2) {
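
Reviewer note (not part of the patch): the central change above is that the loader no longer asks "is this tensor on the CPU backend?" but "does this tensor's backend share RAM with the host?", which is what lets a Metal backend point tensors straight at the mmap'ed model file instead of staging through a temporary buffer. The following is a minimal standalone sketch of that decision, assuming simplified stand-in types; backend_t, tensor_t and read_tensor_from_file are illustrative names, not the actual llama.cpp / ggml API.

// Illustrative sketch only: the loading branch generalized by backend->is_ram_shared.
#include <cstddef>
#include <cstdint>
#include <vector>

struct backend_t {
    bool is_ram_shared; // true: backend operates directly on host RAM (CPU, Metal); false: discrete device (CUDA)
};

struct tensor_t {
    backend_t * backend;
    uint8_t   * data;     // destination for the tensor weights
    size_t      size;     // number of bytes
    size_t      file_off; // offset of the weights in the model file
};

// Placeholder for reading t.size bytes at t.file_off from the model file into dst.
static void read_tensor_from_file(const tensor_t & t, uint8_t * dst) {
    (void) t; (void) dst; // real code would pread()/fread() here
}

// Decide where the bytes of one tensor come from.
static void load_tensor(tensor_t & t, bool use_mmap, uint8_t * mmap_addr) {
    if (t.backend->is_ram_shared) {
        if (use_mmap) {
            // RAM-shared backend: point directly into the mapping,
            // no copy and no temporary buffer.
            t.data = mmap_addr + t.file_off;
        } else {
            // Read straight into the tensor's own host buffer.
            read_tensor_from_file(t, t.data);
        }
    } else {
        // Non-shared backend: stage through a host buffer first,
        // then hand it to the backend to upload to device memory.
        std::vector<uint8_t> staging(t.size);
        read_tensor_from_file(t, staging.data());
        // ... backend-specific upload of staging.data() would go here ...
    }
}

This mirrors the is_ram_shared branch added to load_all_data above, with the CPU backend initializing the flag to true and the CUDA backend to false.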