From a1ab35c682cb5835ae110b9f01232b766a1136ce Mon Sep 17 00:00:00 2001
From: slaren
Date: Fri, 5 Jan 2024 03:14:06 +0100
Subject: [PATCH] fix unmap after loading

---
 llama.cpp | 84 ++++++++++++++++++++++++-------------------------------
 1 file changed, 37 insertions(+), 47 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 12ec49ebc..f7c581ca0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2401,7 +2401,7 @@ struct llama_model_loader {
         return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
     }
 
-    void init_mapping(bool prefetch = true) {
+    void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
         /*
         // prefetch only CPU tensors
         if (use_mmap) {
@@ -2421,6 +2421,18 @@ struct llama_model_loader {
         if (use_mmap) {
             mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
         }
+
+        for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
+            struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+            size_data += ggml_nbytes(cur);
+        }
+
+        if (use_mmap && mapping) {
+            if (lmlock) {
+                lmlock->init(mapping->addr);
+            }
+            mmap_used_first = mapping->size;
+        }
     }
 
     // for backwards compatibility, does not support ggml-backend
@@ -2439,29 +2451,15 @@ struct llama_model_loader {
     size_t size_done = 0;
     size_t size_data = 0;
 
-    size_t mmap_first = -1;
-    size_t mmap_last  = 0;
+    size_t mmap_used_first = -1;
+    size_t mmap_used_last  = 0;
 
     // Returns false if cancelled by progress_callback
    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) {
-        // TODO: move to a better place
-        if (size_data == 0) {
-            for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
-                struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
-                size_data += ggml_nbytes(cur);
-            }
-
-            if (use_mmap && buf_mmap) {
-                // FIXME
-                //if (lmlock) {
-                //    lmlock->init(mapping->addr);
-                //}
-            }
-        }
+        GGML_ASSERT(size_data != 0 && "call init_mapping() first");
 
         std::vector<no_init<uint8_t>> read_buf;
-
         for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
             struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
             if (!cur) {
@@ -2477,15 +2475,14 @@ struct llama_model_loader {
 
             const size_t offs = file_offset(ggml_get_name(cur));
 
-            // FIXME
             if (use_mmap && mapping) {
                 if (buf_mmap && cur->data == nullptr) {
                     ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
                     if (lmlock) {
                         lmlock->grow_to(offs + ggml_nbytes(cur));
                     }
-                    mmap_first = std::min(mmap_first, offs);
-                    mmap_last  = std::max(mmap_last,  offs + ggml_nbytes(cur));
+                    mmap_used_first = std::min(mmap_used_first, offs);
+                    mmap_used_last  = std::max(mmap_used_last,  offs + ggml_nbytes(cur));
                 } else {
                     ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
                 }
@@ -2504,20 +2501,23 @@ struct llama_model_loader {
             size_done += ggml_nbytes(cur);
         }
 
-        if (progress_callback && size_done >= size_data) {
-            // Even though the model is done loading, we still honor
-            // cancellation since we need to free allocations.
-            return progress_callback(1.0f, progress_callback_user_data);
+        // check if this is the last call and do final cleanup
+        if (size_done >= size_data) {
+            // unmap offloaded tensors and metadata
+            if (use_mmap && mapping) {
+                mapping->unmap_fragment(0, mmap_used_first);
+                if (mmap_used_last != 0) {
+                    mapping->unmap_fragment(mmap_used_last, mapping->size);
+                }
+            }
+            if (progress_callback) {
+                // Even though the model is done loading, we still honor
+                // cancellation since we need to free allocations.
+                return progress_callback(1.0f, progress_callback_user_data);
+            }
         }
-        return true;
-    }
 
-    void unmap_fragments() {
-        // unmap offloaded tensors and metadata
-        if (use_mmap && mapping) {
-            mapping->unmap_fragment(0, mmap_first);
-            mapping->unmap_fragment(mmap_last, mapping->size);
-        }
+        return true;
     }
 };
 
@@ -3700,16 +3700,7 @@ static bool llm_load_tensors(
 
     ml.done_getting_tensors();
 
-    ml.init_mapping();
-
-    // TODO: move to ggml
-    //auto ggml_n_tensors = [](struct ggml_context * ctx) {
-    //    int n = 0;
-    //    for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
-    //        ++n;
-    //    }
-    //    return n;
-    //};
+    ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);
 
     // create backend buffers
 
@@ -3720,9 +3711,9 @@ static bool llm_load_tensors(
         ggml_context * ctx = it.second;
         ggml_backend_buffer_t buf = nullptr;
 
-        // TODO: do not use whole model mapping for the buffer, only the region containing the tensors
-        // this is important for metal: if the entire model could be mapped, then we could use metal for all layers
-        if (ml.use_mmap && buft == ggml_backend_cpu_buffer_type()) {
+        // TODO: do not use the whole model mapping for the buffer, only the region containing the tensors
+        // this is important for metal: if the entire model could be mapped to a metal buffer, then we could use metal for all layers
+        if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
             buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size);
         }
 #ifdef GGML_USE_METAL
@@ -3780,7 +3771,6 @@ static bool llm_load_tensors(
             return false;
         }
     }
-    ml.unmap_fragments();
 
     model.mapping = std::move(ml.mapping);
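
Note: with this change, load_all_data() performs the final cleanup itself. Once size_done reaches size_data it calls mapping->unmap_fragment(0, mmap_used_first) and, when any tensor was placed in the mapping, mapping->unmap_fragment(mmap_used_last, mapping->size), so only the region actually backing mmap'd tensors stays resident. The snippet below is a minimal sketch of that fragment-trimming idea for a POSIX file mapping; the helper name is illustrative and this is not the llama_mmap::unmap_fragment implementation.

#include <sys/mman.h>   // munmap
#include <unistd.h>     // sysconf
#include <cstddef>
#include <cstdint>

// Release the byte range [first, last) of a mapping that starts at `base`.
// munmap() needs page-aligned addresses, so the range is shrunk inward to page
// boundaries (assuming the page size is a power of two); partially used pages
// at either end stay mapped.
void unmap_fragment_sketch(void * base, size_t first, size_t last) {
    const size_t page = (size_t) sysconf(_SC_PAGESIZE);
    const size_t first_aligned = (first + page - 1) & ~(page - 1); // round up
    const size_t last_aligned  = last & ~(page - 1);               // round down
    if (last_aligned > first_aligned) {
        munmap((uint8_t *) base + first_aligned, last_aligned - first_aligned);
    }
}

// Usage mirroring the patch, once all tensors have been placed:
//   unmap_fragment_sketch(mapping_addr, 0, mmap_used_first);           // everything before the first mmap-backed tensor
//   unmap_fragment_sketch(mapping_addr, mmap_used_last, mapping_size); // everything after the last one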