From cd4a8cd28c05458a8c8ee37a7161a034a297d28d Mon Sep 17 00:00:00 2001
From: slaren
Date: Fri, 28 Jul 2023 00:36:48 +0200
Subject: [PATCH] llama.cpp : better memory usage prints with allocator

---
 llama.cpp | 52 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 02582c483..6c57a2be9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1252,12 +1252,16 @@ static void llama_model_load_internal(
         const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
 
         // this is the total memory required to run the inference
-        const size_t mem_required =
+        size_t mem_required =
             ctx_size +
-            mmapped_size - vram_weights + // weights in VRAM not in memory
+            mmapped_size - vram_weights; // weights in VRAM not in memory
+
+#ifndef LLAMA_USE_ALLOCATOR
+        mem_required +=
             MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
             MEM_REQ_EVAL().at(model.type);
+#endif
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -3272,30 +3276,38 @@ struct llama_context * llama_new_context_with_model(
         }
 
 #ifdef LLAMA_USE_ALLOCATOR
-        static const size_t tensor_alignment = 32;
-        ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+        {
+            static const size_t tensor_alignment = 32;
+            // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
+            ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
 
-        // measure memory requirements for worst-case graph
-        ctx->alloc = ggml_allocator_new_measure(tensor_alignment);
+            // create measure allocator
+            ctx->alloc = ggml_allocator_new_measure(tensor_alignment);
 
-        // build worst-case graph
-        int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
-        int n_past = hparams.n_ctx - n_tokens;
-        llama_token token = llama_token_bos();
-        ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
+            // build worst-case graph
+            int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
+            int n_past = hparams.n_ctx - n_tokens;
+            llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+            ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
 
-        size_t size = ggml_allocator_alloc_graph_tensors(ctx->alloc, gf) + tensor_alignment;
-        fprintf(stderr, "%s: worst-case graph size = %7.2f MB\n", __func__, size / 1024.0 / 1024.0);
-        fprintf(stderr, "%s: compute buffer total size: %7.2f MB\n", __func__, (ctx->buf_compute.size + size) / 1024.0 / 1024.0);
+            // measure memory requirements for the graph
+            size_t alloc_size = ggml_allocator_alloc_graph_tensors(ctx->alloc, gf) + tensor_alignment;
 
-        size_t prev_req = MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) + MEM_REQ_SCRATCH1().at(ctx->model.type) + MEM_REQ_EVAL().at(ctx->model.type);
-        fprintf(stderr, "%s: equivalent with scratch buffer: %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
+            fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+            // debug - for comparison with scratch buffer
+            //size_t prev_req =
+            //    MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
+            //    MEM_REQ_SCRATCH1().at(ctx->model.type) +
+            //    MEM_REQ_EVAL().at(ctx->model.type);
+            //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
 
-        // recreate allocator with exact memory requirements
-        ggml_allocator_free(ctx->alloc);
-        ctx->buf_alloc.resize(size);
-        ctx->alloc = ggml_allocator_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+            // recreate allocator with exact memory requirements
+            ggml_allocator_free(ctx->alloc);
+
+            ctx->buf_alloc.resize(alloc_size);
+            ctx->alloc = ggml_allocator_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+        }
 
 #else
         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
 #endif
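
Note: the change above follows a two-pass "measure, then allocate" pattern: a measuring allocator first walks the worst-case graph to compute the total tensor memory, and the real allocator is then recreated over a buffer of exactly that size, which is also what the new "compute buffer total size" print reports. The standalone sketch below only illustrates that pattern; the fake_graph type, the align_up helper and the tensor sizes are invented for illustration and are not part of the ggml allocator API (ggml_allocator_new_measure, ggml_allocator_alloc_graph_tensors, ggml_allocator_new, ggml_allocator_free) used in the patch.

    // two-pass allocation sketch (illustrative only, not the ggml API)
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // hypothetical stand-in for a compute graph: just a list of tensor sizes
    struct fake_graph {
        std::vector<size_t> tensor_sizes;
    };

    static size_t align_up(size_t n, size_t alignment) {
        return (n + alignment - 1) & ~(alignment - 1);
    }

    // measure pass: add up aligned tensor sizes without touching real memory,
    // analogous to what the measure allocator does for the worst-case graph
    static size_t measure_graph(const fake_graph & gf, size_t alignment) {
        size_t total = 0;
        for (size_t sz : gf.tensor_sizes) {
            total += align_up(sz, alignment);
        }
        return total;
    }

    int main() {
        const size_t tensor_alignment = 32;

        fake_graph gf;
        gf.tensor_sizes = { 4096, 100, 65536 }; // made-up worst-case tensors

        // pass 1: measure memory requirements for the worst-case graph
        const size_t alloc_size = measure_graph(gf, tensor_alignment) + tensor_alignment;
        fprintf(stderr, "compute buffer total size = %7.2f MB\n", alloc_size / 1024.0 / 1024.0);

        // pass 2: allocate a buffer with exactly the measured size and
        // sub-allocate the tensor data from it (here just a byte vector)
        std::vector<unsigned char> buf_alloc(alloc_size);
        size_t offset = 0;
        for (size_t sz : gf.tensor_sizes) {
            unsigned char * tensor_data = buf_alloc.data() + offset; // tensor data lives here
            (void) tensor_data;
            offset += align_up(sz, tensor_alignment);
        }
        return 0;
    }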