llama.cpp : free allocator when deleting context, cleanup

parent 64584d56a7
commit af7bd42b2a

1 changed file with 33 additions and 35 deletions:

llama.cpp
@@ -57,13 +57,13 @@
 #endif
 
 #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
-# include "ggml-alloc.h"
-# define LLAMA_USE_ALLOCATOR
+#include "ggml-alloc.h"
+#define LLAMA_USE_ALLOCATOR
 #else
-# define LLAMA_USE_SCRATCH
+#define LLAMA_USE_SCRATCH
+#define LLAMA_MAX_SCRATCH_BUFFERS 16
 #endif
 
-#define LLAMA_MAX_SCRATCH_BUFFERS 16
 
 // available llama models
 enum e_model {
@@ -333,13 +333,22 @@ struct llama_model {
 
 struct llama_context {
     llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-#ifdef GGML_USE_METAL
     ~llama_context() {
+        if (model_owner) {
+            delete &model;
+        }
+#ifdef GGML_USE_METAL
         if (ctx_metal) {
             ggml_metal_free(ctx_metal);
         }
+#endif
+#ifdef LLAMA_USE_ALLOCATOR
+        if (alloc) {
+            ggml_allocator_free(alloc);
+        }
 #endif
+    }
 
     std::mt19937 rng;
 
     bool has_evaluated_once = false;
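With this hunk the context destructor, rather than llama_free, is responsible for per-context cleanup: the owned model (model_owner / delete &model), the Metal context, and now the ggml allocator via ggml_allocator_free (the matching removal from llama_free is in the last hunk below). As a side note, here is a minimal, self-contained sketch of the owner-flag pattern the destructor uses; the names model_sketch and ctx_sketch are hypothetical stand-ins, not llama.cpp types:

    // Sketch only: the owner-flag cleanup pattern used by ~llama_context above.
    #include <cstdio>

    struct model_sketch { int id; };

    struct ctx_sketch {
        const model_sketch & model;
        bool model_owner = false;            // set when the context allocated the model itself

        explicit ctx_sketch(const model_sketch & m) : model(m) {}

        ~ctx_sketch() {
            if (model_owner) {
                delete &model;               // free the model only if this context owns it
            }
        }
    };

    int main() {
        model_sketch caller_owned{1};
        { ctx_sketch c(caller_owned); }                                 // model_owner == false: nothing freed

        { ctx_sketch c(*new model_sketch{2}); c.model_owner = true; }   // heap model freed in ~ctx_sketch

        std::printf("ok\n");
        return 0;
    }

The flag presumably exists so that a context created together with its model and a context wrapping a caller-provided model can share the same destruction path.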
@@ -1397,7 +1406,6 @@ static struct ggml_cgraph * llama_build_graph(
     const int64_t n_head = hparams.n_head;
     const int64_t n_head_kv = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
-    //const int64_t n_vocab = hparams.n_vocab;
     const int64_t n_embd_gqa = hparams.n_embd_gqa();
 
     LLAMA_ASSERT(n_embd_head == hparams.n_rot);
@@ -1408,6 +1416,7 @@ static struct ggml_cgraph * llama_build_graph(
 
     const int n_gpu_layers = model.n_gpu_layers;
 
+    auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute = lctx.buf_compute;
 
 
@@ -1730,9 +1739,22 @@ static struct ggml_cgraph * llama_build_graph(
 
     ggml_build_forward_expand(gf, cur);
 
-    // outputs: cur, embeddings
+    if (mem_per_token == 0) {
+        mem_per_token = ggml_used_mem(ctx0)/N;
+    }
+
+#if 0
+    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
+            ggml_used_mem(ctx0)/1024.0/1024.0,
+            lctx.get_buf_max_mem(0)/1024.0/1024.0,
+            lctx.get_buf_max_mem(1)/1024.0/1024.0,
+            lctx.work_buffer.size()/1024.0/1024.0,
+            n_past, N);
+#endif
+
     ggml_free(ctx0);
 
+    // outputs: cur, embeddings
     return gf;
 
 #ifdef LLAMA_USE_ALLOCATOR
@@ -1779,15 +1801,7 @@ static bool llama_eval_internal(
     LLAMA_ASSERT(!!kv_self.ctx);
 
     const int64_t n_embd = hparams.n_embd;
-    const int64_t n_layer = hparams.n_layer;
-    //const int64_t n_ctx = hparams.n_ctx;
-    //const int64_t n_head = hparams.n_head;
-    //const int64_t n_head_kv = hparams.n_head_kv;
-    //const int64_t n_embd_head = hparams.n_embd_head();
     const int64_t n_vocab = hparams.n_vocab;
-    //const int64_t n_embd_gqa = hparams.n_embd_gqa();
 
-    //auto & mem_per_token = lctx.mem_per_token;
 
 #ifdef LLAMA_USE_ALLOCATOR
     ggml_allocator_reset(lctx.alloc);
@@ -1796,8 +1810,7 @@ static bool llama_eval_internal(
     ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
 
 #ifdef LLAMA_USE_ALLOCATOR
-    size_t sz = ggml_allocator_alloc_graph_tensors(lctx.alloc, gf);
-    //fprintf(stderr, "%s: compute buffer size: %.3f MB\n", __func__, sz / 1024.0 / 1024.0);
+    ggml_allocator_alloc_graph_tensors(lctx.alloc, gf);
 #endif
 
     // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
@@ -1807,6 +1820,7 @@ static bool llama_eval_internal(
     n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
 #if GGML_USE_MPI
+    const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif
 
@@ -1892,19 +1906,6 @@ static bool llama_eval_internal(
         memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
     }
 
-    //if (mem_per_token == 0) {
-    // mem_per_token = ggml_used_mem(ctx0)/N;
-    //}
-
-#if 0
-    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
-            ggml_used_mem(ctx0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(1)/1024.0/1024.0,
-            lctx.work_buffer.size()/1024.0/1024.0,
-            n_past, N);
-#endif
-
     // measure the performance only for the single-token evals
     if (N == 1) {
         lctx.t_eval_us += ggml_time_us() - t_start_us;
@@ -3272,7 +3273,7 @@ struct llama_context * llama_new_context_with_model(
 
 #ifdef LLAMA_USE_ALLOCATOR
         static const size_t tensor_alignment = 32;
-        ctx->buf_compute.resize(ggml_tensor_overhead() * 3072 + ggml_graph_overhead());
+        ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
 
         // measure memory requirements for worst-case graph
        ctx->alloc = ggml_allocator_new_measure(tensor_alignment);
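The resize above replaces the hard-coded node count (3072) with GGML_MAX_NODES. Under LLAMA_USE_ALLOCATOR, buf_compute only has to cover per-tensor and per-graph overhead; the measure allocator (ggml_allocator_new_measure and ggml_allocator_alloc_graph_tensors in the earlier hunks) determines the actual compute buffer size. A rough, stand-alone estimate of that metadata buffer follows; the three constants are assumed values for illustration, not taken from this commit:

    // Back-of-the-envelope size of buf_compute under LLAMA_USE_ALLOCATOR.
    // The three constants below are assumptions, not ggml's actual values.
    #include <cstddef>
    #include <cstdio>

    int main() {
        const std::size_t tensor_overhead = 336;    // assumed ggml_tensor_overhead(): tensor struct + object header
        const std::size_t graph_overhead  = 16384;  // assumed ggml_graph_overhead()
        const std::size_t max_nodes       = 4096;   // assumed GGML_MAX_NODES

        const std::size_t buf_compute = tensor_overhead*max_nodes + graph_overhead;
        std::printf("compute metadata buffer: %.2f MiB\n", buf_compute/1024.0/1024.0);
        return 0;
    }

With these stand-in numbers the metadata buffer stays in the low megabytes, so tying it to GGML_MAX_NODES removes a magic number at little cost.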
@@ -3372,9 +3373,6 @@ struct llama_context * llama_init_from_file(
 }
 
 void llama_free(struct llama_context * ctx) {
-    if (ctx->model_owner) {
-        delete &ctx->model;
-    }
     delete ctx;
 }
 