diff --git a/llama.cpp b/llama.cpp
index 6815c2196..a8c8014a0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -57,13 +57,13 @@
 #endif
 
 #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
-# include "ggml-alloc.h"
-# define LLAMA_USE_ALLOCATOR
+#include "ggml-alloc.h"
+#define LLAMA_USE_ALLOCATOR
 #else
-# define LLAMA_USE_SCRATCH
+#define LLAMA_USE_SCRATCH
+#define LLAMA_MAX_SCRATCH_BUFFERS 16
 #endif
-#define LLAMA_MAX_SCRATCH_BUFFERS 16
 
 // available llama models
 enum e_model {
@@ -333,13 +333,22 @@ struct llama_model {
 struct llama_context {
     llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-#ifdef GGML_USE_METAL
     ~llama_context() {
+        if (model_owner) {
+            delete &model;
+        }
+#ifdef GGML_USE_METAL
         if (ctx_metal) {
             ggml_metal_free(ctx_metal);
         }
-    }
 #endif
+#ifdef LLAMA_USE_ALLOCATOR
+        if (alloc) {
+            ggml_allocator_free(alloc);
+        }
+#endif
+    }
+
     std::mt19937 rng;
 
     bool has_evaluated_once = false;
@@ -1397,7 +1406,6 @@ static struct ggml_cgraph * llama_build_graph(
     const int64_t n_head = hparams.n_head;
     const int64_t n_head_kv = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
-    //const int64_t n_vocab = hparams.n_vocab;
     const int64_t n_embd_gqa = hparams.n_embd_gqa();
 
     LLAMA_ASSERT(n_embd_head == hparams.n_rot);
@@ -1408,6 +1416,7 @@ static struct ggml_cgraph * llama_build_graph(
     const int n_gpu_layers = model.n_gpu_layers;
 
+    auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute = lctx.buf_compute;
 
@@ -1730,9 +1739,22 @@ static struct ggml_cgraph * llama_build_graph(
     ggml_build_forward_expand(gf, cur);
 
-    // outputs: cur, embeddings
+    if (mem_per_token == 0) {
+        mem_per_token = ggml_used_mem(ctx0)/N;
+    }
+
+#if 0
+    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
+            ggml_used_mem(ctx0)/1024.0/1024.0,
+            lctx.get_buf_max_mem(0)/1024.0/1024.0,
+            lctx.get_buf_max_mem(1)/1024.0/1024.0,
+            lctx.work_buffer.size()/1024.0/1024.0,
+            n_past, N);
+#endif
+
     ggml_free(ctx0);
 
+    // outputs: cur, embeddings
     return gf;
 
 #ifdef LLAMA_USE_ALLOCATOR
@@ -1779,15 +1801,7 @@ static bool llama_eval_internal(
     LLAMA_ASSERT(!!kv_self.ctx);
 
     const int64_t n_embd = hparams.n_embd;
-    const int64_t n_layer = hparams.n_layer;
-    //const int64_t n_ctx = hparams.n_ctx;
-    //const int64_t n_head = hparams.n_head;
-    //const int64_t n_head_kv = hparams.n_head_kv;
-    //const int64_t n_embd_head = hparams.n_embd_head();
     const int64_t n_vocab = hparams.n_vocab;
-    //const int64_t n_embd_gqa = hparams.n_embd_gqa();
-
-    //auto & mem_per_token = lctx.mem_per_token;
 
 #ifdef LLAMA_USE_ALLOCATOR
     ggml_allocator_reset(lctx.alloc);
@@ -1796,8 +1810,7 @@
     ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
 
 #ifdef LLAMA_USE_ALLOCATOR
-    size_t sz = ggml_allocator_alloc_graph_tensors(lctx.alloc, gf);
-    //fprintf(stderr, "%s: compute buffer size: %.3f MB\n", __func__, sz / 1024.0 / 1024.0);
+    ggml_allocator_alloc_graph_tensors(lctx.alloc, gf);
 #endif
 
     // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
@@ -1807,6 +1820,7 @@
     n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
 #if GGML_USE_MPI
+    const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif
 
@@ -1892,19 +1906,6 @@ static bool llama_eval_internal(
         memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
     }
 
-    //if (mem_per_token == 0) {
-    //    mem_per_token = ggml_used_mem(ctx0)/N;
-    //}
-
-#if 0
-    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
-            ggml_used_mem(ctx0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(1)/1024.0/1024.0,
-            lctx.work_buffer.size()/1024.0/1024.0,
-            n_past, N);
-#endif
-
     // measure the performance only for the single-token evals
     if (N == 1) {
         lctx.t_eval_us += ggml_time_us() - t_start_us;
@@ -3272,7 +3273,7 @@ struct llama_context * llama_new_context_with_model(
 
 #ifdef LLAMA_USE_ALLOCATOR
     static const size_t tensor_alignment = 32;
-    ctx->buf_compute.resize(ggml_tensor_overhead() * 3072 + ggml_graph_overhead());
+    ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
 
     // measure memory requirements for worst-case graph
     ctx->alloc = ggml_allocator_new_measure(tensor_alignment);
@@ -3372,9 +3373,6 @@ struct llama_context * llama_init_from_file(
 }
 
 void llama_free(struct llama_context * ctx) {
-    if (ctx->model_owner) {
-        delete &ctx->model;
-    }
     delete ctx;
 }
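
Note for reviewers (not part of the patch): the allocator lifecycle that these hunks wire together, condensed into one place. This is only a sketch assembled from calls that already appear in the diff above (ggml_allocator_new_measure, ggml_allocator_reset, ggml_allocator_alloc_graph_tensors, ggml_allocator_free); the member and parameter names are the ones used in llama.cpp, and the snippets are excerpts rather than a self-contained program.

    // llama_new_context_with_model(): size the compute buffer for the graph/tensor
    // structs and create a measuring allocator for the worst-case graph.
    ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
    ctx->alloc = ggml_allocator_new_measure(tensor_alignment);

    // llama_eval_internal(): reset the allocator for each batch, rebuild the graph,
    // then place every graph tensor inside the allocator's buffer.
    ggml_allocator_reset(lctx.alloc);
    ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
    ggml_allocator_alloc_graph_tensors(lctx.alloc, gf);

    // ~llama_context(): the allocator (and the model, when model_owner is set) is now
    // released in the destructor instead of llama_free().
    if (alloc) {
        ggml_allocator_free(alloc);
    }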