diff --git a/llama.cpp b/llama.cpp
index 9f12cc0d9..f478aca22 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3271,18 +3271,19 @@ struct llama_context * llama_new_context_with_model(
     }
 
 #ifdef LLAMA_USE_ALLOCATOR
+    static const size_t tensor_alignment = 32;
     ctx->buf_compute.resize(ggml_tensor_overhead() * 3072 + ggml_graph_overhead());
 
     // measure memory requirements for worst-case graph
-    ctx->alloc = ggml_allocator_new_measure(32);
+    ctx->alloc = ggml_allocator_new_measure(tensor_alignment);
 
     // build worst-case graph
     int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
     int n_past = hparams.n_ctx - n_tokens;
-    std::vector<llama_token> tokens(n_tokens, llama_token_bos());
-    ggml_cgraph * gf = llama_build_graph(*ctx, tokens.data(), NULL, n_tokens, n_past);
+    llama_token token = llama_token_bos();
+    ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
 
-    size_t size = ggml_allocator_alloc_graph_tensors(ctx->alloc, gf);
+    size_t size = ggml_allocator_alloc_graph_tensors(ctx->alloc, gf) + tensor_alignment;
     fprintf(stderr, "%s: worst-case graph size = %7.2f MB\n", __func__, size / 1024.0 / 1024.0);
     fprintf(stderr, "%s: compute buffer total size: %7.2f MB\n", __func__, (ctx->buf_compute.size + size) / 1024.0 / 1024.0);
 
@@ -3293,7 +3294,7 @@ struct llama_context * llama_new_context_with_model(
     // recreate allocator with exact memory requirements
     ggml_allocator_free(ctx->alloc);
     ctx->buf_alloc.resize(size);
-    ctx->alloc = ggml_allocator_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, 32);
+    ctx->alloc = ggml_allocator_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
 #else
     ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
 #endif
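
For reference, a minimal sketch of the two-pass pattern the patch ends up with: measure the worst-case graph with a measuring allocator, pad the result by the tensor alignment, then recreate the allocator over a buffer of exactly that size. It reuses the ggml_allocator_* calls exactly as they appear in the diff; the surrounding context (ctx, hparams, params) is assumed to be the same as inside llama_new_context_with_model, and the comments (including the rationale for the single BOS token and the alignment headroom) are explanatory assumptions, not code from the patch.

    // Sketch only -- fragment assuming the llama_context fields used in the diff above.
    static const size_t tensor_alignment = 32;

    // Pass 1: measuring allocator with no backing buffer; it only records how much
    // memory the graph's tensors would need.
    ctx->alloc = ggml_allocator_new_measure(tensor_alignment);

    // Worst case: a full batch at the end of the context window. A single BOS token
    // is assumed to be enough here, since the token data is not read while measuring
    // and the pointer only selects the token-input variant of the graph.
    int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
    int n_past   = hparams.n_ctx - n_tokens;
    llama_token token = llama_token_bos();
    ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);

    // Measured size plus one alignment of headroom, in case the real buffer does not
    // start on an aligned address.
    size_t size = ggml_allocator_alloc_graph_tensors(ctx->alloc, gf) + tensor_alignment;

    // Pass 2: discard the measuring allocator and rebuild it on a buffer of exactly
    // the measured size.
    ggml_allocator_free(ctx->alloc);
    ctx->buf_alloc.resize(size);
    ctx->alloc = ggml_allocator_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);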