adjust buffer size to account for alignment
parent 768ecfcc28
commit 598a9ada8f

1 changed file with 6 additions and 5 deletions

 llama.cpp | 11 ++++++-----
@@ -3271,18 +3271,19 @@ struct llama_context * llama_new_context_with_model(
         }
 
 #ifdef LLAMA_USE_ALLOCATOR
+        static const size_t tensor_alignment = 32;
         ctx->buf_compute.resize(ggml_tensor_overhead() * 3072 + ggml_graph_overhead());
 
         // measure memory requirements for worst-case graph
-        ctx->alloc = ggml_allocator_new_measure(32);
+        ctx->alloc = ggml_allocator_new_measure(tensor_alignment);
 
         // build worst-case graph
         int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
         int n_past = hparams.n_ctx - n_tokens;
-        std::vector<llama_token> tokens(n_tokens, llama_token_bos());
-        ggml_cgraph * gf = llama_build_graph(*ctx, tokens.data(), NULL, n_tokens, n_past);
+        llama_token token = llama_token_bos();
+        ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
 
-        size_t size = ggml_allocator_alloc_graph_tensors(ctx->alloc, gf);
+        size_t size = ggml_allocator_alloc_graph_tensors(ctx->alloc, gf) + tensor_alignment;
         fprintf(stderr, "%s: worst-case graph size = %7.2f MB\n", __func__, size / 1024.0 / 1024.0);
         fprintf(stderr, "%s: compute buffer total size: %7.2f MB\n", __func__, (ctx->buf_compute.size + size) / 1024.0 / 1024.0);
 
@@ -3293,7 +3294,7 @@ struct llama_context * llama_new_context_with_model(
         // recreate allocator with exact memory requirements
         ggml_allocator_free(ctx->alloc);
         ctx->buf_alloc.resize(size);
-        ctx->alloc = ggml_allocator_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, 32);
+        ctx->alloc = ggml_allocator_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
 #else
         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
 #endif
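Why the measured size is padded by one alignment unit: if the allocator rounds the base address of buf_alloc up to tensor_alignment, up to tensor_alignment - 1 of the buffer's bytes are consumed before any tensor is placed, so a buffer sized to exactly the measured requirement can come up short. Below is a minimal standalone sketch of that effect; the constant and the padding idea come from the diff above, everything else (align_up, the sample numbers) is illustrative and not code from this commit.

#include <cstdint>
#include <cstdio>

static const size_t tensor_alignment = 32;

// Round a pointer up to the next multiple of align (a power of two).
static uint8_t * align_up(uint8_t * ptr, size_t align) {
    uintptr_t v = (uintptr_t) ptr;
    return (uint8_t *)((v + align - 1) & ~(uintptr_t)(align - 1));
}

int main() {
    const size_t measured = 1000;       // stand-in for the measured worst-case graph size
    alignas(32) uint8_t storage[2048];
    uint8_t * base    = storage + 1;    // deliberately misaligned base address
    uint8_t * aligned = align_up(base, tensor_alignment);
    size_t    lost    = (size_t)(aligned - base);   // 31 bytes here; up to tensor_alignment - 1 in general

    // An exactly-sized buffer loses `lost` bytes once the base is aligned;
    // padding by one full alignment unit keeps at least `measured` bytes usable.
    printf("bytes lost to alignment: %zu\n", lost);
    printf("usable with exact size : %zu\n", measured - lost);
    printf("usable with padded size: %zu\n", measured + tensor_alignment - lost);
    return 0;
}

Padding by a whole tensor_alignment rather than tensor_alignment - 1 costs at most one extra byte and keeps the arithmetic obvious.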