llama : separate compute buffer for metal
This commit is contained in:
parent
652c849643
commit
ed960fa1ab
1 changed file with 6 additions and 1 deletion
|
@@ -1616,7 +1616,6 @@ static bool llama_eval_internal(
|
||||||
|
|
||||||
LLAMA_ASSERT(lctx.graph_logits != nullptr);
|
LLAMA_ASSERT(lctx.graph_logits != nullptr);
|
||||||
|
|
||||||
|
|
||||||
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
||||||
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
||||||
n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
|
n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
|
||||||
|
@@ -2719,11 +2718,17 @@ struct llama_context * llama_new_context_with_model(
|
||||||
|
|
||||||
// TODO: size the buffers more accurately - depends on improved memory management
|
// TODO: size the buffers more accurately - depends on improved memory management
|
||||||
ctx->buf_compute_cpu = ggml_backend_alloc_buffer(&model->backend_cpu, MEM_REQ_EVAL().at(ctx->model.type), 2048);
|
ctx->buf_compute_cpu = ggml_backend_alloc_buffer(&model->backend_cpu, MEM_REQ_EVAL().at(ctx->model.type), 2048);
|
||||||
|
|
||||||
#ifdef GGML_USE_CUDA
|
#ifdef GGML_USE_CUDA
|
||||||
if (params.n_gpu_layers > 0) {
|
if (params.n_gpu_layers > 0) {
|
||||||
ctx->buf_compute_cuda = ggml_backend_alloc_buffer(&model->backend_cuda, MEM_REQ_EVAL().at(ctx->model.type), 2048);
|
ctx->buf_compute_cuda = ggml_backend_alloc_buffer(&model->backend_cuda, MEM_REQ_EVAL().at(ctx->model.type), 2048);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
if (params.n_gpu_layers > 0) {
|
||||||
|
ctx->buf_compute_metal = ggml_backend_alloc_buffer(&model->backend_metal, MEM_REQ_EVAL().at(ctx->model.type), 2048);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// initialize the graph input/output buffers
|
// initialize the graph input/output buffers
|
||||||
// input buffer
|
// input buffer
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue