use a host buffer for the cpu compute buffer for faster copies to the gpu

commit 53ae0dd862
parent 458674c022

5 changed files with 35 additions and 11 deletions
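Background for the change: a "host buffer" here is CPU memory allocated through the GPU backend (pinned/page-locked memory on CUDA), which lets the driver DMA directly between host and device instead of staging through pageable memory. A minimal illustrative sketch of what this buys, assuming a CUDA build where ggml-cuda.h exposes ggml_backend_cuda_host_buffer_type():

    #include "ggml-backend.h"
    #include "ggml-cuda.h" // assumed CUDA build

    // sketch only: allocate CPU-side data from the CUDA host (pinned) buffer type,
    // so that later host-to-device copies can use DMA rather than pageable staging
    ggml_backend_buffer_t alloc_pinned_staging(size_t size) {
        ggml_backend_buffer_type_t host_buft = ggml_backend_cuda_host_buffer_type();
        return ggml_backend_buft_alloc_buffer(host_buft, size);
    }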
ggml-alloc.c (17 changed lines)

@@ -265,9 +265,9 @@ ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
     return alloc;
 }
 
-ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
+ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
     // create a backend buffer to get the correct tensor allocation sizes
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 1);
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
 
     // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
     ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
@@ -277,13 +277,22 @@ ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
     return alloc;
 }
 
-ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size);
+ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
+    return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
+}
+
+ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
+    // create a backend buffer to get the correct tensor allocation sizes
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
     alloc->buffer_owned = true;
     return alloc;
 }
 
+ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
+    return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
+}
+
 ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
     ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
 
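The new *_from_buft constructors decouple allocator creation from a backend instance: the measure pass and the real allocation can now run against any buffer type, while the old *_from_backend entry points become thin wrappers over the backend's default buffer type. A hedged usage sketch (buft is assumed to be any ggml_backend_buffer_type_t obtained elsewhere):

    // measure pass: size the graph's tensors against a specific buffer type
    ggml_tallocr_t measure = ggml_tallocr_new_measure_from_buft(buft);
    // ... allocate the graph's tensors through `measure` ...
    size_t needed = ggml_tallocr_max_size(measure);
    ggml_tallocr_free(measure);

    // real pass: an owned buffer of the measured size, from the same buffer type
    ggml_tallocr_t alloc = ggml_tallocr_new_from_buft(buft, needed);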
ggml-alloc.h (4 changed lines)

@@ -52,8 +52,10 @@ typedef struct ggml_tallocr * ggml_tallocr_t;
 
 GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
 GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
 GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
 GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
 
 GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
ggml-backend.c (8 changed lines)

@@ -776,6 +776,7 @@ struct ggml_backend_sched {
 
     int n_backends;
     ggml_backend_t backends[GGML_MAX_BACKENDS];
+    ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
     ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
 
     ggml_gallocr_t galloc;
@@ -1334,7 +1335,7 @@ static void sched_reset(ggml_backend_sched_t sched) {
     sched->is_reset = true;
 }
 
-ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends, size_t graph_size) {
+ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
     GGML_ASSERT(n_backends > 0);
     GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
 
@@ -1348,13 +1349,14 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends, size_t graph_size) {
     sched->n_backends = n_backends;
     for (int i = 0; i < n_backends; i++) {
         sched->backends[i] = backends[i];
+        sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
     }
 
     sched->galloc = ggml_gallocr_new();
 
     // init measure allocs for each backend
     for (int i = 0; i < n_backends; i++) {
-        sched->tallocs[i] = ggml_tallocr_new_measure_from_backend(backends[i]);
+        sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
     }
 
     sched_reset(sched);
@@ -1387,7 +1389,7 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
     for (int i = 0; i < sched->n_backends; i++) {
         size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
         ggml_tallocr_free(sched->tallocs[i]);
-        sched->tallocs[i] = ggml_tallocr_new_from_backend(sched->backends[i], size);
+        sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
    }
 
     sched_reset(sched);
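Note the fallback in ggml_backend_sched_new: passing NULL for bufts preserves the old behavior, since each slot then defaults to ggml_backend_get_default_buffer_type(backends[i]). A sketch of both call styles (gpu_backend and cpu_backend are hypothetical, already-initialized handles; the pinned buffer type assumes a CUDA build):

    ggml_backend_t backends[2] = { gpu_backend, cpu_backend };
    ggml_backend_buffer_type_t bufts[2] = {
        ggml_backend_get_default_buffer_type(gpu_backend), // GPU keeps its default
        ggml_backend_cuda_host_buffer_type(),              // pinned CPU compute buffer
    };
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, bufts, 2, /*graph_size=*/2048);

    // or pass NULL to keep the previous default buffer type for every backend:
    ggml_backend_sched_t sched_default = ggml_backend_sched_new(backends, NULL, 2, /*graph_size=*/2048);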
ggml-backend.h (2 changed lines)

@@ -149,7 +149,7 @@ extern "C" {
     typedef struct ggml_backend_sched * ggml_backend_sched_t;
 
     // Initialize a backend scheduler
-    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends, size_t graph_size);
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
     GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
     // Initialize backend buffers from a measure graph
     GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
llama.cpp (15 changed lines)

@@ -9368,10 +9368,21 @@ struct llama_context * llama_new_context_with_model(
         }
 
         {
+            // buffer types used for the compute buffer of each backend
+            std::vector<ggml_backend_buffer_type_t> backend_buft;
+            for (auto * backend : ctx->backends) {
+                if (ggml_backend_is_cpu(backend)) {
+                    // use host buffers for the CPU backend compute buffer
+                    backend_buft.push_back(llama_default_buffer_type_cpu(true));
+                } else {
+                    backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
+                }
+            }
+
             // buffer used to store the computation graph and the tensor meta data
             ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
 
-            ctx->sched = ggml_backend_sched_new(ctx->backends.data(), ctx->backends.size(), LLAMA_MAX_NODES);
+            ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
             ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
 
             // build worst-case graph
@@ -9390,7 +9401,7 @@ struct llama_context * llama_new_context_with_model(
             for (ggml_backend_t backend : ctx->backends) {
                 ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
                 LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
-                        ggml_backend_name(backend),
+                        ggml_backend_buffer_name(buf),
                         ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
             }
         }
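For reference, llama_default_buffer_type_cpu(true) is what routes the CPU compute buffer into a host buffer. A plausible shape for it, as a sketch under the assumption of a cuBLAS build (not a verbatim copy of the source):

    static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
        (void) host_buffer; // unused in CPU-only builds
    #if defined(GGML_USE_CUBLAS)
        // pinned host memory only pays off when data is copied to/from the GPU
        if (host_buffer) {
            return ggml_backend_cuda_host_buffer_type();
        }
    #endif
        return ggml_backend_cpu_buffer_type();
    }

The logging change in the second hunk (ggml_backend_name -> ggml_backend_buffer_name) makes the selection visible at startup: the compute buffer line now reports the buffer type actually chosen rather than the backend that owns it.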