use a host buffer for the cpu compute buffer for faster copies to the gpu

This commit is contained in:
slaren 2024-01-12 17:40:55 +01:00
parent 458674c022
commit 53ae0dd862
5 changed files with 35 additions and 11 deletions

View file

@ -265,9 +265,9 @@ ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
return alloc;
}
ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
// create a backend buffer to get the correct tensor allocation sizes
ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 1);
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
// TODO: move alloc initialization to a common ggml_tallocr_new_impl function
ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
@ -277,13 +277,22 @@ ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backe
return alloc;
}
ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size);
ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
}
ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
// create a backend buffer to get the correct tensor allocation sizes
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
alloc->buffer_owned = true;
return alloc;
}
ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
}
ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));

View file

@ -52,8 +52,10 @@ typedef struct ggml_tallocr * ggml_tallocr_t;
GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);

View file

@ -776,6 +776,7 @@ struct ggml_backend_sched {
int n_backends;
ggml_backend_t backends[GGML_MAX_BACKENDS];
ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
ggml_gallocr_t galloc;
@ -1334,7 +1335,7 @@ static void sched_reset(ggml_backend_sched_t sched) {
sched->is_reset = true;
}
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends, size_t graph_size) {
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
@ -1348,13 +1349,14 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_bac
sched->n_backends = n_backends;
for (int i = 0; i < n_backends; i++) {
sched->backends[i] = backends[i];
sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
}
sched->galloc = ggml_gallocr_new();
// init measure allocs for each backend
for (int i = 0; i < n_backends; i++) {
sched->tallocs[i] = ggml_tallocr_new_measure_from_backend(backends[i]);
sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
}
sched_reset(sched);
@ -1387,7 +1389,7 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
for (int i = 0; i < sched->n_backends; i++) {
size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
ggml_tallocr_free(sched->tallocs[i]);
sched->tallocs[i] = ggml_tallocr_new_from_backend(sched->backends[i], size);
sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
}
sched_reset(sched);

View file

@ -149,7 +149,7 @@ extern "C" {
typedef struct ggml_backend_sched * ggml_backend_sched_t;
// Initialize a backend scheduler
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends, size_t graph_size);
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph
GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);

View file

@ -9368,10 +9368,21 @@ struct llama_context * llama_new_context_with_model(
}
{
// buffer types used for the compute buffer of each backend
std::vector<ggml_backend_buffer_type_t> backend_buft;
for (auto * backend : ctx->backends) {
if (ggml_backend_is_cpu(backend)) {
// use host buffers for the CPU backend compute buffer
backend_buft.push_back(llama_default_buffer_type_cpu(true));
} else {
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
}
}
// buffer used to store the computation graph and the tensor meta data
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), ctx->backends.size(), LLAMA_MAX_NODES);
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
// build worst-case graph
@ -9390,7 +9401,7 @@ struct llama_context * llama_new_context_with_model(
for (ggml_backend_t backend : ctx->backends) {
ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
ggml_backend_name(backend),
ggml_backend_buffer_name(buf),
ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
}
}