backend : add event API

This commit is contained in:
slaren 2024-01-21 17:55:12 +01:00
parent 940c01eb09
commit 963a122398
7 changed files with 133 additions and 30 deletions

View file

@ -202,7 +202,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");

View file

@ -320,6 +320,11 @@ struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) {
}
void ggml_tallocr_set_buffer(ggml_tallocr_t talloc, struct ggml_backend_buffer * buffer) {
GGML_ASSERT(talloc->measure == false);
// FIXME: buffer ownership semantics
// if the user is doing this, they probably want to take ownership of the buffer
// or they need to restore the original buffer before freeing the allocator
//talloc->buffer_owned = false;
talloc->buffer = buffer;
talloc->base = ggml_backend_buffer_get_base(buffer);
talloc->alignment = ggml_backend_buffer_get_alignment(buffer);

View file

@ -85,7 +85,7 @@ extern "C" {
// (optional) complete all pending operations
void (*GGML_CALL synchronize)(ggml_backend_t backend);
// compute graph with a plan
// compute graph with a plan (not used currently)
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
@ -95,14 +95,25 @@ extern "C" {
// check if the backend supports an operation
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
// (optional) event synchronization
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
void (*GGML_CALL event_free) (ggml_backend_event_t event);
void (*GGML_CALL event_record) (ggml_backend_event_t event);
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
};
// a backend instance: a dispatch table of backend operations (iface)
// plus an opaque pointer to backend-specific state (context)
struct ggml_backend {
    struct ggml_backend_i iface;
    ggml_backend_context_t context;
};
// a synchronization event: owned by the backend that created it (backend),
// with a backend-specific handle in context (e.g. a cudaEvent_t for CUDA)
struct ggml_backend_event {
    ggml_backend_t backend;
    void * context;
};
//
// Backend registry
//

View file

@ -303,6 +303,28 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
}
}
// events
// create a new synchronization event for the given backend
// returns NULL if the backend does not support events
// (event support is optional; e.g. the CPU backend leaves event_new NULL)
ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
    if (backend->iface.event_new == NULL) {
        return NULL;
    }
    return backend->iface.event_new(backend);
}
// free an event previously created with ggml_backend_event_new
// safe to call with NULL
void ggml_backend_event_free(ggml_backend_event_t event) {
    if (event == NULL) {
        return;
    }
    // the backend implementation owns the event object and releases it in
    // iface.event_free (e.g. the CUDA backend allocates it with `new` and
    // `delete`s it there) — calling free() here as well would be a
    // double free on memory that was never obtained from malloc
    event->backend->iface.event_free(event);
}
// record the event in the stream of the backend that created it
// can only be called from the backend that created the event
void ggml_backend_event_record(ggml_backend_event_t event) {
    // event support is optional (e.g. the CPU backend sets this to NULL);
    // fail loudly instead of calling through a NULL function pointer
    GGML_ASSERT(event->backend->iface.event_record != NULL);
    event->backend->iface.event_record(event);
}
// block the host until the event has been signaled
// can only be called from the backend that created the event
void ggml_backend_event_synchronize(ggml_backend_event_t event) {
    // event support is optional; fail loudly instead of calling through NULL
    GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
    event->backend->iface.event_synchronize(event);
}
// make the given backend wait for the event before executing further work
// unlike record/synchronize, this can be called from any backend
void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
    // event support is optional; fail loudly instead of calling through NULL
    GGML_ASSERT(backend->iface.event_wait != NULL);
    backend->iface.event_wait(backend, event);
}
// backend registry
@ -716,6 +738,11 @@ static struct ggml_backend_i cpu_backend_i = {
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
/* .supports_op = */ ggml_backend_cpu_supports_op,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .event_synchronize = */ NULL,
};
ggml_backend_t ggml_backend_cpu_init(void) {
@ -853,6 +880,8 @@ static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_
return sched->tallocs[i];
}
}
fprintf(stderr, "%s: error: no backend supports buffer type %s\n", __func__, ggml_backend_buffer_name(buffer));
GGML_ASSERT(false && "tensor buffer type not supported by any backend");
}
@ -1336,7 +1365,6 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
ggml_graph_dump_dot(split->graph, NULL, split_filename);
#endif
uint64_t compute_start_us = ggml_time_us();
if (!sched->callback_eval) {
ggml_backend_graph_compute(split_backend, &split->graph);

View file

@ -9,6 +9,7 @@ extern "C" {
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend_event * ggml_backend_event_t;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;
@ -47,7 +48,6 @@ extern "C" {
// Backend
//
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
GGML_API void ggml_backend_free(ggml_backend_t backend);
@ -74,6 +74,13 @@ extern "C" {
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t src_backend, ggml_backend_t dst_backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
// events
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
GGML_API void ggml_backend_event_record (ggml_backend_event_t event); // can only be called from the backend that created the event
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); // can only be called from the backend that created the event
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // can be called from any backend
//
// CPU backend
//
@ -118,17 +125,21 @@ extern "C" {
/*
Example usage:
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
// operations that use tensors allocated in a buffer with USAGE_WEIGHTS
// will be assigned preferably to run on the buffer backend by ggml_backend_sched
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE);
// sched is initialized with measure allocators and cannot be used until allocated with a measure graph
// initialize buffers from a measure graph
measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
// in build_graph:
build_graph(...) {
void build_graph(...) {
// allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
ggml_allocr_alloc(alloc_cpu, tensor);
alloc_cpu = ggml_backend_sched_get_tallocr(sched, backend_cpu);
ggml_tallocr_alloc(alloc_cpu, tensor);
// manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
@ -143,6 +154,7 @@ extern "C" {
// compute
graph = build_graph(sched);
ggml_backend_sched_graph_compute(sched, graph);
*/
struct ggml_backend_sched;

View file

@ -11070,6 +11070,58 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
UNUSED(backend);
}
// create a CUDA event (timing disabled) on the backend's device and wrap it
// in a ggml_backend_event owned by this backend
static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
    ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
    ggml_cuda_set_device(ctx->device);

    // cudaEventDisableTiming: we only need synchronization, not timestamps
    cudaEvent_t cuda_event;
    CUDA_CHECK(cudaEventCreateWithFlags(&cuda_event, cudaEventDisableTiming));

    ggml_backend_event * result = new ggml_backend_event;
    result->backend = backend;
    result->context = cuda_event;
    return result;
}
// destroy the underlying CUDA event, then free the wrapper object
static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
    cudaEvent_t cuda_event = (cudaEvent_t)event->context;
    CUDA_CHECK(cudaEventDestroy(cuda_event));
    delete event;
}
// record the event on the main stream of the device that owns it
static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
    ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)event->backend->context;
    ggml_cuda_set_device(ctx->device);
    cudaStream_t stream = g_cudaStreams[ctx->device][0];
    CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, stream));
}
// make this CUDA backend's stream wait for an event, which may have been
// created by a different backend
static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
    if (ggml_backend_is_cuda(event->backend)) {
        // CUDA event: wait on-device without blocking the host
        ggml_cuda_set_device(cuda_ctx->device);
        CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0));
    } else {
        // event from a non-CUDA backend: enqueue a host callback that blocks
        // until the foreign event is signaled
        // (captureless lambda converts to the plain function pointer
        // cudaLaunchHostFunc expects)
        auto wait_fn = [](void * user_data) {
            ggml_backend_event_t event = (ggml_backend_event_t)user_data;
            ggml_backend_event_synchronize(event);
        };
        CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event));
    }
}
// block the host until the event has been signaled
// can only be called from the backend that created the event
static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
    // NOTE: the original asserted `backend == event->backend`, but this
    // function has no `backend` parameter — the assert referenced an
    // undeclared identifier and only compiled when NDEBUG elided it
    CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
}
static ggml_backend_i ggml_backend_cuda_interface = {
/* .get_name = */ ggml_backend_cuda_name,
/* .free = */ ggml_backend_cuda_free,
@ -11083,6 +11135,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
/* .supports_op = */ ggml_backend_cuda_supports_op,
/* .event_new = */ ggml_backend_cuda_event_new,
/* .event_free = */ ggml_backend_cuda_event_free,
/* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait,
/* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
};
GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {

View file

@ -6601,8 +6601,6 @@ static int llama_decode_internal(
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
//const auto n_batch = cparams.n_batch;
GGML_ASSERT((!all_batch.token && all_batch.embd) || (all_batch.token && !all_batch.embd)); // NOLINT
GGML_ASSERT(n_tokens_all <= cparams.n_ctx);
@ -6623,16 +6621,6 @@ static int llama_decode_internal(
auto * logits_out = lctx.logits;
/*
if (all_batch.logits) {
logits_out.resize(n_vocab * n_tokens_all);
} else if (lctx.logits_all) {
logits_out.resize(n_vocab * n_tokens_all);
} else {
logits_out.resize(n_vocab);
}
*/
#ifndef NDEBUG
auto & logits_valid = lctx.logits_valid;
logits_valid.clear();
@ -6643,7 +6631,8 @@ static int llama_decode_internal(
const uint32_t n_ubatch = cparams.n_ubatch;
//const uint32_t n_microbatch = 256;
//printf("n_tokens_all = %u, n_ubatch = %u\n", n_tokens_all, n_ubatch);
for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
@ -10016,9 +10005,17 @@ struct llama_context * llama_new_context_with_model(
LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
ctx->alloc_cpu = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
for (ggml_backend_t backend : ctx->backends) {
ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
ggml_backend_buffer_name(buf),
ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
}
// duplicate cpu buffers for microbatching
const int n_ub = 16;
const int n_ub = (cparams.n_batch + cparams.n_ubatch - 1) / cparams.n_ubatch;
ctx->n_compute_bufs = n_ub;
LLAMA_LOG_INFO("%s: allocating %d compute buffers\n", __func__, n_ub);
for (ggml_backend_t b : ctx->backends) {
ggml_tallocr_t alloc = ggml_backend_sched_get_tallocr(ctx->sched, b);
@ -10049,13 +10046,6 @@ struct llama_context * llama_new_context_with_model(
LLAMA_LOG_INFO("%s: logits buffer size = %8.2f MiB, type = %s\n", __func__,
ggml_backend_buffer_get_size(ctx->buf_logits) / 1024.0 / 1024.0,
ggml_backend_buffer_name(ctx->buf_logits));
for (ggml_backend_t backend : ctx->backends) {
ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
ggml_backend_buffer_name(buf),
ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
}
}
}