backend : add event API
This commit is contained in:
parent
940c01eb09
commit
963a122398
7 changed files with 133 additions and 30 deletions
|
@ -202,7 +202,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
|||
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
||||
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
|
||||
printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
|
||||
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
|
||||
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
||||
|
|
|
@ -320,6 +320,11 @@ struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) {
|
|||
}
|
||||
|
||||
void ggml_tallocr_set_buffer(ggml_tallocr_t talloc, struct ggml_backend_buffer * buffer) {
|
||||
GGML_ASSERT(talloc->measure == false);
|
||||
// FIXME: buffer ownership semantics
|
||||
// if the user is doing this, they probably want to take ownership of the buffer
|
||||
// or they need to restore the original buffer before freeing the allocator
|
||||
//talloc->buffer_owned = false;
|
||||
talloc->buffer = buffer;
|
||||
talloc->base = ggml_backend_buffer_get_base(buffer);
|
||||
talloc->alignment = ggml_backend_buffer_get_alignment(buffer);
|
||||
|
|
|
@ -85,7 +85,7 @@ extern "C" {
|
|||
// (optional) complete all pending operations
|
||||
void (*GGML_CALL synchronize)(ggml_backend_t backend);
|
||||
|
||||
// compute graph with a plan
|
||||
// compute graph with a plan (not used currently)
|
||||
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
||||
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
|
@ -95,14 +95,25 @@ extern "C" {
|
|||
|
||||
// check if the backend supports an operation
|
||||
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
|
||||
// (optional) event synchronization
|
||||
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
|
||||
void (*GGML_CALL event_free) (ggml_backend_event_t event);
|
||||
void (*GGML_CALL event_record) (ggml_backend_event_t event);
|
||||
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
|
||||
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
|
||||
};
|
||||
|
||||
struct ggml_backend {
|
||||
struct ggml_backend_i iface;
|
||||
|
||||
ggml_backend_context_t context;
|
||||
};
|
||||
|
||||
struct ggml_backend_event {
|
||||
ggml_backend_t backend;
|
||||
void * context;
|
||||
};
|
||||
|
||||
//
|
||||
// Backend registry
|
||||
//
|
||||
|
|
|
@ -303,6 +303,28 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
|
|||
}
|
||||
}
|
||||
|
||||
// events
|
||||
|
||||
ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
|
||||
return backend->iface.event_new(backend);
|
||||
}
|
||||
|
||||
void ggml_backend_event_free(ggml_backend_event_t event) {
|
||||
event->backend->iface.event_free(event);
|
||||
free(event);
|
||||
}
|
||||
|
||||
void ggml_backend_event_record(ggml_backend_event_t event) {
|
||||
event->backend->iface.event_record(event);
|
||||
}
|
||||
|
||||
void ggml_backend_event_synchronize(ggml_backend_event_t event) {
|
||||
event->backend->iface.event_synchronize(event);
|
||||
}
|
||||
|
||||
void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
||||
backend->iface.event_wait(backend, event);
|
||||
}
|
||||
|
||||
// backend registry
|
||||
|
||||
|
@ -716,6 +738,11 @@ static struct ggml_backend_i cpu_backend_i = {
|
|||
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
||||
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_cpu_supports_op,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .event_synchronize = */ NULL,
|
||||
};
|
||||
|
||||
ggml_backend_t ggml_backend_cpu_init(void) {
|
||||
|
@ -853,6 +880,8 @@ static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_
|
|||
return sched->tallocs[i];
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: error: no backend supports buffer type %s\n", __func__, ggml_backend_buffer_name(buffer));
|
||||
GGML_ASSERT(false && "tensor buffer type not supported by any backend");
|
||||
}
|
||||
|
||||
|
@ -1336,7 +1365,6 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
|
|||
ggml_graph_dump_dot(split->graph, NULL, split_filename);
|
||||
#endif
|
||||
|
||||
|
||||
uint64_t compute_start_us = ggml_time_us();
|
||||
if (!sched->callback_eval) {
|
||||
ggml_backend_graph_compute(split_backend, &split->graph);
|
||||
|
|
|
@ -9,6 +9,7 @@ extern "C" {
|
|||
|
||||
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
|
||||
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
|
||||
typedef struct ggml_backend_event * ggml_backend_event_t;
|
||||
typedef struct ggml_backend * ggml_backend_t;
|
||||
typedef void * ggml_backend_graph_plan_t;
|
||||
|
||||
|
@ -47,7 +48,6 @@ extern "C" {
|
|||
// Backend
|
||||
//
|
||||
|
||||
|
||||
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_free(ggml_backend_t backend);
|
||||
|
||||
|
@ -74,6 +74,13 @@ extern "C" {
|
|||
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t src_backend, ggml_backend_t dst_backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
|
||||
|
||||
// events
|
||||
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
|
||||
GGML_API void ggml_backend_event_record (ggml_backend_event_t event); // can only be called from the backend that created the event
|
||||
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); // can only be called from the backend that created the event
|
||||
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // can be called from any backend
|
||||
|
||||
//
|
||||
// CPU backend
|
||||
//
|
||||
|
@ -118,17 +125,21 @@ extern "C" {
|
|||
/*
|
||||
Example usage:
|
||||
|
||||
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
|
||||
// operations that use tensors allocated in a buffer with USAGE_WEIGHTS
|
||||
// will be assigned preferrably to run on the buffer backend by ggml_backend_sched
|
||||
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||
|
||||
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE);
|
||||
// sched is initialized with measure allocators and cannot be used until allocated with a measure graph
|
||||
|
||||
// initialize buffers from a measure graph
|
||||
measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
|
||||
|
||||
// in build_graph:
|
||||
build_graph(...) {
|
||||
void build_graph(...) {
|
||||
// allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
|
||||
alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
|
||||
ggml_allocr_alloc(alloc_cpu, tensor);
|
||||
alloc_cpu = ggml_backend_sched_get_tallocr(sched, backend_cpu);
|
||||
ggml_tallocr_alloc(alloc_cpu, tensor);
|
||||
|
||||
// manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
|
||||
struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
|
||||
|
@ -143,6 +154,7 @@ extern "C" {
|
|||
// compute
|
||||
graph = build_graph(sched);
|
||||
ggml_backend_sched_graph_compute(sched, graph);
|
||||
|
||||
*/
|
||||
|
||||
struct ggml_backend_sched;
|
||||
|
|
57
ggml-cuda.cu
57
ggml-cuda.cu
|
@ -11070,6 +11070,58 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
|
||||
cudaEvent_t event;
|
||||
CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
|
||||
|
||||
return new ggml_backend_event {
|
||||
/* .backend = */ backend,
|
||||
/* .context = */ event,
|
||||
};
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
|
||||
CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context));
|
||||
|
||||
delete event;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_event_record(ggml_backend_event_t event) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context;
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
|
||||
CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, g_cudaStreams[cuda_ctx->device][0]));
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
if (ggml_backend_is_cuda(event->backend)) {
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
|
||||
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0));
|
||||
} else {
|
||||
auto wait_fn = [](void * user_data) {
|
||||
ggml_backend_event_t event = (ggml_backend_event_t)user_data;
|
||||
ggml_backend_event_synchronize(event);
|
||||
};
|
||||
|
||||
CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event));
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
|
||||
assert(backend == event->backend);
|
||||
|
||||
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
|
||||
}
|
||||
|
||||
static ggml_backend_i ggml_backend_cuda_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_name,
|
||||
/* .free = */ ggml_backend_cuda_free,
|
||||
|
@ -11083,6 +11135,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
|
|||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
||||
/* .event_new = */ ggml_backend_cuda_event_new,
|
||||
/* .event_free = */ ggml_backend_cuda_event_free,
|
||||
/* .event_record = */ ggml_backend_cuda_event_record,
|
||||
/* .event_wait = */ ggml_backend_cuda_event_wait,
|
||||
/* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
|
||||
};
|
||||
|
||||
GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
|
||||
|
|
32
llama.cpp
32
llama.cpp
|
@ -6601,8 +6601,6 @@ static int llama_decode_internal(
|
|||
const auto & hparams = model.hparams;
|
||||
const auto & cparams = lctx.cparams;
|
||||
|
||||
//const auto n_batch = cparams.n_batch;
|
||||
|
||||
GGML_ASSERT((!all_batch.token && all_batch.embd) || (all_batch.token && !all_batch.embd)); // NOLINT
|
||||
|
||||
GGML_ASSERT(n_tokens_all <= cparams.n_ctx);
|
||||
|
@ -6623,16 +6621,6 @@ static int llama_decode_internal(
|
|||
|
||||
auto * logits_out = lctx.logits;
|
||||
|
||||
/*
|
||||
if (all_batch.logits) {
|
||||
logits_out.resize(n_vocab * n_tokens_all);
|
||||
} else if (lctx.logits_all) {
|
||||
logits_out.resize(n_vocab * n_tokens_all);
|
||||
} else {
|
||||
logits_out.resize(n_vocab);
|
||||
}
|
||||
*/
|
||||
|
||||
#ifndef NDEBUG
|
||||
auto & logits_valid = lctx.logits_valid;
|
||||
logits_valid.clear();
|
||||
|
@ -6643,7 +6631,8 @@ static int llama_decode_internal(
|
|||
|
||||
|
||||
const uint32_t n_ubatch = cparams.n_ubatch;
|
||||
//const uint32_t n_microbatch = 256;
|
||||
|
||||
//printf("n_tokens_all = %u, n_ubatch = %u\n", n_tokens_all, n_ubatch);
|
||||
|
||||
for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
|
||||
const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
|
||||
|
@ -10016,9 +10005,17 @@ struct llama_context * llama_new_context_with_model(
|
|||
LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
|
||||
ctx->alloc_cpu = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
|
||||
|
||||
for (ggml_backend_t backend : ctx->backends) {
|
||||
ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
|
||||
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
||||
ggml_backend_buffer_name(buf),
|
||||
ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
|
||||
}
|
||||
|
||||
// duplicate cpu buffers for microbatching
|
||||
const int n_ub = 16;
|
||||
const int n_ub = (cparams.n_batch + cparams.n_ubatch - 1) / cparams.n_ubatch;
|
||||
ctx->n_compute_bufs = n_ub;
|
||||
LLAMA_LOG_INFO("%s: allocating %d compute buffers\n", __func__, n_ub);
|
||||
|
||||
for (ggml_backend_t b : ctx->backends) {
|
||||
ggml_tallocr_t alloc = ggml_backend_sched_get_tallocr(ctx->sched, b);
|
||||
|
@ -10049,13 +10046,6 @@ struct llama_context * llama_new_context_with_model(
|
|||
LLAMA_LOG_INFO("%s: logits buffer size = %8.2f MiB, type = %s\n", __func__,
|
||||
ggml_backend_buffer_get_size(ctx->buf_logits) / 1024.0 / 1024.0,
|
||||
ggml_backend_buffer_name(ctx->buf_logits));
|
||||
|
||||
for (ggml_backend_t backend : ctx->backends) {
|
||||
ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
|
||||
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
||||
ggml_backend_buffer_name(buf),
|
||||
ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue