diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index be6a4860b..f6eb515a1 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -202,7 +202,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); - printf(" -ts, --tensor_split (default: 0)\n"); + printf(" -ts, --tensor-split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); diff --git a/ggml-alloc.c b/ggml-alloc.c index beb557997..410ce1626 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -320,6 +320,11 @@ struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) { } void ggml_tallocr_set_buffer(ggml_tallocr_t talloc, struct ggml_backend_buffer * buffer) { + GGML_ASSERT(talloc->measure == false); + // FIXME: buffer ownership semantics + // if the user is doing this, they probably want to take ownership of the buffer + // or they need to restore the original buffer before freeing the allocator + //talloc->buffer_owned = false; talloc->buffer = buffer; talloc->base = ggml_backend_buffer_get_base(buffer); talloc->alignment = ggml_backend_buffer_get_alignment(buffer); diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index 8e4592bcd..fb4980d94 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -85,7 +85,7 @@ extern "C" { // (optional) complete all pending operations void (*GGML_CALL synchronize)(ggml_backend_t backend); - // compute graph with a plan + // compute graph with a plan (not 
used currently) ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); void (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); @@ -95,14 +95,25 @@ extern "C" { // check if the backend supports an operation bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); + + // (optional) event synchronization + ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend); + void (*GGML_CALL event_free) (ggml_backend_event_t event); + void (*GGML_CALL event_record) (ggml_backend_event_t event); + void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event); + void (*GGML_CALL event_synchronize) (ggml_backend_event_t event); }; struct ggml_backend { struct ggml_backend_i iface; - ggml_backend_context_t context; }; + struct ggml_backend_event { + ggml_backend_t backend; + void * context; + }; + // // Backend registry // diff --git a/ggml-backend.c b/ggml-backend.c index 278497096..f1358c07a 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -303,6 +303,28 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b } } +// events + +ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) { + return backend->iface.event_new(backend); +} + +void ggml_backend_event_free(ggml_backend_event_t event) { + // note: the backend's event_free implementation owns and frees the event struct (CUDA allocates it with new); calling free() here as well was a double free / free-delete mismatch + event->backend->iface.event_free(event); +} + +void ggml_backend_event_record(ggml_backend_event_t event) { + event->backend->iface.event_record(event); +} + +void ggml_backend_event_synchronize(ggml_backend_event_t event) { + event->backend->iface.event_synchronize(event); +} + +void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { + backend->iface.event_wait(backend, event); +} // backend registry @@ -716,6 +738,11 @@ static struct ggml_backend_i
cpu_backend_i = { /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, /* .graph_compute = */ ggml_backend_cpu_graph_compute, /* .supports_op = */ ggml_backend_cpu_supports_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, + /* .event_synchronize = */ NULL, }; ggml_backend_t ggml_backend_cpu_init(void) { @@ -853,6 +880,8 @@ static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_ return sched->tallocs[i]; } } + + fprintf(stderr, "%s: error: no backend supports buffer type %s\n", __func__, ggml_backend_buffer_name(buffer)); GGML_ASSERT(false && "tensor buffer type not supported by any backend"); } @@ -1336,7 +1365,6 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { ggml_graph_dump_dot(split->graph, NULL, split_filename); #endif - uint64_t compute_start_us = ggml_time_us(); if (!sched->callback_eval) { ggml_backend_graph_compute(split_backend, &split->graph); diff --git a/ggml-backend.h b/ggml-backend.h index 05666380c..03f7dc1c4 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -9,6 +9,7 @@ extern "C" { typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; typedef struct ggml_backend_buffer * ggml_backend_buffer_t; + typedef struct ggml_backend_event * ggml_backend_event_t; typedef struct ggml_backend * ggml_backend_t; typedef void * ggml_backend_graph_plan_t; @@ -47,7 +48,6 @@ extern "C" { // Backend // - GGML_API const char * ggml_backend_name(ggml_backend_t backend); GGML_API void ggml_backend_free(ggml_backend_t backend); @@ -74,6 +74,13 @@ extern "C" { GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t src_backend, ggml_backend_t dst_backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy + // events + GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend); + 
GGML_API void ggml_backend_event_free (ggml_backend_event_t event); + GGML_API void ggml_backend_event_record (ggml_backend_event_t event); // can only be called from the backend that created the event + GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); // can only be called from the backend that created the event + GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // can be called from any backend + // // CPU backend // @@ -118,17 +125,21 @@ extern "C" { /* Example usage: - sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends); + // operations that use tensors allocated in a buffer with USAGE_WEIGHTS + // will be assigned preferrably to run on the buffer backend by ggml_backend_sched + ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + + sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE); // sched is initialized with measure allocators and cannot be used until allocated with a measure graph // initialize buffers from a measure graph measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed // in build_graph: - build_graph(...) { + void build_graph(...) 
{ // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer) - alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu); - ggml_allocr_alloc(alloc_cpu, tensor); + alloc_cpu = ggml_backend_sched_get_tallocr(sched, backend_cpu); + ggml_tallocr_alloc(alloc_cpu, tensor); // manually assigning nodes to a backend (optional, shouldn't be needed in most cases) struct ggml_tensor * node = ggml_mul_mat(ctx, ...); @@ -143,6 +154,7 @@ extern "C" { // compute graph = build_graph(sched); ggml_backend_sched_graph_compute(sched, graph); + */ struct ggml_backend_sched; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 4dd112915..b11dd8cdb 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -11070,6 +11070,58 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons UNUSED(backend); } +static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + + ggml_cuda_set_device(cuda_ctx->device); + + cudaEvent_t event; + CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + + return new ggml_backend_event { + /* .backend = */ backend, + /* .context = */ event, + }; +} + +static void ggml_backend_cuda_event_free(ggml_backend_event_t event) { + CUDA_CHECK(cudaEventDestroy((cudaEvent_t)event->context)); + + delete event; +} + +static void ggml_backend_cuda_event_record(ggml_backend_event_t event) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context; + + ggml_cuda_set_device(cuda_ctx->device); + + CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, g_cudaStreams[cuda_ctx->device][0])); +} + +static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + + if (ggml_backend_is_cuda(event->backend)) { + + 
ggml_cuda_set_device(cuda_ctx->device); + + CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0)); + } else { + auto wait_fn = [](void * user_data) { + ggml_backend_event_t event = (ggml_backend_event_t)user_data; + ggml_backend_event_synchronize(event); + }; + + CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event)); + } +} + +static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) { + GGML_ASSERT(event->backend != NULL); // fixed: previous assert referenced 'backend', which is not declared in this function + + CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context)); +} + static ggml_backend_i ggml_backend_cuda_interface = { /* .get_name = */ ggml_backend_cuda_name, /* .free = */ ggml_backend_cuda_free, @@ -11083,6 +11135,11 @@ static ggml_backend_i ggml_backend_cuda_interface = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .supports_op = */ ggml_backend_cuda_supports_op, + /* .event_new = */ ggml_backend_cuda_event_new, + /* .event_free = */ ggml_backend_cuda_event_free, + /* .event_record = */ ggml_backend_cuda_event_record, + /* .event_wait = */ ggml_backend_cuda_event_wait, + /* .event_synchronize = */ ggml_backend_cuda_event_synchronize, }; GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) { diff --git a/llama.cpp b/llama.cpp index 697030bd3..6479d1064 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6601,8 +6601,6 @@ static int llama_decode_internal( const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; - //const auto n_batch = cparams.n_batch; - GGML_ASSERT((!all_batch.token && all_batch.embd) || (all_batch.token && !all_batch.embd)); // NOLINT GGML_ASSERT(n_tokens_all <= cparams.n_ctx); @@ -6623,16 +6621,6 @@ static int llama_decode_internal( auto * logits_out = lctx.logits; - /* - if (all_batch.logits) { - logits_out.resize(n_vocab * n_tokens_all); - } else if (lctx.logits_all) { - logits_out.resize(n_vocab * n_tokens_all); - } else { - logits_out.resize(n_vocab); - } -
*/ - #ifndef NDEBUG auto & logits_valid = lctx.logits_valid; logits_valid.clear(); @@ -6643,7 +6631,8 @@ static int llama_decode_internal( const uint32_t n_ubatch = cparams.n_ubatch; - //const uint32_t n_microbatch = 256; + + //printf("n_tokens_all = %u, n_ubatch = %u\n", n_tokens_all, n_ubatch); for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) { const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token); @@ -10016,9 +10005,17 @@ struct llama_context * llama_new_context_with_model( LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits); ctx->alloc_cpu = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu); + for (ggml_backend_t backend : ctx->backends) { + ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend); + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buffer_name(buf), + ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); + } + // duplicate cpu buffers for microbatching - const int n_ub = 16; + const int n_ub = (cparams.n_batch + cparams.n_ubatch - 1) / cparams.n_ubatch; ctx->n_compute_bufs = n_ub; + LLAMA_LOG_INFO("%s: allocating %d compute buffers\n", __func__, n_ub); for (ggml_backend_t b : ctx->backends) { ggml_tallocr_t alloc = ggml_backend_sched_get_tallocr(ctx->sched, b); @@ -10049,13 +10046,6 @@ struct llama_context * llama_new_context_with_model( LLAMA_LOG_INFO("%s: logits buffer size = %8.2f MiB, type = %s\n", __func__, ggml_backend_buffer_get_size(ctx->buf_logits) / 1024.0 / 1024.0, ggml_backend_buffer_name(ctx->buf_logits)); - - for (ggml_backend_t backend : ctx->backends) { - ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend); - LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - ggml_backend_buffer_name(buf), - ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); - } } }