From bc955204919ea8152b7443e7d48a48cc18dea448 Mon Sep 17 00:00:00 2001
From: Matthew Brost
Date: Thu, 14 Oct 2021 10:19:53 -0700
Subject: [PATCH] drm/i915/guc: Insert submit fences between requests in
 parent-child relationship

The GuC must receive requests in the order submitted for contexts in a
parent-child relationship to function correctly. To ensure this, insert
a submit fence between the current request and the last request
submitted for requests / contexts in a parent-child relationship. This
is conceptually similar to a single timeline.

Signed-off-by: Matthew Brost
Cc: John Harrison
Reviewed-by: John Harrison
Signed-off-by: John Harrison
Link: https://patchwork.freedesktop.org/patch/msgid/20211014172005.27155-14-matthew.brost@intel.com
---
 drivers/gpu/drm/i915/gt/intel_context.h       |   5 +
 drivers/gpu/drm/i915/gt/intel_context_types.h |   6 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c |   5 +-
 drivers/gpu/drm/i915/i915_request.c           | 130 +++++++++++++-----
 4 files changed, 113 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
index 9f0995150a7a..edf12caaade3 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.h
+++ b/drivers/gpu/drm/i915/gt/intel_context.h
@@ -77,6 +77,11 @@ intel_context_to_parent(struct intel_context *ce)
 	}
 }
 
+static inline bool intel_context_is_parallel(struct intel_context *ce)
+{
+	return intel_context_is_child(ce) || intel_context_is_parent(ce);
+}
+
 void intel_context_bind_parent_child(struct intel_context *parent,
 				     struct intel_context *child);
 
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
index 48decb5ee954..8309d1141d0a 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -237,6 +237,12 @@ struct intel_context {
 		};
 		/** @parent: pointer to parent if child */
 		struct intel_context *parent;
+		/**
+		 * @last_rq: last request submitted on a parallel context, used
+		 * to insert submit fences between requests in the parallel
+		 * context
+		 */
+		struct i915_request *last_rq;
 		/** @number_children: number of children if parent */
 		u8 number_children;
 		/** @guc: GuC specific members for parallel submission */
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 71ae5eb69849..ebb64fb50396 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -684,8 +684,7 @@ static inline int rq_prio(const struct i915_request *rq)
 
 static bool is_multi_lrc_rq(struct i915_request *rq)
 {
-	return intel_context_is_child(rq->context) ||
-		intel_context_is_parent(rq->context);
+	return intel_context_is_parallel(rq->context);
 }
 
 static bool can_merge_rq(struct i915_request *rq,
@@ -2873,6 +2872,8 @@ static void guc_parent_context_unpin(struct intel_context *ce)
 	GEM_BUG_ON(!intel_context_is_parent(ce));
 	GEM_BUG_ON(!intel_engine_is_virtual(ce->engine));
 
+	if (ce->parallel.last_rq)
+		i915_request_put(ce->parallel.last_rq);
 	unpin_guc_id(guc, ce);
 	lrc_unpin(ce);
 }
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index ed64fa9defdf..d29e46a001b4 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1549,6 +1549,91 @@ i915_request_await_object(struct i915_request *to,
 	return ret;
 }
 
+static inline bool is_parallel_rq(struct i915_request *rq)
+{
+	return intel_context_is_parallel(rq->context);
+}
+
+static inline struct intel_context *request_to_parent(struct i915_request *rq)
+{
+	return intel_context_to_parent(rq->context);
+}
+
+static struct i915_request *
+__i915_request_ensure_parallel_ordering(struct i915_request *rq,
+					struct intel_timeline *timeline)
+{
+	struct i915_request *prev;
+
+	GEM_BUG_ON(!is_parallel_rq(rq));
+
+	prev = request_to_parent(rq)->parallel.last_rq;
+	if (prev) {
+		if (!__i915_request_is_complete(prev)) {
+			i915_sw_fence_await_sw_fence(&rq->submit,
+						     &prev->submit,
+						     &rq->submitq);
+
+			if (rq->engine->sched_engine->schedule)
+				__i915_sched_node_add_dependency(&rq->sched,
+								 &prev->sched,
+								 &rq->dep,
+								 0);
+		}
+		i915_request_put(prev);
+	}
+
+	request_to_parent(rq)->parallel.last_rq = i915_request_get(rq);
+
+	return to_request(__i915_active_fence_set(&timeline->last_request,
+						  &rq->fence));
+}
+
+static struct i915_request *
+__i915_request_ensure_ordering(struct i915_request *rq,
+			       struct intel_timeline *timeline)
+{
+	struct i915_request *prev;
+
+	GEM_BUG_ON(is_parallel_rq(rq));
+
+	prev = to_request(__i915_active_fence_set(&timeline->last_request,
+						  &rq->fence));
+
+	if (prev && !__i915_request_is_complete(prev)) {
+		bool uses_guc = intel_engine_uses_guc(rq->engine);
+		bool pow2 = is_power_of_2(READ_ONCE(prev->engine)->mask |
+					  rq->engine->mask);
+		bool same_context = prev->context == rq->context;
+
+		/*
+		 * The requests are supposed to be kept in order. However,
+		 * we need to be wary in case the timeline->last_request
+		 * is used as a barrier for external modification to this
+		 * context.
+		 */
+		GEM_BUG_ON(same_context &&
+			   i915_seqno_passed(prev->fence.seqno,
+					     rq->fence.seqno));
+
+		if ((same_context && uses_guc) || (!uses_guc && pow2))
+			i915_sw_fence_await_sw_fence(&rq->submit,
+						     &prev->submit,
+						     &rq->submitq);
+		else
+			__i915_sw_fence_await_dma_fence(&rq->submit,
+							&prev->fence,
+							&rq->dmaq);
+		if (rq->engine->sched_engine->schedule)
+			__i915_sched_node_add_dependency(&rq->sched,
+							 &prev->sched,
+							 &rq->dep,
+							 0);
+	}
+
+	return prev;
+}
+
 static struct i915_request *
 __i915_request_add_to_timeline(struct i915_request *rq)
 {
@@ -1574,38 +1659,21 @@ __i915_request_add_to_timeline(struct i915_request *rq)
 	 * complete (to maximise our greedy late load balancing) and this
 	 * precludes optimising to use semaphores serialisation of a single
 	 * timeline across engines.
+	 *
+	 * We do not order parallel submission requests on the timeline as each
+	 * parallel submission context has its own timeline and the ordering
+	 * rules for parallel requests are that they must be submitted in the
+	 * order received from the execbuf IOCTL. So rather than using the
+	 * timeline we store a pointer to last request submitted in the
+	 * relationship in the gem context and insert a submission fence
+	 * between that request and request passed into this function or
+	 * alternatively we use completion fence if gem context has a single
+	 * timeline and this is the first submission of an execbuf IOCTL.
 	 */
-	prev = to_request(__i915_active_fence_set(&timeline->last_request,
-						  &rq->fence));
-	if (prev && !__i915_request_is_complete(prev)) {
-		bool uses_guc = intel_engine_uses_guc(rq->engine);
-
-		/*
-		 * The requests are supposed to be kept in order. However,
-		 * we need to be wary in case the timeline->last_request
-		 * is used as a barrier for external modification to this
-		 * context.
-		 */
-		GEM_BUG_ON(prev->context == rq->context &&
-			   i915_seqno_passed(prev->fence.seqno,
-					     rq->fence.seqno));
-
-		if ((!uses_guc &&
-		     is_power_of_2(READ_ONCE(prev->engine)->mask | rq->engine->mask)) ||
-		    (uses_guc && prev->context == rq->context))
-			i915_sw_fence_await_sw_fence(&rq->submit,
-						     &prev->submit,
-						     &rq->submitq);
-		else
-			__i915_sw_fence_await_dma_fence(&rq->submit,
-							&prev->fence,
-							&rq->dmaq);
-		if (rq->engine->sched_engine->schedule)
-			__i915_sched_node_add_dependency(&rq->sched,
-							 &prev->sched,
-							 &rq->dep,
-							 0);
-	}
+	if (likely(!is_parallel_rq(rq)))
+		prev = __i915_request_ensure_ordering(rq, timeline);
+	else
+		prev = __i915_request_ensure_parallel_ordering(rq, timeline);
 
 	/*
 	 * Make sure that no request gazumped us - if it was allocated after
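
Editor's note: the sketch below models the ordering rule this patch adds, in plain
userspace C. It is a minimal illustration, not kernel code: struct fence, struct
request, struct parallel_context and submit_parallel_request() are hypothetical
stand-ins for i915_sw_fence, i915_request, intel_context and
__i915_request_ensure_parallel_ordering(), and all GuC back-end detail is omitted.
The point it demonstrates is the one the commit message states: each new request on
a parallel (parent-child) context waits on the submit fence of the last request
submitted on that context, and the context then holds a reference to the new request
as its last_rq.

/* Minimal userspace model of submit-fence chaining; hypothetical types. */
#include <stdio.h>
#include <stdlib.h>

struct fence {
	int signaled;			/* 1 once the request has been submitted */
};

struct request {
	int seqno;
	struct fence submit;		/* analogous to rq->submit */
	struct request *wait_on;	/* request whose submit fence we await */
	int refcount;
};

struct parallel_context {
	struct request *last_rq;	/* analogous to ce->parallel.last_rq */
};

static struct request *request_get(struct request *rq)
{
	if (rq)
		rq->refcount++;
	return rq;
}

static void request_put(struct request *rq)
{
	if (rq && --rq->refcount == 0)
		free(rq);
}

/*
 * Chain the new request behind the previously submitted one, then record the
 * new request as the context's last_rq (mirroring the flow of
 * __i915_request_ensure_parallel_ordering() above, in simplified form).
 */
static void submit_parallel_request(struct parallel_context *ce,
				    struct request *rq)
{
	struct request *prev = ce->last_rq;

	if (prev) {
		if (!prev->submit.signaled)
			rq->wait_on = prev;	/* submit-fence dependency */
		request_put(prev);
	}
	ce->last_rq = request_get(rq);
}

int main(void)
{
	struct parallel_context ce = { 0 };
	struct request *a = calloc(1, sizeof(*a));
	struct request *b = calloc(1, sizeof(*b));

	a->seqno = 1; a->refcount = 1;
	b->seqno = 2; b->refcount = 1;

	submit_parallel_request(&ce, a);
	submit_parallel_request(&ce, b);

	/* b must not reach the back end before a. */
	printf("request %d waits on request %d\n",
	       b->seqno, b->wait_on ? b->wait_on->seqno : 0);
	return 0;	/* cleanup omitted for brevity */
}

In the kernel, the dependency is expressed with i915_sw_fence_await_sw_fence() on
prev->submit plus, when a scheduler is present, an extra scheduling dependency via
__i915_sched_node_add_dependency(), as the new helper in the diff shows.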