diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index e381c1c73902..7b47e00fa082 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -227,6 +227,7 @@ struct intel_engine_execlists { * @queue: queue of requests, in priority lists */ struct rb_root_cached queue; + struct rb_root_cached virtual; /** * @csb_write: control register for Context Switch buffer @@ -445,6 +446,7 @@ struct intel_engine_cs { #define I915_ENGINE_HAS_PREEMPTION BIT(2) #define I915_ENGINE_HAS_SEMAPHORES BIT(3) #define I915_ENGINE_NEEDS_BREADCRUMB_TASKLET BIT(4) +#define I915_ENGINE_IS_VIRTUAL BIT(5) unsigned int flags; /* @@ -534,6 +536,12 @@ intel_engine_needs_breadcrumb_tasklet(const struct intel_engine_cs *engine) return engine->flags & I915_ENGINE_NEEDS_BREADCRUMB_TASKLET; } +static inline bool +intel_engine_is_virtual(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_IS_VIRTUAL; +} + #define instdone_slice_mask(dev_priv__) \ (IS_GEN(dev_priv__, 7) ? \ 1 : RUNTIME_INFO(dev_priv__)->sseu.slice_mask) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index f263a8374273..affa5e2dfce1 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -164,6 +164,42 @@ #define WA_TAIL_DWORDS 2 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS) +struct virtual_engine { + struct intel_engine_cs base; + struct intel_context context; + + /* + * We allow only a single request through the virtual engine at a time + * (each request in the timeline waits for the completion fence of + * the previous before being submitted). By restricting ourselves to + * only submitting a single request, each request is placed on to a + * physical to maximise load spreading (by virtue of the late greedy + * scheduling -- each real engine takes the next available request + * upon idling). + */ + struct i915_request *request; + + /* + * We keep a rbtree of available virtual engines inside each physical + * engine, sorted by priority. Here we preallocate the nodes we need + * for the virtual engine, indexed by physical_engine->id. + */ + struct ve_node { + struct rb_node rb; + int prio; + } nodes[I915_NUM_ENGINES]; + + /* And finally, which physical engines this virtual engine maps onto. */ + unsigned int num_siblings; + struct intel_engine_cs *siblings[0]; +}; + +static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) +{ + GEM_BUG_ON(!intel_engine_is_virtual(engine)); + return container_of(engine, struct virtual_engine, base); +} + static int execlists_context_deferred_alloc(struct intel_context *ce, struct intel_engine_cs *engine); static void execlists_init_reg_state(u32 *reg_state, @@ -216,7 +252,8 @@ static int queue_prio(const struct intel_engine_execlists *execlists) } static inline bool need_preempt(const struct intel_engine_cs *engine, - const struct i915_request *rq) + const struct i915_request *rq, + struct rb_node *rb) { int last_prio; @@ -251,6 +288,25 @@ static inline bool need_preempt(const struct intel_engine_cs *engine, rq_prio(list_next_entry(rq, link)) > last_prio) return true; + if (rb) { + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + bool preempt = false; + + if (engine == ve->siblings[0]) { /* only preempt one sibling */ + struct i915_request *next; + + rcu_read_lock(); + next = READ_ONCE(ve->request); + if (next) + preempt = rq_prio(next) > last_prio; + rcu_read_unlock(); + } + + if (preempt) + return preempt; + } + /* * If the inflight context did not trigger the preemption, then maybe * it was the set of queued requests? Pick the highest priority in @@ -369,6 +425,8 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine) list_for_each_entry_safe_reverse(rq, rn, &engine->timeline.requests, link) { + struct intel_engine_cs *owner; + if (i915_request_completed(rq)) break; @@ -377,16 +435,29 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine) GEM_BUG_ON(rq->hw_context->active); - GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); - if (rq_prio(rq) != prio) { - prio = rq_prio(rq); - pl = i915_sched_lookup_priolist(engine, prio); + /* + * Push the request back into the queue for later resubmission. + * If this request is not native to this physical engine (i.e. + * it came from a virtual source), push it back onto the virtual + * engine so that it can be moved across onto another physical + * engine as load dictates. + */ + owner = rq->hw_context->engine; + if (likely(owner == engine)) { + GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); + if (rq_prio(rq) != prio) { + prio = rq_prio(rq); + pl = i915_sched_lookup_priolist(engine, prio); + } + GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); + + list_add(&rq->sched.link, pl); + active = rq; + } else { + rq->engine = owner; + owner->submit_request(rq); + active = NULL; } - GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); - - list_add(&rq->sched.link, pl); - - active = rq; } return active; @@ -622,6 +693,90 @@ static void complete_preempt_context(struct intel_engine_execlists *execlists) execlists)); } +static void virtual_update_register_offsets(u32 *regs, + struct intel_engine_cs *engine) +{ + u32 base = engine->mmio_base; + + /* Must match execlists_init_reg_state()! */ + + regs[CTX_CONTEXT_CONTROL] = + i915_mmio_reg_offset(RING_CONTEXT_CONTROL(base)); + regs[CTX_RING_HEAD] = i915_mmio_reg_offset(RING_HEAD(base)); + regs[CTX_RING_TAIL] = i915_mmio_reg_offset(RING_TAIL(base)); + regs[CTX_RING_BUFFER_START] = i915_mmio_reg_offset(RING_START(base)); + regs[CTX_RING_BUFFER_CONTROL] = i915_mmio_reg_offset(RING_CTL(base)); + + regs[CTX_BB_HEAD_U] = i915_mmio_reg_offset(RING_BBADDR_UDW(base)); + regs[CTX_BB_HEAD_L] = i915_mmio_reg_offset(RING_BBADDR(base)); + regs[CTX_BB_STATE] = i915_mmio_reg_offset(RING_BBSTATE(base)); + regs[CTX_SECOND_BB_HEAD_U] = + i915_mmio_reg_offset(RING_SBBADDR_UDW(base)); + regs[CTX_SECOND_BB_HEAD_L] = i915_mmio_reg_offset(RING_SBBADDR(base)); + regs[CTX_SECOND_BB_STATE] = i915_mmio_reg_offset(RING_SBBSTATE(base)); + + regs[CTX_CTX_TIMESTAMP] = + i915_mmio_reg_offset(RING_CTX_TIMESTAMP(base)); + regs[CTX_PDP3_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 3)); + regs[CTX_PDP3_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 3)); + regs[CTX_PDP2_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 2)); + regs[CTX_PDP2_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 2)); + regs[CTX_PDP1_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 1)); + regs[CTX_PDP1_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 1)); + regs[CTX_PDP0_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 0)); + regs[CTX_PDP0_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 0)); + + if (engine->class == RENDER_CLASS) { + regs[CTX_RCS_INDIRECT_CTX] = + i915_mmio_reg_offset(RING_INDIRECT_CTX(base)); + regs[CTX_RCS_INDIRECT_CTX_OFFSET] = + i915_mmio_reg_offset(RING_INDIRECT_CTX_OFFSET(base)); + regs[CTX_BB_PER_CTX_PTR] = + i915_mmio_reg_offset(RING_BB_PER_CTX_PTR(base)); + + regs[CTX_R_PWR_CLK_STATE] = + i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE); + } +} + +static bool virtual_matches(const struct virtual_engine *ve, + const struct i915_request *rq, + const struct intel_engine_cs *engine) +{ + const struct intel_engine_cs *active; + + /* + * We track when the HW has completed saving the context image + * (i.e. when we have seen the final CS event switching out of + * the context) and must not overwrite the context image before + * then. This restricts us to only using the active engine + * while the previous virtualized request is inflight (so + * we reuse the register offsets). This is a very small + * hystersis on the greedy seelction algorithm. + */ + active = READ_ONCE(ve->context.active); + if (active && active != engine) + return false; + + return true; +} + +static void virtual_xfer_breadcrumbs(struct virtual_engine *ve, + struct intel_engine_cs *engine) +{ + struct intel_engine_cs *old = ve->siblings[0]; + + /* All unattached (rq->engine == old) must already be completed */ + + spin_lock(&old->breadcrumbs.irq_lock); + if (!list_empty(&ve->context.signal_link)) { + list_move_tail(&ve->context.signal_link, + &engine->breadcrumbs.signalers); + intel_engine_queue_breadcrumbs(engine); + } + spin_unlock(&old->breadcrumbs.irq_lock); +} + static void execlists_dequeue(struct intel_engine_cs *engine) { struct intel_engine_execlists * const execlists = &engine->execlists; @@ -654,6 +809,26 @@ static void execlists_dequeue(struct intel_engine_cs *engine) * and context switches) submission. */ + for (rb = rb_first_cached(&execlists->virtual); rb; ) { + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + struct i915_request *rq = READ_ONCE(ve->request); + + if (!rq) { /* lazily cleanup after another engine handled rq */ + rb_erase_cached(rb, &execlists->virtual); + RB_CLEAR_NODE(rb); + rb = rb_first_cached(&execlists->virtual); + continue; + } + + if (!virtual_matches(ve, rq, engine)) { + rb = rb_next(rb); + continue; + } + + break; + } + if (last) { /* * Don't resubmit or switch until all outstanding @@ -675,7 +850,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK)) return; - if (need_preempt(engine, last)) { + if (need_preempt(engine, last, rb)) { inject_preempt_context(engine); return; } @@ -715,6 +890,92 @@ static void execlists_dequeue(struct intel_engine_cs *engine) last->tail = last->wa_tail; } + while (rb) { /* XXX virtual is always taking precedence */ + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + struct i915_request *rq; + + spin_lock(&ve->base.timeline.lock); + + rq = ve->request; + if (unlikely(!rq)) { /* lost the race to a sibling */ + spin_unlock(&ve->base.timeline.lock); + rb_erase_cached(rb, &execlists->virtual); + RB_CLEAR_NODE(rb); + rb = rb_first_cached(&execlists->virtual); + continue; + } + + GEM_BUG_ON(rq != ve->request); + GEM_BUG_ON(rq->engine != &ve->base); + GEM_BUG_ON(rq->hw_context != &ve->context); + + if (rq_prio(rq) >= queue_prio(execlists)) { + if (!virtual_matches(ve, rq, engine)) { + spin_unlock(&ve->base.timeline.lock); + rb = rb_next(rb); + continue; + } + + if (last && !can_merge_rq(last, rq)) { + spin_unlock(&ve->base.timeline.lock); + return; /* leave this rq for another engine */ + } + + GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n", + engine->name, + rq->fence.context, + rq->fence.seqno, + i915_request_completed(rq) ? "!" : + i915_request_started(rq) ? "*" : + "", + yesno(engine != ve->siblings[0])); + + ve->request = NULL; + ve->base.execlists.queue_priority_hint = INT_MIN; + rb_erase_cached(rb, &execlists->virtual); + RB_CLEAR_NODE(rb); + + rq->engine = engine; + + if (engine != ve->siblings[0]) { + u32 *regs = ve->context.lrc_reg_state; + unsigned int n; + + GEM_BUG_ON(READ_ONCE(ve->context.active)); + virtual_update_register_offsets(regs, engine); + + if (!list_empty(&ve->context.signals)) + virtual_xfer_breadcrumbs(ve, engine); + + /* + * Move the bound engine to the top of the list + * for future execution. We then kick this + * tasklet first before checking others, so that + * we preferentially reuse this set of bound + * registers. + */ + for (n = 1; n < ve->num_siblings; n++) { + if (ve->siblings[n] == engine) { + swap(ve->siblings[n], + ve->siblings[0]); + break; + } + } + + GEM_BUG_ON(ve->siblings[0] != engine); + } + + __i915_request_submit(rq); + trace_i915_request_in(rq, port_index(port, execlists)); + submit = true; + last = rq; + } + + spin_unlock(&ve->base.timeline.lock); + break; + } + while ((rb = rb_first_cached(&execlists->queue))) { struct i915_priolist *p = to_priolist(rb); struct i915_request *rq, *rn; @@ -1838,6 +2099,25 @@ static void reset_csb_pointers(struct intel_engine_execlists *execlists) &execlists->csb_status[reset_value]); } +static struct i915_request *active_request(struct i915_request *rq) +{ + const struct list_head * const list = &rq->engine->timeline.requests; + const struct intel_context * const context = rq->hw_context; + struct i915_request *active = NULL; + + list_for_each_entry_from_reverse(rq, list, link) { + if (i915_request_completed(rq)) + break; + + if (rq->hw_context != context) + break; + + active = rq; + } + + return active; +} + static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) { struct intel_engine_execlists * const execlists = &engine->execlists; @@ -1858,7 +2138,8 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) if (!port_isset(execlists->port)) goto out_clear; - ce = port_request(execlists->port)->hw_context; + rq = port_request(execlists->port); + ce = rq->hw_context; /* * Catch up with any missed context-switch interrupts. @@ -1871,16 +2152,10 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) */ execlists_cancel_port_requests(execlists); - /* Push back any incomplete requests for replay after the reset. */ - rq = __unwind_incomplete_requests(engine); + rq = active_request(rq); if (!rq) goto out_replay; - if (rq->hw_context != ce) { /* caught just before a CS event */ - rq = NULL; - goto out_replay; - } - /* * If this request hasn't started yet, e.g. it is waiting on a * semaphore, we need to avoid skipping the request or else we @@ -1927,13 +2202,16 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) } execlists_init_reg_state(regs, ce, engine, ce->ring); - /* Rerun the request; its payload has been neutered (if guilty). */ out_replay: + /* Rerun the request; its payload has been neutered (if guilty). */ ce->ring->head = rq ? intel_ring_wrap(ce->ring, rq->head) : ce->ring->tail; intel_ring_update_space(ce->ring); __execlists_update_reg_state(ce, engine); + /* Push back any incomplete requests for replay after the reset. */ + __unwind_incomplete_requests(engine); + out_clear: execlists_clear_all_active(execlists); } @@ -2007,6 +2285,26 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine) i915_priolist_free(p); } + /* Cancel all attached virtual engines */ + while ((rb = rb_first_cached(&execlists->virtual))) { + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + + rb_erase_cached(rb, &execlists->virtual); + RB_CLEAR_NODE(rb); + + spin_lock(&ve->base.timeline.lock); + if (ve->request) { + ve->request->engine = engine; + __i915_request_submit(ve->request); + dma_fence_set_error(&ve->request->fence, -EIO); + i915_request_mark_complete(ve->request); + ve->base.execlists.queue_priority_hint = INT_MIN; + ve->request = NULL; + } + spin_unlock(&ve->base.timeline.lock); + } + /* Remaining _unready_ requests will be nop'ed when submitted */ execlists->queue_priority_hint = INT_MIN; @@ -2491,12 +2789,15 @@ static void execlists_init_reg_state(u32 *regs, bool rcs = engine->class == RENDER_CLASS; u32 base = engine->mmio_base; - /* A context is actually a big batch buffer with several + /* + * A context is actually a big batch buffer with several * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The * values we are setting here are only for the first context restore: * on a subsequent save, the GPU will recreate this batchbuffer with new * values (including all the missing MI_LOAD_REGISTER_IMM commands that * we are not initializing here). + * + * Must keep consistent with virtual_update_register_offsets(). */ regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) | MI_LRI_FORCE_POSTED; @@ -2715,6 +3016,320 @@ static int execlists_context_deferred_alloc(struct intel_context *ce, return ret; } +static void virtual_context_destroy(struct kref *kref) +{ + struct virtual_engine *ve = + container_of(kref, typeof(*ve), context.ref); + unsigned int n; + + GEM_BUG_ON(ve->request); + GEM_BUG_ON(ve->context.active); + + for (n = 0; n < ve->num_siblings; n++) { + struct intel_engine_cs *sibling = ve->siblings[n]; + struct rb_node *node = &ve->nodes[sibling->id].rb; + + if (RB_EMPTY_NODE(node)) + continue; + + spin_lock_irq(&sibling->timeline.lock); + + /* Detachment is lazily performed in the execlists tasklet */ + if (!RB_EMPTY_NODE(node)) + rb_erase_cached(node, &sibling->execlists.virtual); + + spin_unlock_irq(&sibling->timeline.lock); + } + GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); + + if (ve->context.state) + __execlists_context_fini(&ve->context); + + i915_timeline_fini(&ve->base.timeline); + kfree(ve); +} + +static void virtual_engine_initial_hint(struct virtual_engine *ve) +{ + int swp; + + /* + * Pick a random sibling on starting to help spread the load around. + * + * New contexts are typically created with exactly the same order + * of siblings, and often started in batches. Due to the way we iterate + * the array of sibling when submitting requests, sibling[0] is + * prioritised for dequeuing. If we make sure that sibling[0] is fairly + * randomised across the system, we also help spread the load by the + * first engine we inspect being different each time. + * + * NB This does not force us to execute on this engine, it will just + * typically be the first we inspect for submission. + */ + swp = prandom_u32_max(ve->num_siblings); + if (!swp) + return; + + swap(ve->siblings[swp], ve->siblings[0]); + virtual_update_register_offsets(ve->context.lrc_reg_state, + ve->siblings[0]); +} + +static int virtual_context_pin(struct intel_context *ce) +{ + struct virtual_engine *ve = container_of(ce, typeof(*ve), context); + int err; + + /* Note: we must use a real engine class for setting up reg state */ + err = __execlists_context_pin(ce, ve->siblings[0]); + if (err) + return err; + + virtual_engine_initial_hint(ve); + return 0; +} + +static void virtual_context_enter(struct intel_context *ce) +{ + struct virtual_engine *ve = container_of(ce, typeof(*ve), context); + unsigned int n; + + for (n = 0; n < ve->num_siblings; n++) + intel_engine_pm_get(ve->siblings[n]); +} + +static void virtual_context_exit(struct intel_context *ce) +{ + struct virtual_engine *ve = container_of(ce, typeof(*ve), context); + unsigned int n; + + ce->saturated = 0; + for (n = 0; n < ve->num_siblings; n++) + intel_engine_pm_put(ve->siblings[n]); +} + +static const struct intel_context_ops virtual_context_ops = { + .pin = virtual_context_pin, + .unpin = execlists_context_unpin, + + .enter = virtual_context_enter, + .exit = virtual_context_exit, + + .destroy = virtual_context_destroy, +}; + +static void virtual_submission_tasklet(unsigned long data) +{ + struct virtual_engine * const ve = (struct virtual_engine *)data; + const int prio = ve->base.execlists.queue_priority_hint; + unsigned int n; + + local_irq_disable(); + for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) { + struct intel_engine_cs *sibling = ve->siblings[n]; + struct ve_node * const node = &ve->nodes[sibling->id]; + struct rb_node **parent, *rb; + bool first; + + spin_lock(&sibling->timeline.lock); + + if (!RB_EMPTY_NODE(&node->rb)) { + /* + * Cheat and avoid rebalancing the tree if we can + * reuse this node in situ. + */ + first = rb_first_cached(&sibling->execlists.virtual) == + &node->rb; + if (prio == node->prio || (prio > node->prio && first)) + goto submit_engine; + + rb_erase_cached(&node->rb, &sibling->execlists.virtual); + } + + rb = NULL; + first = true; + parent = &sibling->execlists.virtual.rb_root.rb_node; + while (*parent) { + struct ve_node *other; + + rb = *parent; + other = rb_entry(rb, typeof(*other), rb); + if (prio > other->prio) { + parent = &rb->rb_left; + } else { + parent = &rb->rb_right; + first = false; + } + } + + rb_link_node(&node->rb, rb, parent); + rb_insert_color_cached(&node->rb, + &sibling->execlists.virtual, + first); + +submit_engine: + GEM_BUG_ON(RB_EMPTY_NODE(&node->rb)); + node->prio = prio; + if (first && prio > sibling->execlists.queue_priority_hint) { + sibling->execlists.queue_priority_hint = prio; + tasklet_hi_schedule(&sibling->execlists.tasklet); + } + + spin_unlock(&sibling->timeline.lock); + } + local_irq_enable(); +} + +static void virtual_submit_request(struct i915_request *rq) +{ + struct virtual_engine *ve = to_virtual_engine(rq->engine); + + GEM_TRACE("%s: rq=%llx:%lld\n", + ve->base.name, + rq->fence.context, + rq->fence.seqno); + + GEM_BUG_ON(ve->base.submit_request != virtual_submit_request); + + GEM_BUG_ON(ve->request); + ve->base.execlists.queue_priority_hint = rq_prio(rq); + WRITE_ONCE(ve->request, rq); + + tasklet_schedule(&ve->base.execlists.tasklet); +} + +struct intel_context * +intel_execlists_create_virtual(struct i915_gem_context *ctx, + struct intel_engine_cs **siblings, + unsigned int count) +{ + struct virtual_engine *ve; + unsigned int n; + int err; + + if (count == 0) + return ERR_PTR(-EINVAL); + + if (count == 1) + return intel_context_create(ctx, siblings[0]); + + ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL); + if (!ve) + return ERR_PTR(-ENOMEM); + + ve->base.i915 = ctx->i915; + ve->base.id = -1; + ve->base.class = OTHER_CLASS; + ve->base.uabi_class = I915_ENGINE_CLASS_INVALID; + ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; + ve->base.flags = I915_ENGINE_IS_VIRTUAL; + + snprintf(ve->base.name, sizeof(ve->base.name), "virtual"); + + err = i915_timeline_init(ctx->i915, &ve->base.timeline, NULL); + if (err) + goto err_put; + i915_timeline_set_subclass(&ve->base.timeline, TIMELINE_VIRTUAL); + + intel_engine_init_execlists(&ve->base); + + ve->base.cops = &virtual_context_ops; + ve->base.request_alloc = execlists_request_alloc; + + ve->base.schedule = i915_schedule; + ve->base.submit_request = virtual_submit_request; + + ve->base.execlists.queue_priority_hint = INT_MIN; + tasklet_init(&ve->base.execlists.tasklet, + virtual_submission_tasklet, + (unsigned long)ve); + + intel_context_init(&ve->context, ctx, &ve->base); + + for (n = 0; n < count; n++) { + struct intel_engine_cs *sibling = siblings[n]; + + GEM_BUG_ON(!is_power_of_2(sibling->mask)); + if (sibling->mask & ve->base.mask) { + DRM_DEBUG("duplicate %s entry in load balancer\n", + sibling->name); + err = -EINVAL; + goto err_put; + } + + /* + * The virtual engine implementation is tightly coupled to + * the execlists backend -- we push out request directly + * into a tree inside each physical engine. We could support + * layering if we handle cloning of the requests and + * submitting a copy into each backend. + */ + if (sibling->execlists.tasklet.func != + execlists_submission_tasklet) { + err = -ENODEV; + goto err_put; + } + + GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb)); + RB_CLEAR_NODE(&ve->nodes[sibling->id].rb); + + ve->siblings[ve->num_siblings++] = sibling; + ve->base.mask |= sibling->mask; + + /* + * All physical engines must be compatible for their emission + * functions (as we build the instructions during request + * construction and do not alter them before submission + * on the physical engine). We use the engine class as a guide + * here, although that could be refined. + */ + if (ve->base.class != OTHER_CLASS) { + if (ve->base.class != sibling->class) { + DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n", + sibling->class, ve->base.class); + err = -EINVAL; + goto err_put; + } + continue; + } + + ve->base.class = sibling->class; + ve->base.uabi_class = sibling->uabi_class; + snprintf(ve->base.name, sizeof(ve->base.name), + "v%dx%d", ve->base.class, count); + ve->base.context_size = sibling->context_size; + + ve->base.emit_bb_start = sibling->emit_bb_start; + ve->base.emit_flush = sibling->emit_flush; + ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb; + ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb; + ve->base.emit_fini_breadcrumb_dw = + sibling->emit_fini_breadcrumb_dw; + } + + return &ve->context; + +err_put: + intel_context_put(&ve->context); + return ERR_PTR(err); +} + +struct intel_context * +intel_execlists_clone_virtual(struct i915_gem_context *ctx, + struct intel_engine_cs *src) +{ + struct virtual_engine *se = to_virtual_engine(src); + struct intel_context *dst; + + dst = intel_execlists_create_virtual(ctx, + se->siblings, + se->num_siblings); + if (IS_ERR(dst)) + return dst; + + return dst; +} + void intel_execlists_show_requests(struct intel_engine_cs *engine, struct drm_printer *m, void (*show_request)(struct drm_printer *m, @@ -2772,6 +3387,29 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine, show_request(m, last, "\t\tQ "); } + last = NULL; + count = 0; + for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) { + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + struct i915_request *rq = READ_ONCE(ve->request); + + if (rq) { + if (count++ < max - 1) + show_request(m, rq, "\t\tV "); + else + last = rq; + } + } + if (last) { + if (count > max) { + drm_printf(m, + "\t\t...skipping %d virtual requests...\n", + count - max); + } + show_request(m, last, "\t\tV "); + } + spin_unlock_irqrestore(&engine->timeline.lock, flags); } diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.h b/drivers/gpu/drm/i915/gt/intel_lrc.h index a0dc907a7249..5530606052e5 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.h +++ b/drivers/gpu/drm/i915/gt/intel_lrc.h @@ -114,4 +114,13 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine, const char *prefix), unsigned int max); +struct intel_context * +intel_execlists_create_virtual(struct i915_gem_context *ctx, + struct intel_engine_cs **siblings, + unsigned int count); + +struct intel_context * +intel_execlists_clone_virtual(struct i915_gem_context *ctx, + struct intel_engine_cs *src); + #endif /* _INTEL_LRC_H_ */ diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c index 5b3d8e33f1cf..f880271fb9ba 100644 --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c @@ -1310,6 +1310,185 @@ static int live_preempt_smoke(void *arg) return err; } +static int nop_virtual_engine(struct drm_i915_private *i915, + struct intel_engine_cs **siblings, + unsigned int nsibling, + unsigned int nctx, + unsigned int flags) +#define CHAIN BIT(0) +{ + IGT_TIMEOUT(end_time); + struct i915_request *request[16]; + struct i915_gem_context *ctx[16]; + struct intel_context *ve[16]; + unsigned long n, prime, nc; + struct igt_live_test t; + ktime_t times[2] = {}; + int err; + + GEM_BUG_ON(!nctx || nctx > ARRAY_SIZE(ctx)); + + for (n = 0; n < nctx; n++) { + ctx[n] = kernel_context(i915); + if (!ctx[n]) { + err = -ENOMEM; + nctx = n; + goto out; + } + + ve[n] = intel_execlists_create_virtual(ctx[n], + siblings, nsibling); + if (IS_ERR(ve[n])) { + kernel_context_close(ctx[n]); + err = PTR_ERR(ve[n]); + nctx = n; + goto out; + } + + err = intel_context_pin(ve[n]); + if (err) { + intel_context_put(ve[n]); + kernel_context_close(ctx[n]); + nctx = n; + goto out; + } + } + + err = igt_live_test_begin(&t, i915, __func__, ve[0]->engine->name); + if (err) + goto out; + + for_each_prime_number_from(prime, 1, 8192) { + times[1] = ktime_get_raw(); + + if (flags & CHAIN) { + for (nc = 0; nc < nctx; nc++) { + for (n = 0; n < prime; n++) { + request[nc] = + i915_request_create(ve[nc]); + if (IS_ERR(request[nc])) { + err = PTR_ERR(request[nc]); + goto out; + } + + i915_request_add(request[nc]); + } + } + } else { + for (n = 0; n < prime; n++) { + for (nc = 0; nc < nctx; nc++) { + request[nc] = + i915_request_create(ve[nc]); + if (IS_ERR(request[nc])) { + err = PTR_ERR(request[nc]); + goto out; + } + + i915_request_add(request[nc]); + } + } + } + + for (nc = 0; nc < nctx; nc++) { + if (i915_request_wait(request[nc], + I915_WAIT_LOCKED, + HZ / 10) < 0) { + pr_err("%s(%s): wait for %llx:%lld timed out\n", + __func__, ve[0]->engine->name, + request[nc]->fence.context, + request[nc]->fence.seqno); + + GEM_TRACE("%s(%s) failed at request %llx:%lld\n", + __func__, ve[0]->engine->name, + request[nc]->fence.context, + request[nc]->fence.seqno); + GEM_TRACE_DUMP(); + i915_gem_set_wedged(i915); + break; + } + } + + times[1] = ktime_sub(ktime_get_raw(), times[1]); + if (prime == 1) + times[0] = times[1]; + + if (__igt_timeout(end_time, NULL)) + break; + } + + err = igt_live_test_end(&t); + if (err) + goto out; + + pr_info("Requestx%d latencies on %s: 1 = %lluns, %lu = %lluns\n", + nctx, ve[0]->engine->name, ktime_to_ns(times[0]), + prime, div64_u64(ktime_to_ns(times[1]), prime)); + +out: + if (igt_flush_test(i915, I915_WAIT_LOCKED)) + err = -EIO; + + for (nc = 0; nc < nctx; nc++) { + intel_context_unpin(ve[nc]); + intel_context_put(ve[nc]); + kernel_context_close(ctx[nc]); + } + return err; +} + +static int live_virtual_engine(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; + struct intel_engine_cs *engine; + enum intel_engine_id id; + unsigned int class, inst; + int err = -ENODEV; + + if (USES_GUC_SUBMISSION(i915)) + return 0; + + mutex_lock(&i915->drm.struct_mutex); + + for_each_engine(engine, i915, id) { + err = nop_virtual_engine(i915, &engine, 1, 1, 0); + if (err) { + pr_err("Failed to wrap engine %s: err=%d\n", + engine->name, err); + goto out_unlock; + } + } + + for (class = 0; class <= MAX_ENGINE_CLASS; class++) { + int nsibling, n; + + nsibling = 0; + for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) { + if (!i915->engine_class[class][inst]) + continue; + + siblings[nsibling++] = i915->engine_class[class][inst]; + } + if (nsibling < 2) + continue; + + for (n = 1; n <= nsibling + 1; n++) { + err = nop_virtual_engine(i915, siblings, nsibling, + n, 0); + if (err) + goto out_unlock; + } + + err = nop_virtual_engine(i915, siblings, nsibling, n, CHAIN); + if (err) + goto out_unlock; + } + +out_unlock: + mutex_unlock(&i915->drm.struct_mutex); + return err; +} + int intel_execlists_live_selftests(struct drm_i915_private *i915) { static const struct i915_subtest tests[] = { @@ -1322,6 +1501,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915) SUBTEST(live_chain_preempt), SUBTEST(live_preempt_hang), SUBTEST(live_preempt_smoke), + SUBTEST(live_virtual_engine), }; if (!HAS_EXECLISTS(i915)) diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h index 67f8a4a807a0..fe82d3571072 100644 --- a/drivers/gpu/drm/i915/i915_gem.h +++ b/drivers/gpu/drm/i915/i915_gem.h @@ -91,4 +91,9 @@ static inline bool __tasklet_enable(struct tasklet_struct *t) return atomic_dec_and_test(&t->count); } +static inline bool __tasklet_is_scheduled(struct tasklet_struct *t) +{ + return test_bit(TASKLET_STATE_SCHED, &t->state); +} + #endif /* __I915_GEM_H__ */ diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 24736fcd463d..68db6b28e606 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -86,6 +86,7 @@ */ #include +#include #include @@ -1218,7 +1219,6 @@ __intel_context_reconfigure_sseu(struct intel_context *ce, int ret; GEM_BUG_ON(INTEL_GEN(ce->gem_context->i915) < 8); - GEM_BUG_ON(ce->engine->id != RCS0); ret = intel_context_lock_pinned(ce); if (ret) @@ -1412,7 +1412,100 @@ struct set_engines { struct i915_gem_engines *engines; }; +static int +set_engines__load_balance(struct i915_user_extension __user *base, void *data) +{ + struct i915_context_engines_load_balance __user *ext = + container_of_user(base, typeof(*ext), base); + const struct set_engines *set = data; + struct intel_engine_cs *stack[16]; + struct intel_engine_cs **siblings; + struct intel_context *ce; + u16 num_siblings, idx; + unsigned int n; + int err; + + if (!HAS_EXECLISTS(set->ctx->i915)) + return -ENODEV; + + if (USES_GUC_SUBMISSION(set->ctx->i915)) + return -ENODEV; /* not implement yet */ + + if (get_user(idx, &ext->engine_index)) + return -EFAULT; + + if (idx >= set->engines->num_engines) { + DRM_DEBUG("Invalid placement value, %d >= %d\n", + idx, set->engines->num_engines); + return -EINVAL; + } + + idx = array_index_nospec(idx, set->engines->num_engines); + if (set->engines->engines[idx]) { + DRM_DEBUG("Invalid placement[%d], already occupied\n", idx); + return -EEXIST; + } + + if (get_user(num_siblings, &ext->num_siblings)) + return -EFAULT; + + err = check_user_mbz(&ext->flags); + if (err) + return err; + + err = check_user_mbz(&ext->mbz64); + if (err) + return err; + + siblings = stack; + if (num_siblings > ARRAY_SIZE(stack)) { + siblings = kmalloc_array(num_siblings, + sizeof(*siblings), + GFP_KERNEL); + if (!siblings) + return -ENOMEM; + } + + for (n = 0; n < num_siblings; n++) { + struct i915_engine_class_instance ci; + + if (copy_from_user(&ci, &ext->engines[n], sizeof(ci))) { + err = -EFAULT; + goto out_siblings; + } + + siblings[n] = intel_engine_lookup_user(set->ctx->i915, + ci.engine_class, + ci.engine_instance); + if (!siblings[n]) { + DRM_DEBUG("Invalid sibling[%d]: { class:%d, inst:%d }\n", + n, ci.engine_class, ci.engine_instance); + err = -EINVAL; + goto out_siblings; + } + } + + ce = intel_execlists_create_virtual(set->ctx, siblings, n); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto out_siblings; + } + + if (cmpxchg(&set->engines->engines[idx], NULL, ce)) { + intel_context_put(ce); + err = -EEXIST; + goto out_siblings; + } + +out_siblings: + if (siblings != stack) + kfree(siblings); + + return err; +} + static const i915_user_extension_fn set_engines__extensions[] = { + [I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE] = set_engines__load_balance, }; static int @@ -1737,14 +1830,29 @@ static int clone_engines(struct i915_gem_context *dst, clone->i915 = dst->i915; for (n = 0; n < e->num_engines; n++) { + struct intel_engine_cs *engine; + if (!e->engines[n]) { clone->engines[n] = NULL; continue; } + engine = e->engines[n]->engine; - clone->engines[n] = - intel_context_create(dst, e->engines[n]->engine); - if (!clone->engines[n]) { + /* + * Virtual engines are singletons; they can only exist + * inside a single context, because they embed their + * HW context... As each virtual context implies a single + * timeline (each engine can only dequeue a single request + * at any time), it would be surprising for two contexts + * to use the same engine. So let's create a copy of + * the virtual engine instead. + */ + if (intel_engine_is_virtual(engine)) + clone->engines[n] = + intel_execlists_clone_virtual(dst, engine); + else + clone->engines[n] = intel_context_create(dst, engine); + if (IS_ERR_OR_NULL(clone->engines[n])) { __free_engines(clone, n); goto err_unlock; } diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c index d215dcdf0b1e..78ceb56d7801 100644 --- a/drivers/gpu/drm/i915/i915_scheduler.c +++ b/drivers/gpu/drm/i915/i915_scheduler.c @@ -150,17 +150,26 @@ sched_lock_engine(const struct i915_sched_node *node, struct intel_engine_cs *locked, struct sched_cache *cache) { - struct intel_engine_cs *engine = node_to_request(node)->engine; + const struct i915_request *rq = node_to_request(node); + struct intel_engine_cs *engine; GEM_BUG_ON(!locked); - if (engine != locked) { + /* + * Virtual engines complicate acquiring the engine timeline lock, + * as their rq->engine pointer is not stable until under that + * engine lock. The simple ploy we use is to take the lock then + * check that the rq still belongs to the newly locked engine. + */ + while (locked != (engine = READ_ONCE(rq->engine))) { spin_unlock(&locked->timeline.lock); memset(cache, 0, sizeof(*cache)); spin_lock(&engine->timeline.lock); + locked = engine; } - return engine; + GEM_BUG_ON(locked != engine); + return locked; } static inline int rq_prio(const struct i915_request *rq) @@ -272,6 +281,7 @@ static void __i915_schedule(struct i915_sched_node *node, spin_lock(&engine->timeline.lock); /* Fifo and depth-first replacement ensure our deps execute before us */ + engine = sched_lock_engine(node, engine, &cache); list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) { INIT_LIST_HEAD(&dep->dfs_link); @@ -283,8 +293,11 @@ static void __i915_schedule(struct i915_sched_node *node, if (prio <= node->attr.priority || node_signaled(node)) continue; + GEM_BUG_ON(node_to_request(node)->engine != engine); + node->attr.priority = prio; if (!list_empty(&node->link)) { + GEM_BUG_ON(intel_engine_is_virtual(engine)); if (!cache.priolist) cache.priolist = i915_sched_lookup_priolist(engine, diff --git a/drivers/gpu/drm/i915/i915_timeline_types.h b/drivers/gpu/drm/i915/i915_timeline_types.h index 5256a0b5c5f7..1688705f4a2b 100644 --- a/drivers/gpu/drm/i915/i915_timeline_types.h +++ b/drivers/gpu/drm/i915/i915_timeline_types.h @@ -26,6 +26,7 @@ struct i915_timeline { spinlock_t lock; #define TIMELINE_CLIENT 0 /* default subclass */ #define TIMELINE_ENGINE 1 +#define TIMELINE_VIRTUAL 2 struct mutex mutex; /* protects the flow of requests */ unsigned int pin_count; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 62396d575e28..f9770948161c 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -137,6 +137,7 @@ struct i915_engine_class_instance { __u16 engine_class; /* see enum drm_i915_gem_engine_class */ __u16 engine_instance; #define I915_ENGINE_CLASS_INVALID_NONE -1 +#define I915_ENGINE_CLASS_INVALID_VIRTUAL -2 }; /** @@ -1608,8 +1609,46 @@ struct drm_i915_gem_context_param_sseu { __u32 rsvd; }; +/* + * i915_context_engines_load_balance: + * + * Enable load balancing across this set of engines. + * + * Into the I915_EXEC_DEFAULT slot [0], a virtual engine is created that when + * used will proxy the execbuffer request onto one of the set of engines + * in such a way as to distribute the load evenly across the set. + * + * The set of engines must be compatible (e.g. the same HW class) as they + * will share the same logical GPU context and ring. + * + * To intermix rendering with the virtual engine and direct rendering onto + * the backing engines (bypassing the load balancing proxy), the context must + * be defined to use a single timeline for all engines. + */ +struct i915_context_engines_load_balance { + struct i915_user_extension base; + + __u16 engine_index; + __u16 num_siblings; + __u32 flags; /* all undefined flags must be zero */ + + __u64 mbz64; /* reserved for future use; must be zero */ + + struct i915_engine_class_instance engines[0]; +} __attribute__((packed)); + +#define I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(name__, N__) struct { \ + struct i915_user_extension base; \ + __u16 engine_index; \ + __u16 num_siblings; \ + __u32 flags; \ + __u64 mbz64; \ + struct i915_engine_class_instance engines[N__]; \ +} __attribute__((packed)) name__ + struct i915_context_param_engines { __u64 extensions; /* linked chain of extension blocks, 0 terminates */ +#define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 /* see i915_context_engines_load_balance */ struct i915_engine_class_instance engines[0]; } __attribute__((packed));