diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h index 1db2d3efc71f..c2a5640ae055 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine.h +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -240,6 +240,9 @@ __printf(3, 4) void intel_engine_dump(struct intel_engine_cs *engine, struct drm_printer *m, const char *header, ...); +void intel_engine_dump_active_requests(struct list_head *requests, + struct i915_request *hung_rq, + struct drm_printer *m); ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now); diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 0da7868c5a13..dea0e522c5c7 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -1684,6 +1684,98 @@ static void print_properties(struct intel_engine_cs *engine, read_ul(&engine->defaults, p->offset)); } +static void engine_dump_request(struct i915_request *rq, struct drm_printer *m, const char *msg) +{ + struct intel_timeline *tl = get_timeline(rq); + + i915_request_show(m, rq, msg, 0); + + drm_printf(m, "\t\tring->start: 0x%08x\n", + i915_ggtt_offset(rq->ring->vma)); + drm_printf(m, "\t\tring->head: 0x%08x\n", + rq->ring->head); + drm_printf(m, "\t\tring->tail: 0x%08x\n", + rq->ring->tail); + drm_printf(m, "\t\tring->emit: 0x%08x\n", + rq->ring->emit); + drm_printf(m, "\t\tring->space: 0x%08x\n", + rq->ring->space); + + if (tl) { + drm_printf(m, "\t\tring->hwsp: 0x%08x\n", + tl->hwsp_offset); + intel_timeline_put(tl); + } + + print_request_ring(m, rq); + + if (rq->context->lrc_reg_state) { + drm_printf(m, "Logical Ring Context:\n"); + hexdump(m, rq->context->lrc_reg_state, PAGE_SIZE); + } +} + +void intel_engine_dump_active_requests(struct list_head *requests, + struct i915_request *hung_rq, + struct drm_printer *m) +{ + struct i915_request *rq; + const char *msg; + enum i915_request_state state; + + list_for_each_entry(rq, requests, sched.link) { + if (rq == hung_rq) + continue; + + state = i915_test_request_state(rq); + if (state < I915_REQUEST_QUEUED) + continue; + + if (state == I915_REQUEST_ACTIVE) + msg = "\t\tactive on engine"; + else + msg = "\t\tactive in queue"; + + engine_dump_request(rq, m, msg); + } +} + +static void engine_dump_active_requests(struct intel_engine_cs *engine, struct drm_printer *m) +{ + struct i915_request *hung_rq = NULL; + struct intel_context *ce; + bool guc; + + /* + * No need for an engine->irq_seqno_barrier() before the seqno reads. + * The GPU is still running so requests are still executing and any + * hardware reads will be out of date by the time they are reported. + * But the intention here is just to report an instantaneous snapshot + * so that's fine. + */ + lockdep_assert_held(&engine->sched_engine->lock); + + drm_printf(m, "\tRequests:\n"); + + guc = intel_uc_uses_guc_submission(&engine->gt->uc); + if (guc) { + ce = intel_engine_get_hung_context(engine); + if (ce) + hung_rq = intel_context_find_active_request(ce); + } else { + hung_rq = intel_engine_execlist_find_hung_request(engine); + } + + if (hung_rq) + engine_dump_request(hung_rq, m, "\t\thung"); + + if (guc) + intel_guc_dump_active_requests(engine, hung_rq, m); + else + intel_engine_dump_active_requests(&engine->sched_engine->requests, + hung_rq, m); +} + void intel_engine_dump(struct intel_engine_cs *engine, struct drm_printer *m, const char *header, ...) @@ -1728,39 +1820,9 @@ void intel_engine_dump(struct intel_engine_cs *engine, i915_reset_count(error)); print_properties(engine, m); - drm_printf(m, "\tRequests:\n"); - spin_lock_irqsave(&engine->sched_engine->lock, flags); - rq = intel_engine_execlist_find_hung_request(engine); - if (rq) { - struct intel_timeline *tl = get_timeline(rq); + engine_dump_active_requests(engine, m); - i915_request_show(m, rq, "\t\tactive ", 0); - - drm_printf(m, "\t\tring->start: 0x%08x\n", - i915_ggtt_offset(rq->ring->vma)); - drm_printf(m, "\t\tring->head: 0x%08x\n", - rq->ring->head); - drm_printf(m, "\t\tring->tail: 0x%08x\n", - rq->ring->tail); - drm_printf(m, "\t\tring->emit: 0x%08x\n", - rq->ring->emit); - drm_printf(m, "\t\tring->space: 0x%08x\n", - rq->ring->space); - - if (tl) { - drm_printf(m, "\t\tring->hwsp: 0x%08x\n", - tl->hwsp_offset); - intel_timeline_put(tl); - } - - print_request_ring(m, rq); - - if (rq->context->lrc_reg_state) { - drm_printf(m, "Logical Ring Context:\n"); - hexdump(m, rq->context->lrc_reg_state, PAGE_SIZE); - } - } drm_printf(m, "\tOn hold?: %lu\n", list_count(&engine->sched_engine->hold)); spin_unlock_irqrestore(&engine->sched_engine->lock, flags); @@ -1834,13 +1896,6 @@ intel_engine_create_virtual(struct intel_engine_cs **siblings, return siblings[0]->cops->create_virtual(siblings, count); } -static bool match_ring(struct i915_request *rq) -{ - u32 ring = ENGINE_READ(rq->engine, RING_START); - - return ring == i915_ggtt_offset(rq->ring->vma); -} - struct i915_request * intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine) { @@ -1884,14 +1939,7 @@ intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine) list_for_each_entry(request, &engine->sched_engine->requests, sched.link) { - if (__i915_request_is_complete(request)) - continue; - - if (!__i915_request_has_started(request)) - continue; - - /* More than one preemptible request may match! */ - if (!match_ring(request)) + if (i915_test_request_state(request) != I915_REQUEST_ACTIVE) continue; active = request; diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c index 0b16f19c384e..74775ae961b2 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c @@ -90,6 +90,14 @@ reset_engine(struct intel_engine_cs *engine, struct i915_request *rq) if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) show_heartbeat(rq, engine); + if (intel_engine_uses_guc(engine)) + /* + * GuC itself is toast or GuC's hang detection + * is disabled. Either way, need to find the + * hang culprit manually. + */ + intel_guc_find_hung_context(engine); + intel_gt_handle_error(engine->gt, engine->mask, I915_ERROR_CAPTURE, "stopped heartbeat on %s", diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index 721a10e2215e..4d281bc8a38c 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -156,7 +156,7 @@ void __i915_request_reset(struct i915_request *rq, bool guilty) if (guilty) { i915_request_set_error_once(rq, -EIO); __i915_request_skip(rq); - if (mark_guilty(rq)) + if (mark_guilty(rq) && !intel_engine_uses_guc(rq->engine)) skip_context(rq); } else { i915_request_set_error_once(rq, -EAGAIN); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h index f3c69160cb7e..f355a70bbec4 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h @@ -275,6 +275,8 @@ int intel_guc_context_reset_process_msg(struct intel_guc *guc, int intel_guc_engine_failure_process_msg(struct intel_guc *guc, const u32 *msg, u32 len); +void intel_guc_find_hung_context(struct intel_engine_cs *engine); + void intel_guc_submission_reset_prepare(struct intel_guc *guc); void intel_guc_submission_reset(struct intel_guc *guc, bool stalled); void intel_guc_submission_reset_finish(struct intel_guc *guc); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 7d6ca0d54f9e..76c2d927e1c3 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -2277,6 +2277,73 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc, return 0; } +void intel_guc_find_hung_context(struct intel_engine_cs *engine) +{ + struct intel_guc *guc = &engine->gt->uc.guc; + struct intel_context *ce; + struct i915_request *rq; + unsigned long index; + + /* Reset called during driver load? GuC not yet initialised! */ + if (unlikely(!guc_submission_initialized(guc))) + return; + + xa_for_each(&guc->context_lookup, index, ce) { + if (!intel_context_is_pinned(ce)) + continue; + + if (intel_engine_is_virtual(ce->engine)) { + if (!(ce->engine->mask & engine->mask)) + continue; + } else { + if (ce->engine != engine) + continue; + } + + list_for_each_entry(rq, &ce->guc_active.requests, sched.link) { + if (i915_test_request_state(rq) != I915_REQUEST_ACTIVE) + continue; + + intel_engine_set_hung_context(engine, ce); + + /* Can only cope with one hang at a time... */ + return; + } + } +} + +void intel_guc_dump_active_requests(struct intel_engine_cs *engine, + struct i915_request *hung_rq, + struct drm_printer *m) +{ + struct intel_guc *guc = &engine->gt->uc.guc; + struct intel_context *ce; + unsigned long index; + unsigned long flags; + + /* Reset called during driver load? GuC not yet initialised! */ + if (unlikely(!guc_submission_initialized(guc))) + return; + + xa_for_each(&guc->context_lookup, index, ce) { + if (!intel_context_is_pinned(ce)) + continue; + + if (intel_engine_is_virtual(ce->engine)) { + if (!(ce->engine->mask & engine->mask)) + continue; + } else { + if (ce->engine != engine) + continue; + } + + spin_lock_irqsave(&ce->guc_active.lock, flags); + intel_engine_dump_active_requests(&ce->guc_active.requests, + hung_rq, m); + spin_unlock_irqrestore(&ce->guc_active.lock, flags); + } +} + void intel_guc_submission_print_info(struct intel_guc *guc, struct drm_printer *p) { diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h index 08ff77c5c50e..03bc1c83a4d2 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h @@ -25,6 +25,9 @@ void intel_guc_submission_print_info(struct intel_guc *guc, struct drm_printer *p); void intel_guc_submission_print_context_info(struct intel_guc *guc, struct drm_printer *p); +void intel_guc_dump_active_requests(struct intel_engine_cs *engine, + struct i915_request *hung_rq, + struct drm_printer *m); bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve); diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index aeef45679897..28f38b02a5d2 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -2041,6 +2041,47 @@ void i915_request_show(struct drm_printer *m, name); } +static bool engine_match_ring(struct intel_engine_cs *engine, struct i915_request *rq) +{ + u32 ring = ENGINE_READ(engine, RING_START); + + return ring == i915_ggtt_offset(rq->ring->vma); +} + +static bool match_ring(struct i915_request *rq) +{ + struct intel_engine_cs *engine; + bool found; + int i; + + if (!intel_engine_is_virtual(rq->engine)) + return engine_match_ring(rq->engine, rq); + + found = false; + i = 0; + while ((engine = intel_engine_get_sibling(rq->engine, i++))) { + found = engine_match_ring(engine, rq); + if (found) + break; + } + + return found; +} + +enum i915_request_state i915_test_request_state(struct i915_request *rq) +{ + if (i915_request_completed(rq)) + return I915_REQUEST_COMPLETE; + + if (!i915_request_started(rq)) + return I915_REQUEST_PENDING; + + if (match_ring(rq)) + return I915_REQUEST_ACTIVE; + + return I915_REQUEST_QUEUED; +} + #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) #include "selftests/mock_request.c" #include "selftests/i915_request.c" diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index 128030f43bbf..ac0e3326c067 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -649,4 +649,14 @@ i915_request_active_engine(struct i915_request *rq, void i915_request_notify_execute_cb_imm(struct i915_request *rq); +enum i915_request_state { + I915_REQUEST_UNKNOWN = 0, + I915_REQUEST_COMPLETE, + I915_REQUEST_PENDING, + I915_REQUEST_QUEUED, + I915_REQUEST_ACTIVE, +}; + +enum i915_request_state i915_test_request_state(struct i915_request *rq); + #endif /* I915_REQUEST_H */