mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2024-09-15 23:25:07 +00:00
drm/i915/execlists: Read the context-status buffer from the HWSP
The engine provides a mirror of the CSB in the HWSP. If we use the cacheable reads from the HWSP, we can shave off a few mmio reads per context-switch interrupt (which are quite frequent!). Just removing a couple of mmio reads is not enough to actually reduce any latency, but it does give a small reduction in overall cpu usage.

Much appreciation for Ben dropping the bombshell that the CSB was in the HWSP and for Michel in digging out the details.

v2: Don't be lazy, add the defines for the indices.
v3: Include the HWSP in debugfs/i915_engine_info
v4: Check for GVT-g, it currently depends on intercepting CSB mmio
v5: Fixup GVT-g mmio path
v6: Disable HWSP if VT-d is active as the iommu adds unpredictable memory latency. (Mika)
v7: Also markup the CSB read with READ_ONCE() as it may still be an mmio read and we want to stop the compiler from issuing a later (v.slow) reload.

Suggested-by: Ben Widawsky <benjamin.widawsky@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Zhi Wang <zhi.a.wang@intel.com>
Acked-by: Michel Thierry <michel.thierry@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20170913133534.26927-1-chris@chris-wilson.co.uk
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
This commit is contained in:
parent
34a04e5e46
commit
6d2cb5aa38
3 changed files with 38 additions and 7 deletions
|
@ -3315,6 +3315,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
|
||||||
upper_32_bits(addr), lower_32_bits(addr));
|
upper_32_bits(addr), lower_32_bits(addr));
|
||||||
|
|
||||||
if (i915.enable_execlists) {
|
if (i915.enable_execlists) {
|
||||||
|
const u32 *hws = &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
|
||||||
u32 ptr, read, write;
|
u32 ptr, read, write;
|
||||||
unsigned int idx;
|
unsigned int idx;
|
||||||
|
|
||||||
|
@ -3337,10 +3338,12 @@ static int i915_engine_info(struct seq_file *m, void *unused)
|
||||||
write += GEN8_CSB_ENTRIES;
|
write += GEN8_CSB_ENTRIES;
|
||||||
while (read < write) {
|
while (read < write) {
|
||||||
idx = ++read % GEN8_CSB_ENTRIES;
|
idx = ++read % GEN8_CSB_ENTRIES;
|
||||||
seq_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: %d\n",
|
seq_printf(m, "\tExeclist CSB[%d]: 0x%08x [0x%08x in hwsp], context: %d [%d in hwsp]\n",
|
||||||
idx,
|
idx,
|
||||||
I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)),
|
I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)),
|
||||||
I915_READ(RING_CONTEXT_STATUS_BUF_HI(engine, idx)));
|
hws[idx * 2],
|
||||||
|
I915_READ(RING_CONTEXT_STATUS_BUF_HI(engine, idx)),
|
||||||
|
hws[idx * 2 + 1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
rcu_read_lock();
|
rcu_read_lock();
|
||||||
|
|
|
@ -541,10 +541,17 @@ static void intel_lrc_irq_handler(unsigned long data)
|
||||||
while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) {
|
while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) {
|
||||||
u32 __iomem *csb_mmio =
|
u32 __iomem *csb_mmio =
|
||||||
dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
|
dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
|
||||||
u32 __iomem *buf =
|
/* The HWSP contains a (cacheable) mirror of the CSB */
|
||||||
dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0));
|
const u32 *buf =
|
||||||
|
&engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
|
||||||
unsigned int head, tail;
|
unsigned int head, tail;
|
||||||
|
|
||||||
|
/* However GVT emulation depends upon intercepting CSB mmio */
|
||||||
|
if (unlikely(engine->csb_use_mmio)) {
|
||||||
|
buf = (u32 * __force)
|
||||||
|
(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
|
||||||
|
}
|
||||||
|
|
||||||
/* The write will be ordered by the uncached read (itself
|
/* The write will be ordered by the uncached read (itself
|
||||||
* a memory barrier), so we do not need another in the form
|
* a memory barrier), so we do not need another in the form
|
||||||
* of a locked instruction. The race between the interrupt
|
* of a locked instruction. The race between the interrupt
|
||||||
|
@ -584,13 +591,12 @@ static void intel_lrc_irq_handler(unsigned long data)
|
||||||
* status notifier.
|
* status notifier.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
status = readl(buf + 2 * head);
|
status = READ_ONCE(buf[2 * head]); /* maybe mmio! */
|
||||||
if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
|
if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
/* Check the context/desc id for this event matches */
|
/* Check the context/desc id for this event matches */
|
||||||
GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) !=
|
GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
|
||||||
port->context_id);
|
|
||||||
|
|
||||||
rq = port_unpack(port, &count);
|
rq = port_unpack(port, &count);
|
||||||
GEM_BUG_ON(count == 0);
|
GEM_BUG_ON(count == 0);
|
||||||
|
@ -1720,6 +1726,23 @@ logical_ring_default_irqs(struct intel_engine_cs *engine)
|
||||||
engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
|
engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool irq_handler_force_mmio(struct drm_i915_private *i915)
|
||||||
|
{
|
||||||
|
/* GVT emulation depends upon intercepting CSB mmio */
|
||||||
|
if (intel_vgpu_active(i915))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* IOMMU adds unpredictable latency causing the CSB write (from the
|
||||||
|
* GPU into the HWSP) to only be visible some time after the interrupt
|
||||||
|
* (missed breadcrumb syndrome).
|
||||||
|
*/
|
||||||
|
if (intel_vtd_active())
|
||||||
|
return true;
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
logical_ring_setup(struct intel_engine_cs *engine)
|
logical_ring_setup(struct intel_engine_cs *engine)
|
||||||
{
|
{
|
||||||
|
@ -1731,6 +1754,8 @@ logical_ring_setup(struct intel_engine_cs *engine)
|
||||||
/* Intentionally left blank. */
|
/* Intentionally left blank. */
|
||||||
engine->buffer = NULL;
|
engine->buffer = NULL;
|
||||||
|
|
||||||
|
engine->csb_use_mmio = irq_handler_force_mmio(dev_priv);
|
||||||
|
|
||||||
fw_domains = intel_uncore_forcewake_for_reg(dev_priv,
|
fw_domains = intel_uncore_forcewake_for_reg(dev_priv,
|
||||||
RING_ELSP(engine),
|
RING_ELSP(engine),
|
||||||
FW_REG_WRITE);
|
FW_REG_WRITE);
|
||||||
|
|
|
@ -391,6 +391,7 @@ struct intel_engine_cs {
|
||||||
struct rb_root execlist_queue;
|
struct rb_root execlist_queue;
|
||||||
struct rb_node *execlist_first;
|
struct rb_node *execlist_first;
|
||||||
unsigned int fw_domains;
|
unsigned int fw_domains;
|
||||||
|
bool csb_use_mmio;
|
||||||
|
|
||||||
/* Contexts are pinned whilst they are active on the GPU. The last
|
/* Contexts are pinned whilst they are active on the GPU. The last
|
||||||
* context executed remains active whilst the GPU is idle - the
|
* context executed remains active whilst the GPU is idle - the
|
||||||
|
@ -496,6 +497,8 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
|
||||||
#define I915_GEM_HWS_SCRATCH_INDEX 0x40
|
#define I915_GEM_HWS_SCRATCH_INDEX 0x40
|
||||||
#define I915_GEM_HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH_INDEX << MI_STORE_DWORD_INDEX_SHIFT)
|
#define I915_GEM_HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH_INDEX << MI_STORE_DWORD_INDEX_SHIFT)
|
||||||
|
|
||||||
|
#define I915_HWS_CSB_BUF0_INDEX 0x10
|
||||||
|
|
||||||
struct intel_ring *
|
struct intel_ring *
|
||||||
intel_engine_create_ring(struct intel_engine_cs *engine, int size);
|
intel_engine_create_ring(struct intel_engine_cs *engine, int size);
|
||||||
int intel_ring_pin(struct intel_ring *ring,
|
int intel_ring_pin(struct intel_ring *ring,
|
||||||
|
|
Loading…
Reference in a new issue