diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index be0b9cf941c4..19e94af9cc5d 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -454,6 +454,14 @@ static inline void ack_APIC_irq(void) apic_eoi(); } + +static inline bool lapic_vector_set_in_irr(unsigned int vector) +{ + u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + + return !!(irr & (1U << (vector % 32))); +} + static inline unsigned default_get_apic_id(unsigned long x) { unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 7f7533462474..159bd0cb8548 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -23,10 +23,8 @@ static struct irq_domain *msi_default_domain; -static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) { - struct irq_cfg *cfg = irqd_cfg(data); - msg->address_hi = MSI_ADDR_BASE_HI; if (x2apic_enabled()) @@ -47,6 +45,127 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) MSI_DATA_VECTOR(cfg->vector); } +static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + __irq_msi_compose_msg(irqd_cfg(data), msg); +} + +static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg) +{ + struct msi_msg msg[2] = { [1] = { }, }; + + __irq_msi_compose_msg(cfg, msg); + irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg); +} + +static int +msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) +{ + struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd); + struct irq_data *parent = irqd->parent_data; + unsigned int cpu; + int ret; + + /* Save the current configuration */ + cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd)); + old_cfg = *cfg; + + /* Allocate a new target vector */ + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) + return ret; + + /* + * For non-maskable and non-remapped MSI interrupts the migration + * to a different destination CPU and a different vector has to be + * done careful to handle the possible stray interrupt which can be + * caused by the non-atomic update of the address/data pair. + * + * Direct update is possible when: + * - The MSI is maskable (remapped MSI does not use this code path)). + * The quirk bit is not set in this case. + * - The new vector is the same as the old vector + * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up) + * - The new destination CPU is the same as the old destination CPU + */ + if (!irqd_msi_nomask_quirk(irqd) || + cfg->vector == old_cfg.vector || + old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR || + cfg->dest_apicid == old_cfg.dest_apicid) { + irq_msi_update_msg(irqd, cfg); + return ret; + } + + /* + * Paranoia: Validate that the interrupt target is the local + * CPU. + */ + if (WARN_ON_ONCE(cpu != smp_processor_id())) { + irq_msi_update_msg(irqd, cfg); + return ret; + } + + /* + * Redirect the interrupt to the new vector on the current CPU + * first. This might cause a spurious interrupt on this vector if + * the device raises an interrupt right between this update and the + * update to the final destination CPU. + * + * If the vector is in use then the installed device handler will + * denote it as spurious which is no harm as this is a rare event + * and interrupt handlers have to cope with spurious interrupts + * anyway. If the vector is unused, then it is marked so it won't + * trigger the 'No irq handler for vector' warning in do_IRQ(). + * + * This requires to hold vector lock to prevent concurrent updates to + * the affected vector. + */ + lock_vector_lock(); + + /* + * Mark the new target vector on the local CPU if it is currently + * unused. Reuse the VECTOR_RETRIGGERED state which is also used in + * the CPU hotplug path for a similar purpose. This cannot be + * undone here as the current CPU has interrupts disabled and + * cannot handle the interrupt before the whole set_affinity() + * section is done. In the CPU unplug case, the current CPU is + * about to vanish and will not handle any interrupts anymore. The + * vector is cleaned up when the CPU comes online again. + */ + if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector]))) + this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED); + + /* Redirect it to the new vector on the local CPU temporarily */ + old_cfg.vector = cfg->vector; + irq_msi_update_msg(irqd, &old_cfg); + + /* Now transition it to the target CPU */ + irq_msi_update_msg(irqd, cfg); + + /* + * All interrupts after this point are now targeted at the new + * vector/CPU. + * + * Drop vector lock before testing whether the temporary assignment + * to the local CPU was hit by an interrupt raised in the device, + * because the retrigger function acquires vector lock again. + */ + unlock_vector_lock(); + + /* + * Check whether the transition raced with a device interrupt and + * is pending in the local APICs IRR. It is safe to do this outside + * of vector lock as the irq_desc::lock of this interrupt is still + * held and interrupts are disabled: The check is not accessing the + * underlying vector store. It's just checking the local APIC's + * IRR. + */ + if (lapic_vector_set_in_irr(cfg->vector)) + irq_data_get_irq_chip(irqd)->irq_retrigger(irqd); + + return ret; +} + /* * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, * which implement the MSI or MSI-X Capability Structure. @@ -58,6 +177,7 @@ static struct irq_chip pci_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_set_affinity = msi_set_affinity, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -146,6 +266,8 @@ void __init arch_init_msi_domain(struct irq_domain *parent) } if (!msi_default_domain) pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); + else + msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; } #ifdef CONFIG_IRQ_REMAP diff --git a/include/linux/irq.h b/include/linux/irq.h index 7853eb9301f2..3ed5a055b5f4 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -209,6 +209,8 @@ struct irq_data { * IRQD_SINGLE_TARGET - IRQ allows only a single affinity target * IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set * IRQD_CAN_RESERVE - Can use reservation mode + * IRQD_MSI_NOMASK_QUIRK - Non-maskable MSI quirk for affinity change + * required */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -231,6 +233,7 @@ enum { IRQD_SINGLE_TARGET = (1 << 24), IRQD_DEFAULT_TRIGGER_SET = (1 << 25), IRQD_CAN_RESERVE = (1 << 26), + IRQD_MSI_NOMASK_QUIRK = (1 << 27), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) @@ -390,6 +393,21 @@ static inline bool irqd_can_reserve(struct irq_data *d) return __irqd_to_state(d) & IRQD_CAN_RESERVE; } +static inline void irqd_set_msi_nomask_quirk(struct irq_data *d) +{ + __irqd_to_state(d) |= IRQD_MSI_NOMASK_QUIRK; +} + +static inline void irqd_clr_msi_nomask_quirk(struct irq_data *d) +{ + __irqd_to_state(d) &= ~IRQD_MSI_NOMASK_QUIRK; +} + +static inline bool irqd_msi_nomask_quirk(struct irq_data *d) +{ + return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK; +} + #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 3c340dbc5a1f..4da8df57618a 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -206,6 +206,13 @@ enum { /* Irq domain implements MSI remapping */ IRQ_DOMAIN_FLAG_MSI_REMAP = (1 << 5), + /* + * Quirk to handle MSI implementations which do not provide + * masking. Currently known to affect x86, but partially + * handled in core code. + */ + IRQ_DOMAIN_MSI_NOMASK_QUIRK = (1 << 6), + /* * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved * for implementation specific purposes and ignored by the diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index c1eccd4f6520..a949bd39e343 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -114,6 +114,7 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), BIT_MASK_DESCR(IRQD_CAN_RESERVE), + BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK), BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index ad26fbcfbfc8..eb95f6106a1e 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -453,8 +453,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, continue; irq_data = irq_domain_get_irq_data(domain, desc->irq); - if (!can_reserve) + if (!can_reserve) { irqd_clr_can_reserve(irq_data); + if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK) + irqd_set_msi_nomask_quirk(irq_data); + } ret = irq_domain_activate_irq(irq_data, can_reserve); if (ret) goto cleanup;