From cc87253e22fce03d801c32220eeace2f9e3ec39d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 7 Mar 2024 19:11:34 +0100 Subject: [PATCH 1/4] xen/grant-dma-iommu: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20240307181135.191192-2-u.kleine-koenig@pengutronix.de Signed-off-by: Juergen Gross --- drivers/xen/grant-dma-iommu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/xen/grant-dma-iommu.c b/drivers/xen/grant-dma-iommu.c index 6a9fe02c6bfc..2ee750a03c2f 100644 --- a/drivers/xen/grant-dma-iommu.c +++ b/drivers/xen/grant-dma-iommu.c @@ -51,14 +51,12 @@ static int grant_dma_iommu_probe(struct platform_device *pdev) return 0; } -static int grant_dma_iommu_remove(struct platform_device *pdev) +static void grant_dma_iommu_remove(struct platform_device *pdev) { struct grant_dma_iommu_device *mmu = platform_get_drvdata(pdev); platform_set_drvdata(pdev, NULL); iommu_device_unregister(&mmu->iommu); - - return 0; } static struct platform_driver grant_dma_iommu_driver = { @@ -67,7 +65,7 @@ static struct platform_driver grant_dma_iommu_driver = { .of_match_table = grant_dma_iommu_of_match, }, .probe = grant_dma_iommu_probe, - .remove = grant_dma_iommu_remove, + .remove_new = grant_dma_iommu_remove, }; static int __init grant_dma_iommu_init(void) From 38620fc4e8934f1801c7811ef39a041914ac4c1d Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Tue, 20 Feb 2024 18:43:41 +0100 Subject: [PATCH 2/4] x86/xen: attempt to inflate the memory balloon on PVH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When running as PVH or HVM Linux will use holes in the memory map as scratch space to map grants, foreign domain pages and possibly miscellaneous other stuff. However the usage of such memory map holes for Xen purposes can be problematic. The request of holesby Xen happen quite early in the kernel boot process (grant table setup already uses scratch map space), and it's possible that by then not all devices have reclaimed their MMIO space. It's not unlikely for chunks of Xen scratch map space to end up using PCI bridge MMIO window memory, which (as expected) causes quite a lot of issues in the system. At least for PVH dom0 we have the possibility of using regions marked as UNUSABLE in the e820 memory map. Either if the region is UNUSABLE in the native memory map, or it has been converted into UNUSABLE in order to hide RAM regions from dom0, the second stage translation page-tables can populate those areas without issues. PV already has this kind of logic, where the balloon driver is inflated at boot. Re-use the current logic in order to also inflate it when running as PVH. onvert UNUSABLE regions up to the ratio specified in EXTRA_MEM_RATIO to RAM, while reserving them using xen_add_extra_mem() (which is also moved so it's no longer tied to CONFIG_PV). [jgross: fixed build for CONFIG_PVH without CONFIG_XEN_PVH] Signed-off-by: Roger Pau Monné Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20240220174341.56131-1-roger.pau@citrix.com Signed-off-by: Juergen Gross --- arch/x86/include/asm/xen/hypervisor.h | 5 ++ arch/x86/platform/pvh/enlighten.c | 3 ++ arch/x86/xen/enlighten.c | 32 +++++++++++++ arch/x86/xen/enlighten_pvh.c | 68 +++++++++++++++++++++++++++ arch/x86/xen/setup.c | 44 ----------------- arch/x86/xen/xen-ops.h | 14 ++++++ drivers/xen/balloon.c | 2 - 7 files changed, 122 insertions(+), 46 deletions(-) diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index a9088250770f..64fbd2dbc5b7 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -62,6 +62,11 @@ void xen_arch_unregister_cpu(int num); #ifdef CONFIG_PVH void __init xen_pvh_init(struct boot_params *boot_params); void __init mem_map_via_hcall(struct boot_params *boot_params_p); +#ifdef CONFIG_XEN_PVH +void __init xen_reserve_extra_memory(struct boot_params *bootp); +#else +static inline void xen_reserve_extra_memory(struct boot_params *bootp) { } +#endif #endif /* Lazy mode for batching updates / context switch */ diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c index 00a92cb2c814..a12117f3d4de 100644 --- a/arch/x86/platform/pvh/enlighten.c +++ b/arch/x86/platform/pvh/enlighten.c @@ -74,6 +74,9 @@ static void __init init_pvh_bootparams(bool xen_guest) } else xen_raw_printk("Warning: Can fit ISA range into e820\n"); + if (xen_guest) + xen_reserve_extra_memory(&pvh_bootparams); + pvh_bootparams.hdr.cmd_line_ptr = pvh_start_info.cmdline_paddr; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 3c61bb98c10e..a01ca255b0c6 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -350,3 +351,34 @@ void xen_arch_unregister_cpu(int num) } EXPORT_SYMBOL(xen_arch_unregister_cpu); #endif + +/* Amount of extra memory space we add to the e820 ranges */ +struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; + +void __init xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns) +{ + unsigned int i; + + /* + * No need to check for zero size, should happen rarely and will only + * write a new entry regarded to be unused due to zero size. + */ + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { + /* Add new region. */ + if (xen_extra_mem[i].n_pfns == 0) { + xen_extra_mem[i].start_pfn = start_pfn; + xen_extra_mem[i].n_pfns = n_pfns; + break; + } + /* Append to existing region. */ + if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns == + start_pfn) { + xen_extra_mem[i].n_pfns += n_pfns; + break; + } + } + if (i == XEN_EXTRA_MEM_MAX_REGIONS) + printk(KERN_WARNING "Warning: not enough extra memory regions\n"); + + memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); +} diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index ada3868c02c2..c28f073c1df5 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include @@ -72,3 +73,70 @@ void __init mem_map_via_hcall(struct boot_params *boot_params_p) } boot_params_p->e820_entries = memmap.nr_entries; } + +/* + * Reserve e820 UNUSABLE regions to inflate the memory balloon. + * + * On PVH dom0 the host memory map is used, RAM regions available to dom0 are + * located as the same place as in the native memory map, but since dom0 gets + * less memory than the total amount of host RAM the ranges that can't be + * populated are converted from RAM -> UNUSABLE. Use such regions (up to the + * ratio signaled in EXTRA_MEM_RATIO) in order to inflate the balloon driver at + * boot. Doing so prevents the guest (even if just temporary) from using holes + * in the memory map in order to map grants or foreign addresses, and + * hopefully limits the risk of a clash with a device MMIO region. Ideally the + * hypervisor should notify us which memory ranges are suitable for creating + * foreign mappings, but that's not yet implemented. + */ +void __init xen_reserve_extra_memory(struct boot_params *bootp) +{ + unsigned int i, ram_pages = 0, extra_pages; + + for (i = 0; i < bootp->e820_entries; i++) { + struct boot_e820_entry *e = &bootp->e820_table[i]; + + if (e->type != E820_TYPE_RAM) + continue; + ram_pages += PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr); + } + + /* Max amount of extra memory. */ + extra_pages = EXTRA_MEM_RATIO * ram_pages; + + /* + * Convert UNUSABLE ranges to RAM and reserve them for foreign mapping + * purposes. + */ + for (i = 0; i < bootp->e820_entries && extra_pages; i++) { + struct boot_e820_entry *e = &bootp->e820_table[i]; + unsigned long pages; + + if (e->type != E820_TYPE_UNUSABLE) + continue; + + pages = min(extra_pages, + PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr)); + + if (pages != (PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr))) { + struct boot_e820_entry *next; + + if (bootp->e820_entries == + ARRAY_SIZE(bootp->e820_table)) + /* No space left to split - skip region. */ + continue; + + /* Split entry. */ + next = e + 1; + memmove(next, e, + (bootp->e820_entries - i) * sizeof(*e)); + bootp->e820_entries++; + next->addr = PAGE_ALIGN(e->addr) + PFN_PHYS(pages); + e->size = next->addr - e->addr; + next->size -= e->size; + } + e->type = E820_TYPE_RAM; + extra_pages -= pages; + + xen_add_extra_mem(PFN_UP(e->addr), pages); + } +} diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index b3e37961065a..380591028cb8 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -38,9 +38,6 @@ #define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024) -/* Amount of extra memory space we add to the e820 ranges */ -struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; - /* Number of pages released from the initial allocation. */ unsigned long xen_released_pages; @@ -64,18 +61,6 @@ static struct { } xen_remap_buf __initdata __aligned(PAGE_SIZE); static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; -/* - * The maximum amount of extra memory compared to the base size. The - * main scaling factor is the size of struct page. At extreme ratios - * of base:extra, all the base memory can be filled with page - * structures for the extra memory, leaving no space for anything - * else. - * - * 10x seems like a reasonable balance between scaling flexibility and - * leaving a practically usable system. - */ -#define EXTRA_MEM_RATIO (10) - static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB); static void __init xen_parse_512gb(void) @@ -96,35 +81,6 @@ static void __init xen_parse_512gb(void) xen_512gb_limit = val; } -static void __init xen_add_extra_mem(unsigned long start_pfn, - unsigned long n_pfns) -{ - int i; - - /* - * No need to check for zero size, should happen rarely and will only - * write a new entry regarded to be unused due to zero size. - */ - for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - /* Add new region. */ - if (xen_extra_mem[i].n_pfns == 0) { - xen_extra_mem[i].start_pfn = start_pfn; - xen_extra_mem[i].n_pfns = n_pfns; - break; - } - /* Append to existing region. */ - if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns == - start_pfn) { - xen_extra_mem[i].n_pfns += n_pfns; - break; - } - } - if (i == XEN_EXTRA_MEM_MAX_REGIONS) - printk(KERN_WARNING "Warning: not enough extra memory regions\n"); - - memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); -} - static void __init xen_del_extra_mem(unsigned long start_pfn, unsigned long n_pfns) { diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index a87ab36889e7..79cf93f2c92f 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -163,4 +163,18 @@ void xen_hvm_post_suspend(int suspend_cancelled); static inline void xen_hvm_post_suspend(int suspend_cancelled) {} #endif +/* + * The maximum amount of extra memory compared to the base size. The + * main scaling factor is the size of struct page. At extreme ratios + * of base:extra, all the base memory can be filled with page + * structures for the extra memory, leaving no space for anything + * else. + * + * 10x seems like a reasonable balance between scaling flexibility and + * leaving a practically usable system. + */ +#define EXTRA_MEM_RATIO (10) + +void xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns); + #endif /* XEN_OPS_H */ diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 976c6cdf9ee6..aaf2514fcfa4 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -672,7 +672,6 @@ EXPORT_SYMBOL(xen_free_ballooned_pages); static void __init balloon_add_regions(void) { -#if defined(CONFIG_XEN_PV) unsigned long start_pfn, pages; unsigned long pfn, extra_pfn_end; unsigned int i; @@ -696,7 +695,6 @@ static void __init balloon_add_regions(void) balloon_stats.total_pages += extra_pfn_end - start_pfn; } -#endif } static int __init balloon_init(void) From 51c23bd691c0f1fb95b29731c356c6fd69925d17 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 13 Mar 2024 08:14:08 +0100 Subject: [PATCH 3/4] xen/evtchn: avoid WARN() when unbinding an event channel When unbinding a user event channel, the related handler might be called a last time in case the kernel was built with CONFIG_DEBUG_SHIRQ. This might cause a WARN() in the handler. Avoid that by adding an "unbinding" flag to struct user_event which will short circuit the handler. Fixes: 9e90e58c11b7 ("xen: evtchn: Allow shared registration of IRQ handers") Reported-by: Demi Marie Obenour Tested-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Link: https://lore.kernel.org/r/20240313071409.25913-2-jgross@suse.com Signed-off-by: Juergen Gross --- drivers/xen/evtchn.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index 59717628ca42..f6a2216c2c87 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -85,6 +85,7 @@ struct user_evtchn { struct per_user_data *user; evtchn_port_t port; bool enabled; + bool unbinding; }; static void evtchn_free_ring(evtchn_port_t *ring) @@ -164,6 +165,10 @@ static irqreturn_t evtchn_interrupt(int irq, void *data) struct per_user_data *u = evtchn->user; unsigned int prod, cons; + /* Handler might be called when tearing down the IRQ. */ + if (evtchn->unbinding) + return IRQ_HANDLED; + WARN(!evtchn->enabled, "Interrupt for port %u, but apparently not enabled; per-user %p\n", evtchn->port, u); @@ -421,6 +426,7 @@ static void evtchn_unbind_from_user(struct per_user_data *u, BUG_ON(irq < 0); + evtchn->unbinding = true; unbind_from_irqhandler(irq, evtchn); del_evtchn(u, evtchn); From d277f9d82802223f242cd9b60c988cfdda1d6be0 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 13 Mar 2024 08:14:09 +0100 Subject: [PATCH 4/4] xen/events: increment refcnt only if event channel is refcounted In bind_evtchn_to_irq_chip() don't increment the refcnt of the event channel blindly. In case the event channel is NOT refcounted, issue a warning instead. Add an additional safety net by doing the refcnt increment only if the caller has specified IRQF_SHARED in the irqflags parameter. Fixes: 9e90e58c11b7 ("xen: evtchn: Allow shared registration of IRQ handers") Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Link: https://lore.kernel.org/r/20240313071409.25913-3-jgross@suse.com Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 3b9f080109d7..27553673e46b 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1190,7 +1190,7 @@ int xen_pirq_from_irq(unsigned irq) EXPORT_SYMBOL_GPL(xen_pirq_from_irq); static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, - struct xenbus_device *dev) + struct xenbus_device *dev, bool shared) { int ret = -ENOMEM; struct irq_info *info; @@ -1224,7 +1224,8 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, */ bind_evtchn_to_cpu(info, 0, false); } else if (!WARN_ON(info->type != IRQT_EVTCHN)) { - info->refcnt++; + if (shared && !WARN_ON(info->refcnt < 0)) + info->refcnt++; } ret = info->irq; @@ -1237,13 +1238,13 @@ out: int bind_evtchn_to_irq(evtchn_port_t evtchn) { - return bind_evtchn_to_irq_chip(evtchn, &xen_dynamic_chip, NULL); + return bind_evtchn_to_irq_chip(evtchn, &xen_dynamic_chip, NULL, false); } EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn) { - return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip, NULL); + return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip, NULL, false); } EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi); @@ -1295,7 +1296,8 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) static int bind_interdomain_evtchn_to_irq_chip(struct xenbus_device *dev, evtchn_port_t remote_port, - struct irq_chip *chip) + struct irq_chip *chip, + bool shared) { struct evtchn_bind_interdomain bind_interdomain; int err; @@ -1307,14 +1309,14 @@ static int bind_interdomain_evtchn_to_irq_chip(struct xenbus_device *dev, &bind_interdomain); return err ? : bind_evtchn_to_irq_chip(bind_interdomain.local_port, - chip, dev); + chip, dev, shared); } int bind_interdomain_evtchn_to_irq_lateeoi(struct xenbus_device *dev, evtchn_port_t remote_port) { return bind_interdomain_evtchn_to_irq_chip(dev, remote_port, - &xen_lateeoi_chip); + &xen_lateeoi_chip, false); } EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq_lateeoi); @@ -1430,7 +1432,8 @@ static int bind_evtchn_to_irqhandler_chip(evtchn_port_t evtchn, { int irq, retval; - irq = bind_evtchn_to_irq_chip(evtchn, chip, NULL); + irq = bind_evtchn_to_irq_chip(evtchn, chip, NULL, + irqflags & IRQF_SHARED); if (irq < 0) return irq; retval = request_irq(irq, handler, irqflags, devname, dev_id); @@ -1471,7 +1474,8 @@ static int bind_interdomain_evtchn_to_irqhandler_chip( { int irq, retval; - irq = bind_interdomain_evtchn_to_irq_chip(dev, remote_port, chip); + irq = bind_interdomain_evtchn_to_irq_chip(dev, remote_port, chip, + irqflags & IRQF_SHARED); if (irq < 0) return irq;