From a5e3b127455d073f146a2a4ea3e7117635d34c5c Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Thu, 2 Nov 2023 10:36:49 +0100 Subject: [PATCH 1/4] swiotlb: do not free decrypted pages if dynamic Fix these two error paths: 1. When set_memory_decrypted() fails, pages may be left fully or partially decrypted. 2. Decrypted pages may be freed if swiotlb_alloc_tlb() determines that the physical address is too high. To fix the first issue, call set_memory_encrypted() on the allocated region after a failed decryption attempt. If that also fails, leak the pages. To fix the second issue, check that the TLB physical address is below the requested limit before decrypting. Let the caller differentiate between unsuitable physical address (=> retry from a lower zone) and allocation failures (=> no point in retrying). Cc: stable@vger.kernel.org Fixes: 79636caad361 ("swiotlb: if swiotlb is full, fall back to a transient memory pool") Signed-off-by: Petr Tesarik Reviewed-by: Rick Edgecombe Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 26202274784f..71392c9fac10 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -558,29 +558,40 @@ void __init swiotlb_exit(void) * alloc_dma_pages() - allocate pages to be used for DMA * @gfp: GFP flags for the allocation. * @bytes: Size of the buffer. + * @phys_limit: Maximum allowed physical address of the buffer. * * Allocate pages from the buddy allocator. If successful, make the allocated * pages decrypted that they can be used for DMA. * - * Return: Decrypted pages, or %NULL on failure. + * Return: Decrypted pages, %NULL on allocation failure, or ERR_PTR(-EAGAIN) + * if the allocated physical address was above @phys_limit. */ -static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes) +static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit) { unsigned int order = get_order(bytes); struct page *page; + phys_addr_t paddr; void *vaddr; page = alloc_pages(gfp, order); if (!page) return NULL; - vaddr = page_address(page); + paddr = page_to_phys(page); + if (paddr + bytes - 1 > phys_limit) { + __free_pages(page, order); + return ERR_PTR(-EAGAIN); + } + + vaddr = phys_to_virt(paddr); if (set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes))) goto error; return page; error: - __free_pages(page, order); + /* Intentional leak if pages cannot be encrypted again. */ + if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes))) + __free_pages(page, order); return NULL; } @@ -618,11 +629,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes, else if (phys_limit <= DMA_BIT_MASK(32)) gfp |= __GFP_DMA32; - while ((page = alloc_dma_pages(gfp, bytes)) && - page_to_phys(page) + bytes - 1 > phys_limit) { - /* allocated, but too high */ - __free_pages(page, get_order(bytes)); - + while (IS_ERR(page = alloc_dma_pages(gfp, bytes, phys_limit))) { if (IS_ENABLED(CONFIG_ZONE_DMA32) && phys_limit < DMA_BIT_MASK(64) && !(gfp & (__GFP_DMA32 | __GFP_DMA))) From 8ae0e970319ac0b516d285650a744bab4ed3dd37 Mon Sep 17 00:00:00 2001 From: Jia He Date: Sat, 28 Oct 2023 10:20:58 +0000 Subject: [PATCH 2/4] dma-mapping: move dma_addressing_limited() out of line This patch moves dma_addressing_limited() out of line, serving as a preliminary step to prevent the introduction of a new publicly accessible low-level helper when validating whether all system RAM is mapped within the DMA mapping range. Suggested-by: Christoph Hellwig Signed-off-by: Jia He Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 19 +++++-------------- kernel/dma/mapping.c | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index f0ccca16a0ac..4a658de44ee9 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -144,6 +144,7 @@ bool dma_pci_p2pdma_supported(struct device *dev); int dma_set_mask(struct device *dev, u64 mask); int dma_set_coherent_mask(struct device *dev, u64 mask); u64 dma_get_required_mask(struct device *dev); +bool dma_addressing_limited(struct device *dev); size_t dma_max_mapping_size(struct device *dev); size_t dma_opt_mapping_size(struct device *dev); bool dma_need_sync(struct device *dev, dma_addr_t dma_addr); @@ -264,6 +265,10 @@ static inline u64 dma_get_required_mask(struct device *dev) { return 0; } +static inline bool dma_addressing_limited(struct device *dev) +{ + return false; +} static inline size_t dma_max_mapping_size(struct device *dev) { return 0; @@ -465,20 +470,6 @@ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask) return dma_set_mask_and_coherent(dev, mask); } -/** - * dma_addressing_limited - return if the device is addressing limited - * @dev: device to check - * - * Return %true if the devices DMA mask is too small to address all memory in - * the system, else %false. Lack of addressing bits is the prime reason for - * bounce buffering, but might not be the only one. - */ -static inline bool dma_addressing_limited(struct device *dev) -{ - return min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < - dma_get_required_mask(dev); -} - static inline unsigned int dma_get_max_seg_size(struct device *dev) { if (dev->dma_parms && dev->dma_parms->max_segment_size) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index e323ca48f7f2..7789c86f7ba3 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -793,6 +793,21 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_coherent_mask); +/** + * dma_addressing_limited - return if the device is addressing limited + * @dev: device to check + * + * Return %true if the devices DMA mask is too small to address all memory in + * the system, else %false. Lack of addressing bits is the prime reason for + * bounce buffering, but might not be the only one. + */ +bool dma_addressing_limited(struct device *dev) +{ + return min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < + dma_get_required_mask(dev); +} +EXPORT_SYMBOL_GPL(dma_addressing_limited); + size_t dma_max_mapping_size(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); From a409d9600959f3c4b2a48946304c8e01b8d04072 Mon Sep 17 00:00:00 2001 From: Jia He Date: Sat, 28 Oct 2023 10:20:59 +0000 Subject: [PATCH 3/4] dma-mapping: fix dma_addressing_limited() if dma_range_map can't cover all system RAM There is an unusual case that the range map covers right up to the top of system RAM, but leaves a hole somewhere lower down. Then it prevents the nvme device dma mapping in the checking path of phys_to_dma() and causes the hangs at boot. E.g. On an Armv8 Ampere server, the dsdt ACPI table is: Method (_DMA, 0, Serialized) // _DMA: Direct Memory Access { Name (RBUF, ResourceTemplate () { QWordMemory (ResourceConsumer, PosDecode, MinFixed, MaxFixed, Cacheable, ReadWrite, 0x0000000000000000, // Granularity 0x0000000000000000, // Range Minimum 0x00000000FFFFFFFF, // Range Maximum 0x0000000000000000, // Translation Offset 0x0000000100000000, // Length ,, , AddressRangeMemory, TypeStatic) QWordMemory (ResourceConsumer, PosDecode, MinFixed, MaxFixed, Cacheable, ReadWrite, 0x0000000000000000, // Granularity 0x0000006010200000, // Range Minimum 0x000000602FFFFFFF, // Range Maximum 0x0000000000000000, // Translation Offset 0x000000001FE00000, // Length ,, , AddressRangeMemory, TypeStatic) QWordMemory (ResourceConsumer, PosDecode, MinFixed, MaxFixed, Cacheable, ReadWrite, 0x0000000000000000, // Granularity 0x00000060F0000000, // Range Minimum 0x00000060FFFFFFFF, // Range Maximum 0x0000000000000000, // Translation Offset 0x0000000010000000, // Length ,, , AddressRangeMemory, TypeStatic) QWordMemory (ResourceConsumer, PosDecode, MinFixed, MaxFixed, Cacheable, ReadWrite, 0x0000000000000000, // Granularity 0x0000007000000000, // Range Minimum 0x000003FFFFFFFFFF, // Range Maximum 0x0000000000000000, // Translation Offset 0x0000039000000000, // Length ,, , AddressRangeMemory, TypeStatic) }) But the System RAM ranges are: cat /proc/iomem |grep -i ram 90000000-91ffffff : System RAM 92900000-fffbffff : System RAM 880000000-fffffffff : System RAM 8800000000-bff5990fff : System RAM bff59d0000-bff5a4ffff : System RAM bff8000000-bfffffffff : System RAM So some RAM ranges are out of dma_range_map. Fix it by checking whether each of the system RAM resources can be properly encompassed within the dma_range_map. Signed-off-by: Jia He Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 40 ++++++++++++++++++++++++++++++++++++++++ kernel/dma/direct.h | 1 + kernel/dma/mapping.c | 11 +++++++++-- 3 files changed, 50 insertions(+), 2 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index ed3056eb20b8..73c95815789a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -587,6 +587,46 @@ int dma_direct_supported(struct device *dev, u64 mask) return mask >= phys_to_dma_unencrypted(dev, min_mask); } +/* + * To check whether all ram resource ranges are covered by dma range map + * Returns 0 when further check is needed + * Returns 1 if there is some RAM range can't be covered by dma_range_map + */ +static int check_ram_in_range_map(unsigned long start_pfn, + unsigned long nr_pages, void *data) +{ + unsigned long end_pfn = start_pfn + nr_pages; + const struct bus_dma_region *bdr = NULL; + const struct bus_dma_region *m; + struct device *dev = data; + + while (start_pfn < end_pfn) { + for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) { + unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start); + + if (start_pfn >= cpu_start_pfn && + start_pfn - cpu_start_pfn < PFN_DOWN(m->size)) { + bdr = m; + break; + } + } + if (!bdr) + return 1; + + start_pfn = PFN_DOWN(bdr->cpu_start) + PFN_DOWN(bdr->size); + } + + return 0; +} + +bool dma_direct_all_ram_mapped(struct device *dev) +{ + if (!dev->dma_range_map) + return true; + return !walk_system_ram_range(0, PFN_DOWN(ULONG_MAX) + 1, dev, + check_ram_in_range_map); +} + size_t dma_direct_max_mapping_size(struct device *dev) { /* If SWIOTLB is active, use its maximum mapping size */ diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index 97ec892ea0b5..18d346118fe8 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -20,6 +20,7 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr); int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs); +bool dma_direct_all_ram_mapped(struct device *dev); size_t dma_direct_max_mapping_size(struct device *dev); #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 7789c86f7ba3..58db8fd70471 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -803,8 +803,15 @@ EXPORT_SYMBOL(dma_set_coherent_mask); */ bool dma_addressing_limited(struct device *dev) { - return min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < - dma_get_required_mask(dev); + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < + dma_get_required_mask(dev)) + return true; + + if (unlikely(ops)) + return false; + return !dma_direct_all_ram_mapped(dev); } EXPORT_SYMBOL_GPL(dma_addressing_limited); From 53c87e846e335e3c18044c397cc35178163d7827 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Wed, 8 Nov 2023 12:12:49 +0100 Subject: [PATCH 4/4] swiotlb: fix out-of-bounds TLB allocations with CONFIG_SWIOTLB_DYNAMIC Limit the free list length to the size of the IO TLB. Transient pool can be smaller than IO_TLB_SEGSIZE, but the free list is initialized with the assumption that the total number of slots is a multiple of IO_TLB_SEGSIZE. As a result, swiotlb_area_find_slots() may allocate slots past the end of a transient IO TLB buffer. Reported-by: Niklas Schnelle Closes: https://lore.kernel.org/linux-iommu/104a8c8fedffd1ff8a2890983e2ec1c26bff6810.camel@linux.ibm.com/ Fixes: 79636caad361 ("swiotlb: if swiotlb is full, fall back to a transient memory pool") Cc: stable@vger.kernel.org Signed-off-by: Petr Tesarik Reviewed-by: Halil Pasic Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 71392c9fac10..33d942615be5 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -283,7 +283,8 @@ static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start, } for (i = 0; i < mem->nslabs; i++) { - mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); + mem->slots[i].list = min(IO_TLB_SEGSIZE - io_tlb_offset(i), + mem->nslabs - i); mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; }