From 25a4ce564921db0973b74c04e0ea23bd2ee12a3a Mon Sep 17 00:00:00 2001
From: Petr Tesarik
Date: Mon, 20 Feb 2023 16:06:22 +0100
Subject: [PATCH 01/11] dma-direct: cleanup parameters to
 dma_direct_optimal_gfp_mask

Since both callers of dma_direct_optimal_gfp_mask() pass
dev->coherent_dma_mask as the second argument, it is better to remove
that parameter altogether. Not only is reducing the number of
parameters good for readability, but the new function signature is also
more logical: the optimal flags depend only on data contained in struct
device.

While touching this code, let's also rename phys_mask to phys_limit in
dma_direct_alloc_from_pool(), because it is indeed a limit.

Signed-off-by: Petr Tesarik
Signed-off-by: Christoph Hellwig
---
 kernel/dma/direct.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 63859a101ed8..5595d1d5cdcc 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -44,10 +44,11 @@ u64 dma_direct_get_required_mask(struct device *dev)
         return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
 }
 
-static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
-                u64 *phys_limit)
+static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit)
 {
-        u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit);
+        u64 dma_limit = min_not_zero(
+                dev->coherent_dma_mask,
+                dev->bus_dma_limit);
 
         /*
          * Optimistically try the zone that the physical address mask falls
@@ -126,8 +127,7 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
         if (is_swiotlb_for_alloc(dev))
                 return dma_direct_alloc_swiotlb(dev, size);
 
-        gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
-                        &phys_limit);
+        gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit);
         page = dma_alloc_contiguous(dev, size, gfp);
         if (page) {
                 if (!dma_coherent_ok(dev, page_to_phys(page), size) ||
@@ -172,14 +172,13 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
                 dma_addr_t *dma_handle, gfp_t gfp)
 {
         struct page *page;
-        u64 phys_mask;
+        u64 phys_limit;
         void *ret;
 
         if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)))
                 return NULL;
 
-        gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
-                        &phys_mask);
+        gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit);
         page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok);
         if (!page)
                 return NULL;
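For context, the first hunk elides the rest of the function. Roughly (reconstructed from memory and lightly trimmed, not part of the diff), the remainder shows how phys_limit drives the zone choice, and why the result now depends only on data in struct device:

        static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit)
        {
                u64 dma_limit = min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);

                /* Report the physical limit back to the caller. */
                *phys_limit = dma_to_phys(dev, dma_limit);

                /* Pick the smallest zone that can satisfy the limit; GFP_DMA and
                 * GFP_DMA32 are no-ops on configurations without those zones. */
                if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
                        return GFP_DMA;
                if (*phys_limit <= DMA_BIT_MASK(32))
                        return GFP_DMA32;
                return 0;
        }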
From 479623fd0c5ad86a43ae11670f7da2a7c462acd4 Mon Sep 17 00:00:00 2001
From: Desnes Nunes
Date: Thu, 16 Mar 2023 11:09:10 -0300
Subject: [PATCH 02/11] dma-debug: small dma_debug_entry's comment and
 variable name updates

Update the comment on struct dma_debug_entry, and standardize on the
'dma_addr' variable name used by debug_dma_map_page() in
debug_dma_unmap_page() and, similarly, in debug_dma_free_coherent().

Signed-off-by: Desnes Nunes
Signed-off-by: Christoph Hellwig
---
 kernel/dma/debug.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 18c93c2276ca..9aa5100d0d7e 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -53,6 +53,7 @@ enum map_err_types {
  * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping
  * @list: node on pre-allocated free_entries list
  * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent
+ * @dev_addr: dma address
  * @size: length of the mapping
  * @type: single, page, sg, coherent
  * @direction: enum dma_data_direction
@@ -1262,13 +1263,13 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 }
 EXPORT_SYMBOL(debug_dma_mapping_error);
 
-void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
+void debug_dma_unmap_page(struct device *dev, dma_addr_t dma_addr,
                           size_t size, int direction)
 {
         struct dma_debug_entry ref = {
                 .type           = dma_debug_single,
                 .dev            = dev,
-                .dev_addr       = addr,
+                .dev_addr       = dma_addr,
                 .size           = size,
                 .direction      = direction,
         };
@@ -1403,13 +1404,13 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size,
 }
 
 void debug_dma_free_coherent(struct device *dev, size_t size,
-                        void *virt, dma_addr_t addr)
+                        void *virt, dma_addr_t dma_addr)
 {
         struct dma_debug_entry ref = {
                 .type           = dma_debug_coherent,
                 .dev            = dev,
                 .offset         = offset_in_page(virt),
-                .dev_addr       = addr,
+                .dev_addr       = dma_addr,
                 .size           = size,
                 .direction      = DMA_BIDIRECTIONAL,
         };
From bd89d69a529fbef3559a16dbe4cee4b04225136a Mon Sep 17 00:00:00 2001
From: Desnes Nunes
Date: Thu, 16 Mar 2023 11:09:11 -0300
Subject: [PATCH 03/11] dma-debug: add cacheline to user/kernel space dump
 messages

Having the cacheline also printed by debug_dma_dump_mappings() and
dump_show() is useful for debugging. Furthermore, this standardizes
the messages shown by both dump functions.

Signed-off-by: Desnes Nunes
Signed-off-by: Christoph Hellwig
---
 kernel/dma/debug.c | 122 ++++++++++++++++++++++++---------------------
 1 file changed, 64 insertions(+), 58 deletions(-)

diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 9aa5100d0d7e..676142072d99 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -396,37 +396,6 @@ static unsigned long long phys_addr(struct dma_debug_entry *entry)
         return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset;
 }
 
-/*
- * Dump mapping entries for debugging purposes
- */
-void debug_dma_dump_mappings(struct device *dev)
-{
-        int idx;
-
-        for (idx = 0; idx < HASH_SIZE; idx++) {
-                struct hash_bucket *bucket = &dma_entry_hash[idx];
-                struct dma_debug_entry *entry;
-                unsigned long flags;
-
-                spin_lock_irqsave(&bucket->lock, flags);
-
-                list_for_each_entry(entry, &bucket->list, list) {
-                        if (!dev || dev == entry->dev) {
-                                dev_info(entry->dev,
-                                         "%s idx %d P=%Lx N=%lx D=%Lx L=%Lx %s %s\n",
-                                         type2name[entry->type], idx,
-                                         phys_addr(entry), entry->pfn,
-                                         entry->dev_addr, entry->size,
-                                         dir2name[entry->direction],
-                                         maperr2str[entry->map_err_type]);
-                        }
-                }
-
-                spin_unlock_irqrestore(&bucket->lock, flags);
-                cond_resched();
-        }
-}
-
 /*
  * For each mapping (initial cacheline in the case of
  * dma_alloc_coherent/dma_map_page, initial cacheline in each page of a
@@ -547,6 +516,70 @@ static void active_cacheline_remove(struct dma_debug_entry *entry)
         spin_unlock_irqrestore(&radix_lock, flags);
 }
 
+/*
+ * Dump mappings entries on kernel space for debugging purposes
+ */
+void debug_dma_dump_mappings(struct device *dev)
+{
+        int idx;
+        phys_addr_t cln;
+
+        for (idx = 0; idx < HASH_SIZE; idx++) {
+                struct hash_bucket *bucket = &dma_entry_hash[idx];
+                struct dma_debug_entry *entry;
+                unsigned long flags;
+
+                spin_lock_irqsave(&bucket->lock, flags);
+                list_for_each_entry(entry, &bucket->list, list) {
+                        if (!dev || dev == entry->dev) {
+                                cln = to_cacheline_number(entry);
+                                dev_info(entry->dev,
+                                         "%s idx %d P=%llx N=%lx D=%llx L=%llx cln=%llx %s %s\n",
+                                         type2name[entry->type], idx,
+                                         phys_addr(entry), entry->pfn,
+                                         entry->dev_addr, entry->size,
+                                         cln, dir2name[entry->direction],
+                                         maperr2str[entry->map_err_type]);
+                        }
+                }
+                spin_unlock_irqrestore(&bucket->lock, flags);
+
+                cond_resched();
+        }
+}
+
+/*
+ * Dump mappings entries on user space via debugfs
+ */
+static int dump_show(struct seq_file *seq, void *v)
+{
+        int idx;
+        phys_addr_t cln;
+
+        for (idx = 0; idx < HASH_SIZE; idx++) {
+                struct hash_bucket *bucket = &dma_entry_hash[idx];
+                struct dma_debug_entry *entry;
+                unsigned long flags;
+
+                spin_lock_irqsave(&bucket->lock, flags);
+                list_for_each_entry(entry, &bucket->list, list) {
+                        cln = to_cacheline_number(entry);
+                        seq_printf(seq,
+                                   "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx cln=%llx %s %s\n",
+                                   dev_driver_string(entry->dev),
+                                   dev_name(entry->dev),
+                                   type2name[entry->type], idx,
+                                   phys_addr(entry), entry->pfn,
+                                   entry->dev_addr, entry->size,
+                                   cln, dir2name[entry->direction],
+                                   maperr2str[entry->map_err_type]);
+                }
+                spin_unlock_irqrestore(&bucket->lock, flags);
+        }
+        return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dump);
+
 /*
  * Wrapper function for adding an entry to the hash.
  * This function takes care of locking itself.
@@ -765,33 +798,6 @@ static const struct file_operations filter_fops = {
         .llseek = default_llseek,
 };
 
-static int dump_show(struct seq_file *seq, void *v)
-{
-        int idx;
-
-        for (idx = 0; idx < HASH_SIZE; idx++) {
-                struct hash_bucket *bucket = &dma_entry_hash[idx];
-                struct dma_debug_entry *entry;
-                unsigned long flags;
-
-                spin_lock_irqsave(&bucket->lock, flags);
-                list_for_each_entry(entry, &bucket->list, list) {
-                        seq_printf(seq,
-                                   "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx %s %s\n",
-                                   dev_name(entry->dev),
-                                   dev_driver_string(entry->dev),
-                                   type2name[entry->type], idx,
-                                   phys_addr(entry), entry->pfn,
-                                   entry->dev_addr, entry->size,
-                                   dir2name[entry->direction],
-                                   maperr2str[entry->map_err_type]);
-                }
-                spin_unlock_irqrestore(&bucket->lock, flags);
-        }
-        return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(dump);
-
 static int __init dma_debug_fs_init(void)
 {
         struct dentry *dentry = debugfs_create_dir("dma-api", NULL);
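A note on the idiom behind this fix: the kernel's %pa printk specifier prints a phys_addr_t at its native width, but it takes a pointer to the value, which is why both call sites switch from passing cln to &cln. A minimal sketch (illustrative, not part of the series):

        phys_addr_t cln = to_cacheline_number(entry);

        pr_info("cln=%pa\n", &cln);     /* portable: printk dereferences the pointer */
        pr_info("cln=%llx\n", cln);     /* -Wformat warning where phys_addr_t is 32 bits */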
From b31507dcaf3566d8ec3bdbf9922d8d5748b08a0f Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven
Date: Wed, 29 Mar 2023 09:14:05 +0200
Subject: [PATCH 04/11] dma-debug: Use %pa to format phys_addr_t
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On 32-bit without LPAE:

    kernel/dma/debug.c: In function ‘debug_dma_dump_mappings’:
    kernel/dma/debug.c:537:7: warning: format ‘%llx’ expects argument of type ‘long long unsigned int’, but argument 9 has type ‘phys_addr_t’ {aka ‘unsigned int’} [-Wformat=]
    kernel/dma/debug.c: In function ‘dump_show’:
    kernel/dma/debug.c:568:59: warning: format ‘%llx’ expects argument of type ‘long long unsigned int’, but argument 11 has type ‘phys_addr_t’ {aka ‘unsigned int’} [-Wformat=]

Fixes: bd89d69a529fbef3 ("dma-debug: add cacheline to user/kernel space dump messages")
Reported-by: kernel test robot
Link: https://lore.kernel.org/r/202303160548.ReyuTsGD-lkp@intel.com
Reported-by: noreply@ellerman.id.au
Signed-off-by: Geert Uytterhoeven
Signed-off-by: Christoph Hellwig
---
 kernel/dma/debug.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 676142072d99..f190651bcadd 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -534,11 +534,11 @@ void debug_dma_dump_mappings(struct device *dev)
                         if (!dev || dev == entry->dev) {
                                 cln = to_cacheline_number(entry);
                                 dev_info(entry->dev,
-                                         "%s idx %d P=%llx N=%lx D=%llx L=%llx cln=%llx %s %s\n",
+                                         "%s idx %d P=%llx N=%lx D=%llx L=%llx cln=%pa %s %s\n",
                                          type2name[entry->type], idx,
                                          phys_addr(entry), entry->pfn,
                                          entry->dev_addr, entry->size,
-                                         cln, dir2name[entry->direction],
+                                         &cln, dir2name[entry->direction],
                                          maperr2str[entry->map_err_type]);
                         }
                 }
@@ -565,13 +565,13 @@ static int dump_show(struct seq_file *seq, void *v)
                 list_for_each_entry(entry, &bucket->list, list) {
                         cln = to_cacheline_number(entry);
                         seq_printf(seq,
-                                   "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx cln=%llx %s %s\n",
+                                   "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx cln=%pa %s %s\n",
                                    dev_driver_string(entry->dev),
                                    dev_name(entry->dev),
                                    type2name[entry->type], idx,
                                    phys_addr(entry), entry->pfn,
                                    entry->dev_addr, entry->size,
-                                   cln, dir2name[entry->direction],
+                                   &cln, dir2name[entry->direction],
                                    maperr2str[entry->map_err_type]);
                 }
                 spin_unlock_irqrestore(&bucket->lock, flags);

From fe4e5efa401fe15eab9259e358730145449e83a2 Mon Sep 17 00:00:00 2001
From: Jiaxun Yang
Date: Sat, 1 Apr 2023 10:15:29 +0100
Subject: [PATCH 05/11] dma-mapping: provide a fallback dma_default_coherent

dma_default_coherent is defined unconditionally in kernel/dma/mapping.c,
but is only declared in dma-map-ops.h when one of the non-coherent
options is enabled.

Guard the definition in mapping.c with the same non-coherent options and
provide a fallback definition for the coherent-only case.

Signed-off-by: Jiaxun Yang
Signed-off-by: Christoph Hellwig
---
 include/linux/dma-map-ops.h | 2 ++
 kernel/dma/mapping.c        | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 41bf4bdb117a..31f114f486c4 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -269,6 +269,8 @@ static inline bool dev_is_dma_coherent(struct device *dev)
         return dev->dma_coherent;
 }
 #else
+#define dma_default_coherent true
+
 static inline bool dev_is_dma_coherent(struct device *dev)
 {
         return true;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 68106e3791f6..80f9663ffe26 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -17,7 +17,11 @@
 #include "debug.h"
 #include "direct.h"
 
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
 bool dma_default_coherent;
+#endif
 
 /*
  * Managed DMA API

From 1d3f56b295302fdb4ac9caf6ce09f5ae7d2e651a Mon Sep 17 00:00:00 2001
From: Jiaxun Yang
Date: Sat, 1 Apr 2023 10:15:30 +0100
Subject: [PATCH 06/11] dma-mapping: provide CONFIG_ARCH_DMA_DEFAULT_COHERENT

Provide a Kconfig option that allows architectures to set the default
value of dma_default_coherent.

Signed-off-by: Jiaxun Yang
Signed-off-by: Christoph Hellwig
---
 kernel/dma/Kconfig   | 7 +++++++
 kernel/dma/mapping.c | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 56866aaa2ae1..6677d0e64d27 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -76,6 +76,13 @@ config ARCH_HAS_DMA_PREP_COHERENT
 config ARCH_HAS_FORCE_DMA_UNENCRYPTED
         bool
 
+#
+# Select this option if the architecture assumes DMA devices are coherent
+# by default.
+#
+config ARCH_DMA_DEFAULT_COHERENT
+        bool
+
 config SWIOTLB
         bool
         select NEED_DMA_MAP_STATE
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 80f9663ffe26..9a4db5cce600 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -20,7 +20,7 @@
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
     defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
     defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
-bool dma_default_coherent;
+bool dma_default_coherent = IS_ENABLED(CONFIG_ARCH_DMA_DEFAULT_COHERENT);
 #endif
 
 /*
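Taken together, the two patches make dma_default_coherent resolve as follows. This is a simplified consolidation of the hunks above, for illustration only:

        /* include/linux/dma-map-ops.h (simplified) */
        #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
            defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
            defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
        extern bool dma_default_coherent;   /* runtime variable; ARCH_DMA_DEFAULT_COHERENT
                                               only chooses its initial value */
        #else
        #define dma_default_coherent true   /* coherent-only arch: folds to a constant */
        #endif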
From c00a60d6f4a14b06264bb6f9fcc754b8ddbf67e3 Mon Sep 17 00:00:00 2001
From: Jiaxun Yang
Date: Sat, 1 Apr 2023 10:15:31 +0100
Subject: [PATCH 07/11] of: address: always use dma_default_coherent for
 default coherency

All architectures now have dma_default_coherent reflecting the default
DMA coherency of OF devices, so there is no need for a standalone
config option.

Signed-off-by: Jiaxun Yang
Reviewed-by: Rob Herring
Acked-by: Michael Ellerman (powerpc)
Signed-off-by: Christoph Hellwig
---
 arch/powerpc/Kconfig | 2 +-
 arch/riscv/Kconfig   | 2 +-
 drivers/of/Kconfig   | 4 ----
 drivers/of/address.c | 2 +-
 4 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a6c4407d3ec8..f4269f5109c7 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -113,6 +113,7 @@ config PPC
         #
         select ARCH_32BIT_OFF_T if PPC32
         select ARCH_DISABLE_KASAN_INLINE        if PPC_RADIX_MMU
+        select ARCH_DMA_DEFAULT_COHERENT        if !NOT_COHERENT_CACHE
         select ARCH_ENABLE_MEMORY_HOTPLUG
         select ARCH_ENABLE_MEMORY_HOTREMOVE
         select ARCH_HAS_COPY_MC                 if PPC64
@@ -272,7 +273,6 @@ config PPC
         select NEED_PER_CPU_PAGE_FIRST_CHUNK    if PPC64
         select NEED_SG_DMA_LENGTH
         select OF
-        select OF_DMA_DEFAULT_COHERENT          if !NOT_COHERENT_CACHE
         select OF_EARLY_FLATTREE
         select OLD_SIGACTION                    if PPC32
         select OLD_SIGSUSPEND
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index c5e42cc37604..30727eb63688 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -12,6 +12,7 @@ config 32BIT
 
 config RISCV
         def_bool y
+        select ARCH_DMA_DEFAULT_COHERENT
         select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
         select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
         select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
@@ -121,7 +122,6 @@ config RISCV
         select MODULES_USE_ELF_RELA if MODULES
         select MODULE_SECTIONS if MODULES
         select OF
-        select OF_DMA_DEFAULT_COHERENT
         select OF_EARLY_FLATTREE
         select OF_IRQ
         select PCI_DOMAINS_GENERIC if PCI
diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
index 644386833a7b..e40f10bf2ba4 100644
--- a/drivers/of/Kconfig
+++ b/drivers/of/Kconfig
@@ -102,8 +102,4 @@ config OF_OVERLAY
 config OF_NUMA
         bool
 
-config OF_DMA_DEFAULT_COHERENT
-        # arches should select this if DMA is coherent by default for OF devices
-        bool
-
 endif # OF
diff --git a/drivers/of/address.c b/drivers/of/address.c
index 4c0b169ef9bf..23ade4919853 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -1103,7 +1103,7 @@ phys_addr_t __init of_dma_get_max_cpu_address(struct device_node *np)
 bool of_dma_is_coherent(struct device_node *np)
 {
         struct device_node *node;
-        bool is_coherent = IS_ENABLED(CONFIG_OF_DMA_DEFAULT_COHERENT);
+        bool is_coherent = dma_default_coherent;
 
         node = of_node_get(np);
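For reference, a sketch of the resulting lookup (abridged from drivers/of/address.c; details of the walk may differ slightly between versions). The arch-wide default only applies when no node in the DMA parent chain carries an explicit "dma-coherent" property:

        bool of_dma_is_coherent(struct device_node *np)
        {
                struct device_node *node = of_node_get(np);
                bool is_coherent = dma_default_coherent;        /* arch-wide default */

                while (node) {
                        if (of_property_read_bool(node, "dma-coherent")) {
                                is_coherent = true;
                                break;
                        }
                        node = of_get_next_dma_parent(node);    /* walk toward the root */
                }
                of_node_put(node);
                return is_coherent;
        }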
From a90922fa25370902322e9de6640e58737d459a50 Mon Sep 17 00:00:00 2001
From: Doug Berger
Date: Fri, 14 Apr 2023 14:29:25 -0700
Subject: [PATCH 08/11] swiotlb: relocate PageHighMem test away from
 rmem_swiotlb_setup

The reservedmem_of_init_fn's are invoked very early at boot, before the
memory zones have even been defined. This makes it inappropriate to
test whether the page corresponding to a PFN is in ZONE_HIGHMEM from
within one.

Removing the check allows an ARM 32-bit kernel with SPARSEMEM enabled
to boot properly, since otherwise we would be dereferencing an
uninitialized sparsemem map to perform the pfn_to_page() check.

The arm64 architecture happens to work (and also has no high memory),
but other 32-bit architectures could have similar issues.

While it would be nice to provide early feedback about a reserved DMA
pool residing in highmem, it is not possible to do that until the first
time we try to use it, which is where the check is moved to.

Fixes: 0b84e4f8b793 ("swiotlb: Add restricted DMA pool initialization")
Signed-off-by: Doug Berger
Signed-off-by: Florian Fainelli
Signed-off-by: Christoph Hellwig
---
 kernel/dma/swiotlb.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 5b919ef832b6..1ec38701a7f5 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -998,6 +998,11 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
         /* Set Per-device io tlb area to one */
         unsigned int nareas = 1;
 
+        if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) {
+                dev_err(dev, "Restricted DMA pool must be accessible within the linear mapping.");
+                return -EINVAL;
+        }
+
         /*
          * Since multiple devices can share the same pool, the private data,
          * io_tlb_mem struct, will be initialized by the first device attached
@@ -1059,11 +1064,6 @@ static int __init rmem_swiotlb_setup(struct reserved_mem *rmem)
             of_get_flat_dt_prop(node, "no-map", NULL))
                 return -EINVAL;
 
-        if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) {
-                pr_err("Restricted DMA pool must be accessible within the linear mapping.");
-                return -EINVAL;
-        }
-
         rmem->ops = &rmem_swiotlb_ops;
         pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n",
                 &rmem->base, (unsigned long)rmem->size / SZ_1M);
From 5499d01c029069044a3b3e50501c77b474c96178 Mon Sep 17 00:00:00 2001
From: Michael Kelley
Date: Thu, 13 Apr 2023 08:37:30 -0700
Subject: [PATCH 09/11] swiotlb: fix debugfs reporting of reserved memory pools

For io_tlb_nslabs, the debugfs code reports the correct value for a
specific reserved memory pool. But for io_tlb_used, the value reported
is always for the default pool, not the specific reserved pool. Fix
this.

Fixes: 5c850d31880e ("swiotlb: fix passing local variable to debugfs_create_ulong()")
Signed-off-by: Michael Kelley
Signed-off-by: Christoph Hellwig
---
 kernel/dma/swiotlb.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 1ec38701a7f5..938c959ab19e 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -930,7 +930,9 @@ EXPORT_SYMBOL_GPL(is_swiotlb_active);
 
 static int io_tlb_used_get(void *data, u64 *val)
 {
-        *val = mem_used(&io_tlb_default_mem);
+        struct io_tlb_mem *mem = data;
+
+        *val = mem_used(mem);
         return 0;
 }
 DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_used, io_tlb_used_get, NULL, "%llu\n");
@@ -943,7 +945,7 @@ static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
                 return;
 
         debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
-        debugfs_create_file("io_tlb_used", 0400, mem->debugfs, NULL,
+        debugfs_create_file("io_tlb_used", 0400, mem->debugfs, mem,
                         &fops_io_tlb_used);
 }
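The mechanism behind this fix is the data cookie of debugfs_create_file(): DEFINE_DEBUGFS_ATTRIBUTE() wraps the getter so that the cookie comes back as its first argument, so each pool's file must be registered with its own io_tlb_mem rather than NULL. A minimal, self-contained sketch of the pattern with a hypothetical counter (my_counter, my_debugfs_init, and parent_dir are placeholders, not swiotlb code):

        #include <linux/debugfs.h>

        static u64 my_counter;          /* hypothetical per-instance state */

        static int my_counter_get(void *data, u64 *val)
        {
                *val = *(u64 *)data;    /* 'data' is the cookie registered below */
                return 0;
        }
        DEFINE_DEBUGFS_ATTRIBUTE(fops_my_counter, my_counter_get, NULL, "%llu\n");

        static void my_debugfs_init(struct dentry *parent_dir)
        {
                /* Passing &my_counter (not NULL) binds the file to its backing
                 * object, which is what the patch does with the io_tlb_mem. */
                debugfs_create_file("my_counter", 0400, parent_dir, &my_counter,
                                    &fops_my_counter);
        }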
From 8b0977ecc8b30a30966e76fcb64cef5041626b02 Mon Sep 17 00:00:00 2001
From: Michael Kelley
Date: Thu, 13 Apr 2023 10:57:37 -0700
Subject: [PATCH 10/11] swiotlb: track and report io_tlb_used high water marks
 in debugfs

swiotlb currently reports the total number of slabs and the
instantaneous in-use slabs in debugfs. But with increased usage of
swiotlb for all I/O in Confidential Computing (coco) VMs, it has become
difficult to know how much memory to allocate for swiotlb bounce
buffers, either via the automatic algorithm in the kernel or by
specifying a value on the kernel boot line. The current automatic
algorithm generously allocates swiotlb bounce buffer memory, and may be
wasting significant memory in many use cases.

To support better understanding of swiotlb usage, add tracking of the
high water mark for usage of the default swiotlb bounce buffer memory
pool and any reserved memory pools. Report these high water marks in
debugfs along with the other swiotlb pool metrics. Allow the high water
marks to be reset to zero at runtime by writing to them.

Signed-off-by: Michael Kelley
Signed-off-by: Christoph Hellwig
---
 include/linux/swiotlb.h |  7 +++++
 kernel/dma/swiotlb.c    | 66 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index bcef10e20ea4..6dc4598d2260 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -87,6 +87,11 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
  * @for_alloc:  %true if the pool is used for memory allocation
  * @nareas:  The area number in the pool.
  * @area_nslabs: The slot number in the area.
+ * @total_used: The total number of slots in the pool that are currently used
+ *              across all areas. Used only for calculating used_hiwater in
+ *              debugfs.
+ * @used_hiwater: The high water mark for total_used.  Used only for reporting
+ *              in debugfs.
  */
 struct io_tlb_mem {
         phys_addr_t start;
@@ -102,6 +107,8 @@ struct io_tlb_mem {
         unsigned int area_nslabs;
         struct io_tlb_area *areas;
         struct io_tlb_slot *slots;
+        atomic_long_t total_used;
+        atomic_long_t used_hiwater;
 };
 
 extern struct io_tlb_mem io_tlb_default_mem;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 938c959ab19e..9bbc2802a444 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -608,6 +608,40 @@ static unsigned int wrap_area_index(struct io_tlb_mem *mem, unsigned int index)
         return index;
 }
 
+/*
+ * Track the total used slots with a global atomic value in order to have
+ * correct information to determine the high water mark. The mem_used()
+ * function gives imprecise results because there's no locking across
+ * multiple areas.
+ */
+#ifdef CONFIG_DEBUG_FS
+static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots)
+{
+        unsigned long old_hiwater, new_used;
+
+        new_used = atomic_long_add_return(nslots, &mem->total_used);
+        old_hiwater = atomic_long_read(&mem->used_hiwater);
+        do {
+                if (new_used <= old_hiwater)
+                        break;
+        } while (!atomic_long_try_cmpxchg(&mem->used_hiwater,
+                                          &old_hiwater, new_used));
+}
+
+static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+        atomic_long_sub(nslots, &mem->total_used);
+}
+
+#else /* !CONFIG_DEBUG_FS */
+static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+
 /*
  * Find a suitable number of IO TLB entries size that will fit this request and
  * allocate a buffer from that IO TLB pool.
@@ -702,6 +736,8 @@ found:
         area->index = wrap_area_index(mem, index + nslots);
         area->used += nslots;
         spin_unlock_irqrestore(&area->lock, flags);
+
+        inc_used_and_hiwater(mem, nslots);
         return slot_index;
 }
 
@@ -834,6 +870,8 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
         mem->slots[i].list = ++count;
         area->used -= nslots;
         spin_unlock_irqrestore(&area->lock, flags);
+
+        dec_used(mem, nslots);
 }
 
 /*
@@ -935,11 +973,37 @@ static int io_tlb_used_get(void *data, u64 *val)
         *val = mem_used(mem);
         return 0;
 }
+
+static int io_tlb_hiwater_get(void *data, u64 *val)
+{
+        struct io_tlb_mem *mem = data;
+
+        *val = atomic_long_read(&mem->used_hiwater);
+        return 0;
+}
+
+static int io_tlb_hiwater_set(void *data, u64 val)
+{
+        struct io_tlb_mem *mem = data;
+
+        /* Only allow setting to zero */
+        if (val != 0)
+                return -EINVAL;
+
+        atomic_long_set(&mem->used_hiwater, val);
+        return 0;
+}
+
 DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_used, io_tlb_used_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_hiwater, io_tlb_hiwater_get,
+                         io_tlb_hiwater_set, "%llu\n");
 
 static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
                                          const char *dirname)
 {
+        atomic_long_set(&mem->total_used, 0);
+        atomic_long_set(&mem->used_hiwater, 0);
+
         mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs);
         if (!mem->nslabs)
                 return;
@@ -947,6 +1011,8 @@ static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
         debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
         debugfs_create_file("io_tlb_used", 0400, mem->debugfs, mem,
                         &fops_io_tlb_used);
+        debugfs_create_file("io_tlb_used_hiwater", 0600, mem->debugfs, mem,
+                        &fops_io_tlb_hiwater);
 }
 
 static int __init __maybe_unused swiotlb_create_default_debugfs(void)
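The update loop is the subtle part: atomic_long_try_cmpxchg() rewrites old_hiwater with the current value on failure, so each iteration either publishes a new maximum or learns that a racing CPU already raised the mark past new_used. A user-space C11 analogue of the same logic (illustrative only; the names mirror the kernel code):

        #include <stdatomic.h>

        static atomic_long total_used, used_hiwater;

        static void inc_used_and_hiwater(long nslots)
        {
                long new_used = atomic_fetch_add(&total_used, nslots) + nslots;
                long old = atomic_load(&used_hiwater);

                /* A failed CAS reloads 'old'; stop once the mark is >= new_used. */
                while (new_used > old &&
                       !atomic_compare_exchange_weak(&used_hiwater, &old, new_used))
                        ;
        }

With debugfs mounted in the usual place, the new counter appears as /sys/kernel/debug/swiotlb/io_tlb_used_hiwater; writing 0 to it resets the mark, and any other value is rejected with -EINVAL, as the setter above enforces.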
From ec274aff21b6a94c7973384ca80a503c1bc3b173 Mon Sep 17 00:00:00 2001
From: Petr Tesarik
Date: Thu, 20 Apr 2023 11:58:58 +0200
Subject: [PATCH 11/11] swiotlb: Omit total_used and used_hiwater if
 !CONFIG_DEBUG_FS

The tracking of used_hiwater adds an atomic operation to the hot path.
This is acceptable only when debugging the kernel. To make sure that
the fields can never be used by mistake, do not even include them in
struct io_tlb_mem if CONFIG_DEBUG_FS is not set.

The build fails after doing that. To fix it, it is necessary to remove
all code specific to debugfs and instead provide a stub implementation
of swiotlb_create_debugfs_files(). As a bonus, this change allows
removing one __maybe_unused attribute.

Signed-off-by: Petr Tesarik
Signed-off-by: Christoph Hellwig
---
 include/linux/swiotlb.h |  2 ++
 kernel/dma/swiotlb.c    | 15 ++++++++++++---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 6dc4598d2260..44767844e12b 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -107,8 +107,10 @@ struct io_tlb_mem {
         unsigned int area_nslabs;
         struct io_tlb_area *areas;
         struct io_tlb_slot *slots;
+#ifdef CONFIG_DEBUG_FS
         atomic_long_t total_used;
         atomic_long_t used_hiwater;
+#endif
 };
 
 extern struct io_tlb_mem io_tlb_default_mem;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 9bbc2802a444..e67dafc255f2 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -966,6 +966,8 @@ bool is_swiotlb_active(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(is_swiotlb_active);
 
+#ifdef CONFIG_DEBUG_FS
+
 static int io_tlb_used_get(void *data, u64 *val)
 {
         struct io_tlb_mem *mem = data;
@@ -1015,15 +1017,22 @@ static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
                         &fops_io_tlb_hiwater);
 }
 
-static int __init __maybe_unused swiotlb_create_default_debugfs(void)
+static int __init swiotlb_create_default_debugfs(void)
 {
         swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb");
         return 0;
 }
 
-#ifdef CONFIG_DEBUG_FS
 late_initcall(swiotlb_create_default_debugfs);
-#endif
+
+#else  /* !CONFIG_DEBUG_FS */
+
+static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
+                                                const char *dirname)
+{
+}
+
+#endif  /* CONFIG_DEBUG_FS */
 
 #ifdef CONFIG_DMA_RESTRICTED_POOL
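The restructuring follows the common kernel idiom for optional subsystems: compile either one real implementation or one empty stub under a single #ifdef, so that callers such as the reserved-pool init path can call swiotlb_create_debugfs_files() without conditionals of their own. Schematically (hypothetical 'foo' names, not swiotlb code):

        #ifdef CONFIG_DEBUG_FS
        static void foo_create_debugfs_files(struct foo *f)
        {
                /* real implementation */
        }
        #else
        static inline void foo_create_debugfs_files(struct foo *f)
        {
                /* stub: compiles to nothing, callers need no #ifdef */
        }
        #endif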