From 51ae3f4ac5e94334ac6078145a02bc8f30e8528b Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 9 Jan 2024 17:22:33 -0800 Subject: [PATCH 001/435] mm/cma: fix placement of trace_cma_alloc_start/finish The current placement of trace_cma_alloc_start/finish misses the fail cases: !cma || !cma->count || !cma->bitmap. trace_cma_alloc_finish is also not emitted for the failure case where bitmap_count > bitmap_maxno. Fix these missed cases by moving the start event before the failure checks and moving the finish event to the out label. Link: https://lkml.kernel.org/r/20240110012234.3793639-1-kaleshsingh@google.com Fixes: 7bc1aec5e287 ("mm: cma: add trace events for CMA alloc perf testing") Signed-off-by: Kalesh Singh Cc: Minchan Kim Cc: Liam Mark Signed-off-by: Andrew Morton --- mm/cma.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index 7c09c47e530b..e12cf41d8354 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -436,6 +436,9 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned long i; struct page *page = NULL; int ret = -ENOMEM; + const char *name = cma ? cma->name : NULL; + + trace_cma_alloc_start(name, count, align); if (!cma || !cma->count || !cma->bitmap) goto out; @@ -446,8 +449,6 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, if (!count) goto out; - trace_cma_alloc_start(cma->name, count, align); - mask = cma_bitmap_aligned_mask(cma, align); offset = cma_bitmap_aligned_offset(cma, align); bitmap_maxno = cma_bitmap_maxno(cma); @@ -496,8 +497,6 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, start = bitmap_no + mask + 1; } - trace_cma_alloc_finish(cma->name, pfn, page, count, align, ret); - /* * CMA can allocate multiple page blocks, which results in different * blocks being marked with different tags. Reset the tags to ignore @@ -516,6 +515,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, pr_debug("%s(): returned %p\n", __func__, page); out: + trace_cma_alloc_finish(name, pfn, page, count, align, ret); if (page) { count_vm_event(CMA_ALLOC_SUCCESS); cma_sysfs_account_success_pages(cma, count); From e755c43eb4a33a29f92bca6df30e5a558845c2f7 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 9 Jan 2024 14:31:19 -0800 Subject: [PATCH 002/435] maple_tree: fix comment describing mas_node_count_gfp() The function description comment for mas_node_count_gfp() mistakingly refers to the function as mas_node_count(). Change it to refer to the correct function. Link: https://lkml.kernel.org/r/20240109223119.162357-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Peng Zhang Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 6f241bb38799..7b161802860b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1307,8 +1307,8 @@ static inline void mas_free(struct ma_state *mas, struct maple_enode *used) } /* - * mas_node_count() - Check if enough nodes are allocated and request more if - * there is not enough nodes. + * mas_node_count_gfp() - Check if enough nodes are allocated and request more + * if there is not enough nodes. * @mas: The maple state * @count: The number of nodes needed * @gfp: the gfp flags From c5f1e2d1890935a734c302b9b8579748222b8e1e Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 8 Jan 2024 14:27:43 +0100 Subject: [PATCH 003/435] mm/memory_hotplug: introduce MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers Patch series "implement "memmap on memory" feature on s390". This series provides "memmap on memory" support on s390 platform. "memmap on memory" allows struct pages array to be allocated from the hotplugged memory range instead of allocating it from main system memory. s390 currently preallocates struct pages array for all potentially possible memory, which ensures memory onlining always succeeds, but with the cost of significant memory consumption from the available system memory during boottime. In certain extreme configuration, this could lead to ipl failure. "memmap on memory" ensures struct pages array are populated from self contained hotplugged memory range instead of depleting the available system memory and this could eliminate ipl failure on s390 platform. On other platforms, system might go OOM when the physically hotplugged memory depletes the available memory before it is onlined. Hence, "memmap on memory" feature was introduced as described in commit a08a2ae34613 ("mm,memory_hotplug: allocate memmap from the added memory range"). Unlike other architectures, s390 memory blocks are not physically accessible until it is online. To make it physically accessible two new memory notifiers MEM_PREPARE_ONLINE / MEM_FINISH_OFFLINE are added and this notifier lets the hypervisor inform that the memory should be made physically accessible. This allows for "memmap on memory" initialization during memory hotplug onlining phase, which is performed before calling MEM_GOING_ONLINE notifier. Patch 1 introduces MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers to prepare the transition of memory to and from a physically accessible state. New mhp_flag MHP_OFFLINE_INACCESSIBLE is introduced to ensure altmap cannot be written when adding memory - before it is set online. This enhancement is crucial for implementing the "memmap on memory" feature for s390 in a subsequent patch. Patches 2 allocates vmemmap pages from self-contained memory range for s390. It allocates memory map (struct pages array) from the hotplugged memory range, rather than using system memory by passing altmap to vmemmap functions. Patch 3 removes unhandled memory notifier types on s390. Patch 4 implements MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers on s390. MEM_PREPARE_ONLINE memory notifier makes memory block physical accessible via sclp assign command. The notifier ensures self-contained memory maps are accessible and hence enabling the "memmap on memory" on s390. MEM_FINISH_OFFLINE memory notifier shifts the memory block to an inaccessible state via sclp unassign command. Patch 5 finally enables MHP_MEMMAP_ON_MEMORY on s390. This patch (of 5): Introduce MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers to prepare the transition of memory to and from a physically accessible state. This enhancement is crucial for implementing the "memmap on memory" feature for s390 in a subsequent patch. Platforms such as x86 can support physical memory hotplug via ACPI. When there is physical memory hotplug, ACPI event leads to the memory addition with the following callchain: acpi_memory_device_add() -> acpi_memory_enable_device() -> __add_memory() After this, the hotplugged memory is physically accessible, and altmap support prepared, before the "memmap on memory" initialization in memory_block_online() is called. On s390, memory hotplug works in a different way. The available hotplug memory has to be defined upfront in the hypervisor, but it is made physically accessible only when the user sets it online via sysfs, currently in the MEM_GOING_ONLINE notifier. This is too late and "memmap on memory" initialization is performed before calling MEM_GOING_ONLINE notifier. During the memory hotplug addition phase, altmap support is prepared and during the memory onlining phase s390 requires memory to be physically accessible and then subsequently initiate the "memmap on memory" initialization process. The memory provider will handle new MEM_PREPARE_ONLINE / MEM_FINISH_OFFLINE notifications and make the memory accessible. The mhp_flag MHP_OFFLINE_INACCESSIBLE is introduced and is relevant when used along with MHP_MEMMAP_ON_MEMORY, because the altmap cannot be written (e.g., poisoned) when adding memory -- before it is set online. This allows for adding memory with an altmap that is not currently made available by a hypervisor. When onlining that memory, the hypervisor can be instructed to make that memory accessible via the new notifiers and the onlining phase will not require any memory allocations, which is helpful in low-memory situations. All architectures ignore unknown memory notifiers. Therefore, the introduction of these new notifiers does not result in any functional modifications across architectures. Link: https://lkml.kernel.org/r/20240108132747.3238763-1-sumanthk@linux.ibm.com Link: https://lkml.kernel.org/r/20240108132747.3238763-2-sumanthk@linux.ibm.com Signed-off-by: Sumanth Korikkar Suggested-by: Gerald Schaefer Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- drivers/base/memory.c | 23 ++++++++++++++++++++++- include/linux/memory.h | 9 +++++++++ include/linux/memory_hotplug.h | 18 +++++++++++++++++- include/linux/memremap.h | 1 + mm/memory_hotplug.c | 17 ++++++++++++++--- mm/sparse.c | 3 ++- 6 files changed, 65 insertions(+), 6 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 14f964a7719b..c0436f46cfb7 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -188,6 +188,7 @@ static int memory_block_online(struct memory_block *mem) unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; unsigned long nr_vmemmap_pages = 0; + struct memory_notify arg; struct zone *zone; int ret; @@ -207,9 +208,19 @@ static int memory_block_online(struct memory_block *mem) if (mem->altmap) nr_vmemmap_pages = mem->altmap->free; + arg.altmap_start_pfn = start_pfn; + arg.altmap_nr_pages = nr_vmemmap_pages; + arg.start_pfn = start_pfn + nr_vmemmap_pages; + arg.nr_pages = nr_pages - nr_vmemmap_pages; mem_hotplug_begin(); + ret = memory_notify(MEM_PREPARE_ONLINE, &arg); + ret = notifier_to_errno(ret); + if (ret) + goto out_notifier; + if (nr_vmemmap_pages) { - ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone); + ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, + zone, mem->altmap->inaccessible); if (ret) goto out; } @@ -231,7 +242,11 @@ static int memory_block_online(struct memory_block *mem) nr_vmemmap_pages); mem->zone = zone; + mem_hotplug_done(); + return ret; out: + memory_notify(MEM_FINISH_OFFLINE, &arg); +out_notifier: mem_hotplug_done(); return ret; } @@ -244,6 +259,7 @@ static int memory_block_offline(struct memory_block *mem) unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; unsigned long nr_vmemmap_pages = 0; + struct memory_notify arg; int ret; if (!mem->zone) @@ -275,6 +291,11 @@ static int memory_block_offline(struct memory_block *mem) mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); mem->zone = NULL; + arg.altmap_start_pfn = start_pfn; + arg.altmap_nr_pages = nr_vmemmap_pages; + arg.start_pfn = start_pfn + nr_vmemmap_pages; + arg.nr_pages = nr_pages - nr_vmemmap_pages; + memory_notify(MEM_FINISH_OFFLINE, &arg); out: mem_hotplug_done(); return ret; diff --git a/include/linux/memory.h b/include/linux/memory.h index f53cfdaaaa41..939a16bd5cea 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -96,8 +96,17 @@ int set_memory_block_size_order(unsigned int order); #define MEM_GOING_ONLINE (1<<3) #define MEM_CANCEL_ONLINE (1<<4) #define MEM_CANCEL_OFFLINE (1<<5) +#define MEM_PREPARE_ONLINE (1<<6) +#define MEM_FINISH_OFFLINE (1<<7) struct memory_notify { + /* + * The altmap_start_pfn and altmap_nr_pages fields are designated for + * specifying the altmap range and are exclusively intended for use in + * MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. + */ + unsigned long altmap_start_pfn; + unsigned long altmap_nr_pages; unsigned long start_pfn; unsigned long nr_pages; int status_change_nid_normal; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7d2076583494..ee00015575aa 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -106,6 +106,22 @@ typedef int __bitwise mhp_t; * implies the node id (nid). */ #define MHP_NID_IS_MGID ((__force mhp_t)BIT(2)) +/* + * The hotplugged memory is completely inaccessible while the memory is + * offline. The memory provider will handle MEM_PREPARE_ONLINE / + * MEM_FINISH_OFFLINE notifications and make the memory accessible. + * + * This flag is only relevant when used along with MHP_MEMMAP_ON_MEMORY, + * because the altmap cannot be written (e.g., poisoned) when adding + * memory -- before it is set online. + * + * This allows for adding memory with an altmap that is not currently + * made available by a hypervisor. When onlining that memory, the + * hypervisor can be instructed to make that memory available, and + * the onlining phase will not require any memory allocations, which is + * helpful in low-memory situations. + */ +#define MHP_OFFLINE_INACCESSIBLE ((__force mhp_t)BIT(3)) /* * Extended parameters for memory hotplug: @@ -154,7 +170,7 @@ extern void adjust_present_page_count(struct page *page, long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone); + struct zone *zone, bool mhp_off_inaccessible); extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 744c830f4b13..9837f3e6fb95 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -25,6 +25,7 @@ struct vmem_altmap { unsigned long free; unsigned long align; unsigned long alloc; + bool inaccessible; }; /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 21890994c1d3..707027f69150 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1087,7 +1087,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group, } int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone) + struct zone *zone, bool mhp_off_inaccessible) { unsigned long end_pfn = pfn + nr_pages; int ret, i; @@ -1096,6 +1096,15 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, if (ret) return ret; + /* + * Memory block is accessible at this stage and hence poison the struct + * pages now. If the memory block is accessible during memory hotplug + * addition phase, then page poisining is already performed in + * sparse_add_section(). + */ + if (mhp_off_inaccessible) + page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); for (i = 0; i < nr_pages; i++) @@ -1415,7 +1424,7 @@ static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size) } static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, - u64 start, u64 size) + u64 start, u64 size, mhp_t mhp_flags) { unsigned long memblock_size = memory_block_size_bytes(); u64 cur_start; @@ -1431,6 +1440,8 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, }; mhp_altmap.free = memory_block_memmap_on_memory_pages(); + if (mhp_flags & MHP_OFFLINE_INACCESSIBLE) + mhp_altmap.inaccessible = true; params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap), GFP_KERNEL); if (!params.altmap) { @@ -1516,7 +1527,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) */ if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) && mhp_supports_memmap_on_memory(memory_block_size_bytes())) { - ret = create_altmaps_and_memory_blocks(nid, group, start, size); + ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags); if (ret) goto error; } else { diff --git a/mm/sparse.c b/mm/sparse.c index 338cf946dee8..aed0951b87fa 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -908,7 +908,8 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, * Poison uninitialized struct pages in order to catch invalid flags * combinations. */ - page_init_poison(memmap, sizeof(struct page) * nr_pages); + if (!altmap || !altmap->inaccessible) + page_init_poison(memmap, sizeof(struct page) * nr_pages); ms = __nr_to_section(section_nr); set_section_nid(section_nr, nid); From 1a65b73ae9abd608575e9bfbaa003d829d39fab9 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 8 Jan 2024 14:27:44 +0100 Subject: [PATCH 004/435] s390/mm: allocate vmemmap pages from self-contained memory range Allocate memory map (struct pages array) from the hotplugged memory range, rather than using system memory. The change addresses the issue where standby memory, when configured to be much larger than online memory, could potentially lead to ipl failure due to memory map allocation from online memory. For example, 16MB of memory map allocation is needed for a memory block size of 1GB and when standby memory is configured much larger than online memory, this could lead to ipl failure. To address this issue, the solution involves introducing "memmap on memory" using the vmem_altmap structure on s390. Architectures that want to implement it should pass the altmap to the vmemmap_populate() function and its associated callchain. This enhancement is discussed in commit 4b94ffdc4163 ("x86, mm: introduce vmem_altmap to augment vmemmap_populate()") Provide "memmap on memory" support for s390 by passing the altmap in vmemmap_populate() and its callchain. The allocation path is described as follows: * When altmap is NULL in vmemmap_populate(), memory map allocation occurs using the existing vmemmap_alloc_block_buf(). * When altmap is not NULL in vmemmap_populate(), memory map allocation still uses vmemmap_alloc_block_buf(), but this function internally calls altmap_alloc_block_buf(). For deallocation, the process is outlined as follows: * When altmap is NULL in vmemmap_free(), memory map deallocation happens through free_pages(). * When altmap is not NULL in vmemmap_free(), memory map deallocation occurs via vmem_altmap_free(). While memory map allocation is primarily handled through the self-contained memory map range, there might still be a small amount of system memory allocation required for vmemmap pagetables. To mitigate this impact, this feature will be limited to machines with EDAT1 support. Link: https://lkml.kernel.org/r/20240108132747.3238763-3-sumanthk@linux.ibm.com Reviewed-by: Gerald Schaefer Signed-off-by: Sumanth Korikkar Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/s390/mm/init.c | 3 --- arch/s390/mm/vmem.c | 62 +++++++++++++++++++++++++-------------------- 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 43e612bc2bcd..8d9a60ccb777 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -281,9 +281,6 @@ int arch_add_memory(int nid, u64 start, u64 size, unsigned long size_pages = PFN_DOWN(size); int rc; - if (WARN_ON_ONCE(params->altmap)) - return -EINVAL; - if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot)) return -EINVAL; diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 186a020857cf..eb100479f7be 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -33,8 +33,12 @@ static void __ref *vmem_alloc_pages(unsigned int order) return memblock_alloc(size, size); } -static void vmem_free_pages(unsigned long addr, int order) +static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap) { + if (altmap) { + vmem_altmap_free(altmap, 1 << order); + return; + } /* We don't expect boot memory to be removed ever. */ if (!slab_is_available() || WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr)))) @@ -156,7 +160,8 @@ static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end) /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, - unsigned long end, bool add, bool direct) + unsigned long end, bool add, bool direct, + struct vmem_altmap *altmap) { unsigned long prot, pages = 0; int ret = -ENOMEM; @@ -172,11 +177,11 @@ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, if (pte_none(*pte)) continue; if (!direct) - vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0); + vmem_free_pages((unsigned long)pfn_to_virt(pte_pfn(*pte)), get_order(PAGE_SIZE), altmap); pte_clear(&init_mm, addr, pte); } else if (pte_none(*pte)) { if (!direct) { - void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + void *new_page = vmemmap_alloc_block_buf(PAGE_SIZE, NUMA_NO_NODE, altmap); if (!new_page) goto out; @@ -213,7 +218,8 @@ static void try_free_pte_table(pmd_t *pmd, unsigned long start) /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, - unsigned long end, bool add, bool direct) + unsigned long end, bool add, bool direct, + struct vmem_altmap *altmap) { unsigned long next, prot, pages = 0; int ret = -ENOMEM; @@ -234,11 +240,11 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE)) { if (!direct) - vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); pmd_clear(pmd); pages++; } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) { - vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); pmd_clear(pmd); } continue; @@ -261,7 +267,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, * page tables since vmemmap_populate gets * called for each section separately. */ - new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE); + new_page = vmemmap_alloc_block_buf(PMD_SIZE, NUMA_NO_NODE, altmap); if (new_page) { set_pmd(pmd, __pmd(__pa(new_page) | prot)); if (!IS_ALIGNED(addr, PMD_SIZE) || @@ -280,7 +286,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, vmemmap_use_sub_pmd(addr, next); continue; } - ret = modify_pte_table(pmd, addr, next, add, direct); + ret = modify_pte_table(pmd, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -302,12 +308,12 @@ static void try_free_pmd_table(pud_t *pud, unsigned long start) for (i = 0; i < PTRS_PER_PMD; i++, pmd++) if (!pmd_none(*pmd)) return; - vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER); + vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER, NULL); pud_clear(pud); } static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, - bool add, bool direct) + bool add, bool direct, struct vmem_altmap *altmap) { unsigned long next, prot, pages = 0; int ret = -ENOMEM; @@ -347,7 +353,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, } else if (pud_large(*pud)) { continue; } - ret = modify_pmd_table(pud, addr, next, add, direct); + ret = modify_pmd_table(pud, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -370,12 +376,12 @@ static void try_free_pud_table(p4d_t *p4d, unsigned long start) if (!pud_none(*pud)) return; } - vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER); + vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER, NULL); p4d_clear(p4d); } static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, - bool add, bool direct) + bool add, bool direct, struct vmem_altmap *altmap) { unsigned long next; int ret = -ENOMEM; @@ -394,7 +400,7 @@ static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, goto out; p4d_populate(&init_mm, p4d, pud); } - ret = modify_pud_table(p4d, addr, next, add, direct); + ret = modify_pud_table(p4d, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -415,12 +421,12 @@ static void try_free_p4d_table(pgd_t *pgd, unsigned long start) if (!p4d_none(*p4d)) return; } - vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER); + vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER, NULL); pgd_clear(pgd); } static int modify_pagetable(unsigned long start, unsigned long end, bool add, - bool direct) + bool direct, struct vmem_altmap *altmap) { unsigned long addr, next; int ret = -ENOMEM; @@ -445,7 +451,7 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, goto out; pgd_populate(&init_mm, pgd, p4d); } - ret = modify_p4d_table(pgd, addr, next, add, direct); + ret = modify_p4d_table(pgd, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -458,14 +464,16 @@ out: return ret; } -static int add_pagetable(unsigned long start, unsigned long end, bool direct) +static int add_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { - return modify_pagetable(start, end, true, direct); + return modify_pagetable(start, end, true, direct, altmap); } -static int remove_pagetable(unsigned long start, unsigned long end, bool direct) +static int remove_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { - return modify_pagetable(start, end, false, direct); + return modify_pagetable(start, end, false, direct, altmap); } /* @@ -474,7 +482,7 @@ static int remove_pagetable(unsigned long start, unsigned long end, bool direct) static int vmem_add_range(unsigned long start, unsigned long size) { start = (unsigned long)__va(start); - return add_pagetable(start, start + size, true); + return add_pagetable(start, start + size, true, NULL); } /* @@ -483,7 +491,7 @@ static int vmem_add_range(unsigned long start, unsigned long size) static void vmem_remove_range(unsigned long start, unsigned long size) { start = (unsigned long)__va(start); - remove_pagetable(start, start + size, true); + remove_pagetable(start, start + size, true, NULL); } /* @@ -496,9 +504,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, mutex_lock(&vmem_mutex); /* We don't care about the node, just use NUMA_NO_NODE on allocations */ - ret = add_pagetable(start, end, false); + ret = add_pagetable(start, end, false, altmap); if (ret) - remove_pagetable(start, end, false); + remove_pagetable(start, end, false, altmap); mutex_unlock(&vmem_mutex); return ret; } @@ -509,7 +517,7 @@ void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) { mutex_lock(&vmem_mutex); - remove_pagetable(start, end, false); + remove_pagetable(start, end, false, altmap); mutex_unlock(&vmem_mutex); } From fb6d5eb9f4c9620acbd8a78dafff0164c1b8824e Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 8 Jan 2024 14:27:45 +0100 Subject: [PATCH 005/435] s390/sclp: remove unhandled memory notifier type Remove memory notifier types which are unhandled by s390. Unhandled memory notifier types are covered by default case. Link: https://lkml.kernel.org/r/20240108132747.3238763-4-sumanthk@linux.ibm.com Suggested-by: Alexander Gordeev Reviewed-by: David Hildenbrand Signed-off-by: Sumanth Korikkar Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- drivers/s390/char/sclp_cmd.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index 11c428f4c7cf..355e63e44e95 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -340,9 +340,6 @@ static int sclp_mem_notifier(struct notifier_block *nb, if (contains_standby_increment(start, start + size)) rc = -EPERM; break; - case MEM_ONLINE: - case MEM_CANCEL_OFFLINE: - break; case MEM_GOING_ONLINE: rc = sclp_mem_change_state(start, size, 1); break; From 890a4212de718fcbb4ba4ee36569af31908d9dc5 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 8 Jan 2024 14:27:46 +0100 Subject: [PATCH 006/435] s390/mm: implement MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers MEM_PREPARE_ONLINE memory notifier makes memory block physical accessible via sclp assign command. The notifier ensures self-contained memory maps are accessible and hence enabling the "memmap on memory" on s390. MEM_FINISH_OFFLINE memory notifier shifts the memory block to an inaccessible state via sclp unassign command. Implementation considerations: * When MHP_MEMMAP_ON_MEMORY is disabled, the system retains the old behavior. This means the memory map is allocated from default memory. * If MACHINE_HAS_EDAT1 is unavailable, MHP_MEMMAP_ON_MEMORY is automatically disabled. This ensures that vmemmap pagetables do not consume additional memory from the default memory allocator. * The MEM_GOING_ONLINE notifier has been modified to perform no operation, as MEM_PREPARE_ONLINE already executes the sclp assign command. * The MEM_CANCEL_ONLINE/MEM_OFFLINE notifier now performs no operation, as MEM_FINISH_OFFLINE already executes the sclp unassign command. Link: https://lkml.kernel.org/r/20240108132747.3238763-5-sumanthk@linux.ibm.com Reviewed-by: Gerald Schaefer Reviewed-by: David Hildenbrand Signed-off-by: Sumanth Korikkar Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- drivers/s390/char/sclp_cmd.c | 41 ++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index 355e63e44e95..7815e9bea69a 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,7 @@ #include #include #include +#include #include "sclp.h" @@ -340,13 +342,38 @@ static int sclp_mem_notifier(struct notifier_block *nb, if (contains_standby_increment(start, start + size)) rc = -EPERM; break; - case MEM_GOING_ONLINE: + case MEM_PREPARE_ONLINE: + /* + * Access the altmap_start_pfn and altmap_nr_pages fields + * within the struct memory_notify specifically when dealing + * with only MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. + * + * When altmap is in use, take the specified memory range + * online, which includes the altmap. + */ + if (arg->altmap_nr_pages) { + start = PFN_PHYS(arg->altmap_start_pfn); + size += PFN_PHYS(arg->altmap_nr_pages); + } rc = sclp_mem_change_state(start, size, 1); + if (rc || !arg->altmap_nr_pages) + break; + /* + * Set CMMA state to nodat here, since the struct page memory + * at the beginning of the memory block will not go through the + * buddy allocator later. + */ + __arch_set_page_nodat((void *)__va(start), arg->altmap_nr_pages); break; - case MEM_CANCEL_ONLINE: - sclp_mem_change_state(start, size, 0); - break; - case MEM_OFFLINE: + case MEM_FINISH_OFFLINE: + /* + * When altmap is in use, take the specified memory range + * offline, which includes the altmap. + */ + if (arg->altmap_nr_pages) { + start = PFN_PHYS(arg->altmap_start_pfn); + size += PFN_PHYS(arg->altmap_nr_pages); + } sclp_mem_change_state(start, size, 0); break; default: @@ -397,7 +424,9 @@ static void __init add_memory_merged(u16 rn) if (!size) goto skip_add; for (addr = start; addr < start + size; addr += block_size) - add_memory(0, addr, block_size, MHP_NONE); + add_memory(0, addr, block_size, + MACHINE_HAS_EDAT1 ? + MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE); skip_add: first_rn = rn; num = 1; From 9eda317c15ff13be621d219a5de365d1550f41f7 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 8 Jan 2024 14:27:47 +0100 Subject: [PATCH 007/435] s390: enable MHP_MEMMAP_ON_MEMORY Enable MHP_MEMMAP_ON_MEMORY to support "memmap on memory". memory_hotplug.memmap_on_memory=true kernel parameter should be set in kernel boot option to enable the feature. Link: https://lkml.kernel.org/r/20240108132747.3238763-6-sumanthk@linux.ibm.com Reviewed-by: Gerald Schaefer Signed-off-by: Sumanth Korikkar Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/s390/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index fe565f3a3a91..a1d6dcbc8965 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -113,6 +113,7 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_BH select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC From 6212eb4d7a636bdfe0c11c84aa90db3fb5e6a0ff Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Mon, 8 Jan 2024 12:48:15 +0800 Subject: [PATCH 008/435] mm/filemap: avoid type conversion The return type of function folio_test_hugetlb is bool type, there is no need to assign it to an integer type. Link: https://lkml.kernel.org/r/20240108044815.3291487-1-lihongbo22@huawei.com Signed-off-by: Hongbo Li Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 750e779c23db..0d7e20edf46f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -843,7 +843,7 @@ noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { XA_STATE(xas, &mapping->i_pages, index); - int huge = folio_test_hugetlb(folio); + bool huge = folio_test_hugetlb(folio); bool charged = false; long nr = 1; From 3956570ef7776bf19f7dfdf765bbf80fbd244ab9 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Fri, 5 Jan 2024 12:24:01 -0800 Subject: [PATCH 009/435] selftests/mm/ksm_functional: prevent unmapping undefined address Replace some goto statements with return statements so that unmap() is not called on an undefined address. This change is made so that unmap() can only be reached after mmap() is called (and the address mentioned is defined). Returning MAP_FAILED seems acceptable since client code checks for this value. Link: https://lkml.kernel.org/r/20240105202401.28851-1-inwardvessel@gmail.com Fixes: 42096aa24b82 ("selftest/mm: ksm_functional_tests: test in mmap_and_merge_range() if anything got merged") Signed-off-by: JP Kobryn Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/ksm_functional_tests.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index fbff0dd09191..d615767e396b 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -155,12 +155,12 @@ static char *mmap_and_merge_range(char val, unsigned long size, int prot, /* Stabilize accounting by disabling KSM completely. */ if (ksm_unmerge()) { ksft_test_result_fail("Disabling (unmerging) KSM failed\n"); - goto unmap; + return MAP_FAILED; } if (get_my_merging_pages() > 0) { ksft_test_result_fail("Still pages merged\n"); - goto unmap; + return MAP_FAILED; } map = mmap(NULL, size, PROT_READ|PROT_WRITE, From 0040f2c5531fe852505b3786992e41a5b40f8db6 Mon Sep 17 00:00:00 2001 From: Kuan-Ying Lee Date: Wed, 7 Feb 2024 16:58:51 +0800 Subject: [PATCH 010/435] scripts/gdb/vmalloc: fix vmallocinfo error The patch series "Mitigate a vmap lock contention" removes vmap_area_list, which will break the gdb vmallocinfo command: (gdb) lx-vmallocinfo Python Exception : No symbol "vmap_area_list" in current context. Error occurred in Python: No symbol "vmap_area_list" in current context. So we can instead use vmap_nodes to iterate all vmallocinfo. Link: https://lkml.kernel.org/r/20240207085856.11190-1-Kuan-Ying.Lee@mediatek.com Signed-off-by: Kuan-Ying Lee Cc: Casper Li Cc: AngeloGioacchino Del Regno Cc: Chinwen Chang Cc: Jan Kiszka Cc: Kieran Bingham Cc: Matthias Brugger Cc: Qun-Wei Lin Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- scripts/gdb/linux/vmalloc.py | 56 +++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/scripts/gdb/linux/vmalloc.py b/scripts/gdb/linux/vmalloc.py index d3c8a0274d1e..803f17371052 100644 --- a/scripts/gdb/linux/vmalloc.py +++ b/scripts/gdb/linux/vmalloc.py @@ -29,32 +29,34 @@ class LxVmallocInfo(gdb.Command): if not constants.LX_CONFIG_MMU: raise gdb.GdbError("Requires MMU support") - vmap_area_list = gdb.parse_and_eval('vmap_area_list') - for vmap_area in lists.list_for_each_entry(vmap_area_list, vmap_area_ptr_type, "list"): - if not vmap_area['vm']: - gdb.write("0x%x-0x%x %10d vm_map_ram\n" % (vmap_area['va_start'], vmap_area['va_end'], - vmap_area['va_end'] - vmap_area['va_start'])) - continue - v = vmap_area['vm'] - gdb.write("0x%x-0x%x %10d" % (v['addr'], v['addr'] + v['size'], v['size'])) - if v['caller']: - gdb.write(" %s" % str(v['caller']).split(' ')[-1]) - if v['nr_pages']: - gdb.write(" pages=%d" % v['nr_pages']) - if v['phys_addr']: - gdb.write(" phys=0x%x" % v['phys_addr']) - if v['flags'] & constants.LX_VM_IOREMAP: - gdb.write(" ioremap") - if v['flags'] & constants.LX_VM_ALLOC: - gdb.write(" vmalloc") - if v['flags'] & constants.LX_VM_MAP: - gdb.write(" vmap") - if v['flags'] & constants.LX_VM_USERMAP: - gdb.write(" user") - if v['flags'] & constants.LX_VM_DMA_COHERENT: - gdb.write(" dma-coherent") - if is_vmalloc_addr(v['pages']): - gdb.write(" vpages") - gdb.write("\n") + nr_vmap_nodes = gdb.parse_and_eval('nr_vmap_nodes') + for i in range(0, nr_vmap_nodes): + vn = gdb.parse_and_eval('&vmap_nodes[%d]' % i) + for vmap_area in lists.list_for_each_entry(vn['busy']['head'], vmap_area_ptr_type, "list"): + if not vmap_area['vm']: + gdb.write("0x%x-0x%x %10d vm_map_ram\n" % (vmap_area['va_start'], vmap_area['va_end'], + vmap_area['va_end'] - vmap_area['va_start'])) + continue + v = vmap_area['vm'] + gdb.write("0x%x-0x%x %10d" % (v['addr'], v['addr'] + v['size'], v['size'])) + if v['caller']: + gdb.write(" %s" % str(v['caller']).split(' ')[-1]) + if v['nr_pages']: + gdb.write(" pages=%d" % v['nr_pages']) + if v['phys_addr']: + gdb.write(" phys=0x%x" % v['phys_addr']) + if v['flags'] & constants.LX_VM_IOREMAP: + gdb.write(" ioremap") + if v['flags'] & constants.LX_VM_ALLOC: + gdb.write(" vmalloc") + if v['flags'] & constants.LX_VM_MAP: + gdb.write(" vmap") + if v['flags'] & constants.LX_VM_USERMAP: + gdb.write(" user") + if v['flags'] & constants.LX_VM_DMA_COHERENT: + gdb.write(" dma-coherent") + if is_vmalloc_addr(v['pages']): + gdb.write(" vpages") + gdb.write("\n") LxVmallocInfo() From 30afc8c34290184c023fa79136ce5f8813fc73da Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Wed, 10 Jan 2024 16:46:22 +0800 Subject: [PATCH 011/435] mm/mmap: simplify vma link and unlink The file parameter in the __remove_shared_vm_struct is no longer used, remove it. These functions vma_link() and mmap_region() have some of the same code, introduce vma_link_file() helper function to simplify the code. Link: https://lkml.kernel.org/r/20240110084622.2425927-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Signed-off-by: Andrew Morton --- mm/mmap.c | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index d89770eaab6b..282ed6d0914b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -105,7 +105,7 @@ void vma_set_page_prot(struct vm_area_struct *vma) * Requires inode->i_mapping->i_mmap_rwsem */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, - struct file *file, struct address_space *mapping) + struct address_space *mapping) { if (vma_is_shared_maywrite(vma)) mapping_unmap_writable(mapping); @@ -126,7 +126,7 @@ void unlink_file_vma(struct vm_area_struct *vma) if (file) { struct address_space *mapping = file->f_mapping; i_mmap_lock_write(mapping); - __remove_shared_vm_struct(vma, file, mapping); + __remove_shared_vm_struct(vma, mapping); i_mmap_unlock_write(mapping); } } @@ -392,26 +392,30 @@ static void __vma_link_file(struct vm_area_struct *vma, flush_dcache_mmap_unlock(mapping); } +static void vma_link_file(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct address_space *mapping; + + if (file) { + mapping = file->f_mapping; + i_mmap_lock_write(mapping); + __vma_link_file(vma, mapping); + i_mmap_unlock_write(mapping); + } +} + static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { VMA_ITERATOR(vmi, mm, 0); - struct address_space *mapping = NULL; vma_iter_config(&vmi, vma->vm_start, vma->vm_end); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; vma_start_write(vma); - vma_iter_store(&vmi, vma); - - if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; - i_mmap_lock_write(mapping); - __vma_link_file(vma, mapping); - i_mmap_unlock_write(mapping); - } - + vma_link_file(vma); mm->map_count++; validate_mm(mm); return 0; @@ -519,10 +523,9 @@ static inline void vma_complete(struct vma_prepare *vp, } if (vp->remove && vp->file) { - __remove_shared_vm_struct(vp->remove, vp->file, vp->mapping); + __remove_shared_vm_struct(vp->remove, vp->mapping); if (vp->remove2) - __remove_shared_vm_struct(vp->remove2, vp->file, - vp->mapping); + __remove_shared_vm_struct(vp->remove2, vp->mapping); } else if (vp->insert) { /* * split_vma has split insert from vma, and needs @@ -2891,16 +2894,7 @@ cannot_expand: vma_start_write(vma); vma_iter_store(&vmi, vma); mm->map_count++; - if (vma->vm_file) { - i_mmap_lock_write(vma->vm_file->f_mapping); - if (vma_is_shared_maywrite(vma)) - mapping_allow_writable(vma->vm_file->f_mapping); - - flush_dcache_mmap_lock(vma->vm_file->f_mapping); - vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap); - flush_dcache_mmap_unlock(vma->vm_file->f_mapping); - i_mmap_unlock_write(vma->vm_file->f_mapping); - } + vma_link_file(vma); /* * vma_merge() calls khugepaged_enter_vma() either, the below From 21fff064a26dc93d7ff24e36f35197fec809beee Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 29 Dec 2023 16:22:07 +0800 Subject: [PATCH 012/435] mm: memory: use nth_page() in clear/copy_subpage() The clear and copy of huge gigantic page has converted to use nth_page() to handle the possible discontinuous struct page(SPARSEMEM without VMEMMAP), but not change for the non-gigantic part, fix it too. Link: https://lkml.kernel.org/r/20231229082207.60235-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Zi Yan Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/memory.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 0bfc8b007c01..a25bc8a370fd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6163,7 +6163,7 @@ static int clear_subpage(unsigned long addr, int idx, void *arg) { struct page *page = arg; - clear_user_highpage(page + idx, addr); + clear_user_highpage(nth_page(page, idx), addr); return 0; } @@ -6213,10 +6213,11 @@ struct copy_subpage_arg { static int copy_subpage(unsigned long addr, int idx, void *arg) { struct copy_subpage_arg *copy_arg = arg; + struct page *dst = nth_page(copy_arg->dst, idx); + struct page *src = nth_page(copy_arg->src, idx); - if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx, - addr, copy_arg->vma)) { - memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0); + if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) { + memory_failure_queue(page_to_pfn(src), 0); return -EHWPOISON; } return 0; From 0057db47f8785d1017a04ec1c9590e0af10eb220 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Thu, 28 Dec 2023 06:27:14 +0000 Subject: [PATCH 013/435] mm: list_lru: disable memcg_aware when cgroup.memory is set to "nokmem" Actually, when using a boot time kernel option "cgroup.memory=nokmem", all lru items are inserted to list_lru_node. But for those users who invoke list_lru_init_memcg() to initialize list_lru, list_lru_memcg_aware() returns true. And this brings unneeded operations related to memcg. To make things more convenient, let's disable memcg_aware when cgroup.memory is set to "nokmem". Link: https://lkml.kernel.org/r/20231228062715.338672-1-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/list_lru.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/list_lru.c b/mm/list_lru.c index 35b0147542a9..158781d1d3c2 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -567,6 +567,9 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, lru->shrinker_id = shrinker->id; else lru->shrinker_id = -1; + + if (mem_cgroup_kmem_disabled()) + memcg_aware = false; #endif lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); From a02b8bfe9a60ecb97bab7ba1a10a513bf78a7866 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Thu, 28 Dec 2023 06:27:15 +0000 Subject: [PATCH 014/435] mm: list_lru: remove unused macro list_lru_init_key() list_lru_init_key() isn't used by anyone, remove it to clean up. Link: https://lkml.kernel.org/r/20231228062715.338672-2-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 7675a48a0701..c679e6b293c4 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -62,8 +62,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, #define list_lru_init(lru) \ __list_lru_init((lru), false, NULL, NULL) -#define list_lru_init_key(lru, key) \ - __list_lru_init((lru), false, (key), NULL) #define list_lru_init_memcg(lru, shrinker) \ __list_lru_init((lru), true, NULL, shrinker) From 05976a42b327d4f5a529a5e55cb8bfc2fa0bcca1 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 20 Dec 2023 22:59:42 -0800 Subject: [PATCH 015/435] mm: mmap: no need to call khugepaged_enter_vma() for stack We avoid allocating THP for temporary stack, even though khugepaged_enter_vma() is called for stack VMAs, it actualy returns false. So no need to call it in the first place at all. Link: https://lkml.kernel.org/r/20231221065943.2803551-1-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Yin Fengwei Cc: Christopher Lameter Cc: "Huang, Ying" Cc: Matthew Wilcox (Oracle) Cc: Rik van Riel Cc: kernel test robot Signed-off-by: Andrew Morton --- mm/mmap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 282ed6d0914b..66f534ec90a5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2051,7 +2051,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } anon_vma_unlock_write(vma->anon_vma); - khugepaged_enter_vma(vma, vma->vm_flags); mas_destroy(&mas); validate_mm(mm); return error; @@ -2145,7 +2144,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) } } anon_vma_unlock_write(vma->anon_vma); - khugepaged_enter_vma(vma, vma->vm_flags); mas_destroy(&mas); validate_mm(mm); return error; From b267e1a3e45fcc75fa1434d5c978b36624324e58 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 18:12:16 +0000 Subject: [PATCH 016/435] memcg: convert mem_cgroup_move_charge_pte_range() to use a folio Patch series "Convert memcontrol charge moving to use folios". No part of these patches should change behaviour; all the called functions already convert from page to folio, so this ought to simply be a reduction in the number of calls to compound_head(). This patch (of 4): Remove many calls to compound_head() by calling page_folio() once at the start of each stanza which receives a struct page from 'target'. There should be no change in behaviour here as all the called functions start out by converting the page to its folio. Link: https://lkml.kernel.org/r/20240111181219.3462852-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240111181219.3462852-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Muchun Song Acked-by: Shakeel Butt Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- mm/memcontrol.c | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 61932c9215e7..d1c1bd166307 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5965,23 +5965,22 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, } /** - * mem_cgroup_move_account - move account of the page - * @page: the page + * mem_cgroup_move_account - move account of the folio + * @folio: The folio. * @compound: charge the page as compound or small page - * @from: mem_cgroup which the page is moved from. - * @to: mem_cgroup which the page is moved to. @from != @to. + * @from: mem_cgroup which the folio is moved from. + * @to: mem_cgroup which the folio is moved to. @from != @to. * - * The page must be locked and not on the LRU. + * The folio must be locked and not on the LRU. * * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" * from old cgroup. */ -static int mem_cgroup_move_account(struct page *page, +static int mem_cgroup_move_account(struct folio *folio, bool compound, struct mem_cgroup *from, struct mem_cgroup *to) { - struct folio *folio = page_folio(page); struct lruvec *from_vec, *to_vec; struct pglist_data *pgdat; unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1; @@ -6431,7 +6430,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, spinlock_t *ptl; enum mc_target_type target_type; union mc_target target; - struct page *page; + struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -6441,26 +6440,26 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, } target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); if (target_type == MC_TARGET_PAGE) { - page = target.page; - if (isolate_lru_page(page)) { - if (!mem_cgroup_move_account(page, true, + folio = page_folio(target.page); + if (folio_isolate_lru(folio)) { + if (!mem_cgroup_move_account(folio, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; } - putback_lru_page(page); + folio_putback_lru(folio); } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } else if (target_type == MC_TARGET_DEVICE) { - page = target.page; - if (!mem_cgroup_move_account(page, true, + folio = page_folio(target.page); + if (!mem_cgroup_move_account(folio, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } spin_unlock(ptl); return 0; @@ -6483,28 +6482,28 @@ retry: device = true; fallthrough; case MC_TARGET_PAGE: - page = target.page; + folio = page_folio(target.page); /* * We can have a part of the split pmd here. Moving it * can be done but it would be too convoluted so simply * ignore such a partial THP and keep it in original * memcg. There should be somebody mapping the head. */ - if (PageTransCompound(page)) + if (folio_test_large(folio)) goto put; - if (!device && !isolate_lru_page(page)) + if (!device && !folio_isolate_lru(folio)) goto put; - if (!mem_cgroup_move_account(page, false, + if (!mem_cgroup_move_account(folio, false, mc.from, mc.to)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; } if (!device) - putback_lru_page(page); + folio_putback_lru(folio); put: /* get_mctgt_type() gets & locks the page */ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); break; case MC_TARGET_SWAP: ent = target.ent; From b46777da7d8d82258764da8e0b2273d8dff809d9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 18:12:17 +0000 Subject: [PATCH 017/435] memcg: return the folio in union mc_target All users of target.page convert it to the folio, so we can just return the folio directly and save a few calls to compound_head(). Link: https://lkml.kernel.org/r/20240111181219.3462852-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Muchun Song Acked-by: Shakeel Butt Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- mm/memcontrol.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d1c1bd166307..f52cbc42359e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5873,7 +5873,7 @@ static int mem_cgroup_do_precharge(unsigned long count) } union mc_target { - struct page *page; + struct folio *folio; swp_entry_t ent; }; @@ -6095,7 +6095,7 @@ out: * Return: * * MC_TARGET_NONE - If the pte is not a target for move charge. * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for - * move charge. If @target is not NULL, the page is stored in target->page + * move charge. If @target is not NULL, the folio is stored in target->folio * with extra refcnt taken (Caller should release it). * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a * target for charge migration. If @target is not NULL, the entry is @@ -6160,7 +6160,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, is_device_coherent_page(page)) ret = MC_TARGET_DEVICE; if (target) - target->page = page; + target->folio = page_folio(page); } if (!ret || !target) { if (target) @@ -6210,7 +6210,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, put_page(page); return MC_TARGET_NONE; } - target->page = page; + target->folio = page_folio(page); } } return ret; @@ -6440,7 +6440,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, } target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); if (target_type == MC_TARGET_PAGE) { - folio = page_folio(target.page); + folio = target.folio; if (folio_isolate_lru(folio)) { if (!mem_cgroup_move_account(folio, true, mc.from, mc.to)) { @@ -6452,7 +6452,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, folio_unlock(folio); folio_put(folio); } else if (target_type == MC_TARGET_DEVICE) { - folio = page_folio(target.page); + folio = target.folio; if (!mem_cgroup_move_account(folio, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; @@ -6482,7 +6482,7 @@ retry: device = true; fallthrough; case MC_TARGET_PAGE: - folio = page_folio(target.page); + folio = target.folio; /* * We can have a part of the split pmd here. Moving it * can be done but it would be too convoluted so simply From b67fa6e47bffd6edf417a789c7010cc32a47a41b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 18:12:18 +0000 Subject: [PATCH 018/435] memcg: use a folio in get_mctgt_type Replace seven calls to compound_head() with one. We still use the page as page_mapped() is different from folio_mapped(). Link: https://lkml.kernel.org/r/20240111181219.3462852-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Muchun Song Acked-by: Shakeel Butt Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- mm/memcontrol.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f52cbc42359e..a5bc34d44dbd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6109,6 +6109,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { struct page *page = NULL; + struct folio *folio; enum mc_target_type ret = MC_TARGET_NONE; swp_entry_t ent = { .val = 0 }; @@ -6123,9 +6124,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, ptent, &ent); + if (page) + folio = page_folio(page); if (target && page) { - if (!trylock_page(page)) { - put_page(page); + if (!folio_trylock(folio)) { + folio_put(folio); return ret; } /* @@ -6140,8 +6143,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, * Alas, skip moving the page in this case. */ if (!pte_present(ptent) && page_mapped(page)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return ret; } } @@ -6154,18 +6157,18 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, * mem_cgroup_move_account() checks the page is valid or * not under LRU exclusion. */ - if (page_memcg(page) == mc.from) { + if (folio_memcg(folio) == mc.from) { ret = MC_TARGET_PAGE; - if (is_device_private_page(page) || - is_device_coherent_page(page)) + if (folio_is_device_private(folio) || + folio_is_device_coherent(folio)) ret = MC_TARGET_DEVICE; if (target) - target->folio = page_folio(page); + target->folio = folio; } if (!ret || !target) { if (target) - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } } /* From f6c7590b4e05c7963f3e56a0ff07df2c520f78cd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 18:12:19 +0000 Subject: [PATCH 019/435] memcg: use a folio in get_mctgt_type_thp Replace five calls to compound_head() with one. Link: https://lkml.kernel.org/r/20240111181219.3462852-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Muchun Song Acked-by: Shakeel Butt Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- mm/memcontrol.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a5bc34d44dbd..db92401257f7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6194,6 +6194,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, union mc_target *target) { struct page *page = NULL; + struct folio *folio; enum mc_target_type ret = MC_TARGET_NONE; if (unlikely(is_swap_pmd(pmd))) { @@ -6203,17 +6204,18 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, } page = pmd_page(pmd); VM_BUG_ON_PAGE(!page || !PageHead(page), page); + folio = page_folio(page); if (!(mc.flags & MOVE_ANON)) return ret; - if (page_memcg(page) == mc.from) { + if (folio_memcg(folio) == mc.from) { ret = MC_TARGET_PAGE; if (target) { - get_page(page); - if (!trylock_page(page)) { - put_page(page); + folio_get(folio); + if (!folio_trylock(folio)) { + folio_put(folio); return MC_TARGET_NONE; } - target->folio = page_folio(page); + target->folio = folio; } } return ret; From 5662400a9ac03f38ef3b84e4ff9a640a4604bef9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 15:24:20 +0000 Subject: [PATCH 020/435] mm: add pfn_swap_entry_folio() Patch series "mm: convert mm counter to take a folio", v3. Make sure all mm_counter() and mm_counter_file() callers have a folio, then convert mm counter functions to take a folio, which saves some compound_head() calls. This patch (of 10): Thanks to the compound_head() hidden inside PageLocked(), this saves a call to compound_head() over calling page_folio(pfn_swap_entry_to_page()) Link: https://lkml.kernel.org/r/20240111152429.3374566-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240111152429.3374566-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Kefeng Wang Signed-off-by: Andrew Morton --- include/linux/swapops.h | 13 +++++++++++++ mm/filemap.c | 2 +- mm/huge_memory.c | 2 +- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index bff1e8d97de0..48b700ba1d18 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -468,6 +468,19 @@ static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) return p; } +static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) +{ + struct folio *folio = pfn_folio(swp_offset_pfn(entry)); + + /* + * Any use of migration entries may only occur while the + * corresponding folio is locked + */ + BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio)); + + return folio; +} + /* * A pfn swap entry is a special type of swap entry that always has a pfn stored * in the swap offset. They are used to represent unaddressable device memory diff --git a/mm/filemap.c b/mm/filemap.c index 0d7e20edf46f..142864338ca4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1354,7 +1354,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) unsigned long pflags; bool in_thrashing; wait_queue_head_t *q; - struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); + struct folio *folio = pfn_swap_entry_folio(entry); q = folio_waitqueue(folio); if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 94c958f7ebb5..5468b2f97cbf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2045,7 +2045,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (is_swap_pmd(*pmd)) { swp_entry_t entry = pmd_to_swp_entry(*pmd); - struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); + struct folio *folio = pfn_swap_entry_folio(entry); pmd_t newpmd; VM_BUG_ON(!is_pmd_migration_entry(*pmd)); From 7101422464ecbca75540feab65b465dee3283438 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 15:24:21 +0000 Subject: [PATCH 021/435] proc: use pfn_swap_entry_folio where obvious These callers only pass the result to PageAnon(), so we can save the extra call to compound_head() by using pfn_swap_entry_folio(). Link: https://lkml.kernel.org/r/20240111152429.3374566-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Kefeng Wang Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3f78ebbb795f..ac6ea2cc2ee8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1807,7 +1807,7 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, if (p->masks_of_interest & PAGE_IS_FILE) { swp = pte_to_swp_entry(pte); if (is_pfn_swap_entry(swp) && - !PageAnon(pfn_swap_entry_to_page(swp))) + !folio_test_anon(pfn_swap_entry_folio(swp))) categories |= PAGE_IS_FILE; } if (pte_swp_soft_dirty(pte)) @@ -1873,7 +1873,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, if (p->masks_of_interest & PAGE_IS_FILE) { swp = pmd_to_swp_entry(pmd); if (is_pfn_swap_entry(swp) && - !PageAnon(pfn_swap_entry_to_page(swp))) + !folio_test_anon(pfn_swap_entry_folio(swp))) categories |= PAGE_IS_FILE; } } From f2d571b0b207087442d1c3fca5189ee1cb34648e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 15:24:22 +0000 Subject: [PATCH 022/435] mprotect: use pfn_swap_entry_folio We only want to know whether the folio is anonymous, so use pfn_swap_entry_folio() and save a call to compound_head(). Link: https://lkml.kernel.org/r/20240111152429.3374566-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/mprotect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 81991102f785..f8a4544b4601 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -198,13 +198,13 @@ static long change_pte_range(struct mmu_gather *tlb, pte_t newpte; if (is_writable_migration_entry(entry)) { - struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = pfn_swap_entry_folio(entry); /* * A protection check is difficult so * just be safe and disable write */ - if (PageAnon(page)) + if (folio_test_anon(folio)) entry = make_readable_exclusive_migration_entry( swp_offset(entry)); else From 0601ac883a814930c3a38d39a115fdc05179d886 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:23 +0000 Subject: [PATCH 023/435] s390: use pfn_swap_entry_folio() in ptep_zap_swap_entry() Call pfn_swap_entry_folio() in ptep_zap_swap_entry() as preparation for converting mm counter functions to take a folio. Link: https://lkml.kernel.org/r/20240111152429.3374566-5-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- arch/s390/mm/pgtable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 99422926efe1..7e5dd4b17664 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -721,9 +721,9 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) if (!non_swap_entry(entry)) dec_mm_counter(mm, MM_SWAPENTS); else if (is_migration_entry(entry)) { - struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = pfn_swap_entry_folio(entry); - dec_mm_counter(mm, mm_counter(page)); + dec_mm_counter(mm, mm_counter(&folio->page)); } free_swap_and_cache(entry); } From 439992ff4637ad5042ca8ee1f659fae24890de3e Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:24 +0000 Subject: [PATCH 024/435] mm: use pfn_swap_entry_folio() in __split_huge_pmd_locked() Call pfn_swap_entry_folio() in __split_huge_pmd_locked() as preparation for converting mm counter functions to take a folio. Link: https://lkml.kernel.org/r/20240111152429.3374566-6-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5468b2f97cbf..33b720037ab7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2442,7 +2442,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); - page = pfn_swap_entry_to_page(entry); + folio = pfn_swap_entry_folio(entry); } else { page = pmd_page(old_pmd); folio = page_folio(page); @@ -2453,7 +2453,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, folio_remove_rmap_pmd(folio, page, vma); folio_put(folio); } - add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); + add_mm_counter(mm, mm_counter_file(&folio->page), -HPAGE_PMD_NR); return; } From 0103b27a6b826729dc1500d013e53ebed48980b3 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:25 +0000 Subject: [PATCH 025/435] mm: use pfn_swap_entry_to_folio() in zap_huge_pmd() Call pfn_swap_entry_to_folio() in zap_huge_pmd() as preparation for converting mm counter functions to take a folio. Saves a call to compound_head() embedded inside PageAnon(). Link: https://lkml.kernel.org/r/20240111152429.3374566-7-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/huge_memory.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 33b720037ab7..7a28a7db08ea 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1905,12 +1905,14 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); } else { - struct page *page = NULL; + struct folio *folio = NULL; int flush_needed = 1; if (pmd_present(orig_pmd)) { - page = pmd_page(orig_pmd); - folio_remove_rmap_pmd(page_folio(page), page, vma); + struct page *page = pmd_page(orig_pmd); + + folio = page_folio(page); + folio_remove_rmap_pmd(folio, page, vma); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(!PageHead(page), page); } else if (thp_migration_supported()) { @@ -1918,23 +1920,24 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); entry = pmd_to_swp_entry(orig_pmd); - page = pfn_swap_entry_to_page(entry); + folio = pfn_swap_entry_folio(entry); flush_needed = 0; } else WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); - if (PageAnon(page)) { + if (folio_test_anon(folio)) { zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); - add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR); + add_mm_counter(tlb->mm, mm_counter_file(&folio->page), + -HPAGE_PMD_NR); } spin_unlock(ptl); if (flush_needed) - tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); + tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); } return 1; } From 530c2a0da0b440bec4af3dae5bd7110f77962e9b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:26 +0000 Subject: [PATCH 026/435] mm: use pfn_swap_entry_folio() in copy_nonpresent_pte() Call pfn_swap_entry_folio() as preparation for converting mm counter functions to take a folio. Link: https://lkml.kernel.org/r/20240111152429.3374566-8-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index a25bc8a370fd..c8dd24941914 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -806,9 +806,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, } rss[MM_SWAPENTS]++; } else if (is_migration_entry(entry)) { - page = pfn_swap_entry_to_page(entry); + folio = pfn_swap_entry_folio(entry); - rss[mm_counter(page)]++; + rss[mm_counter(&folio->page)]++; if (!is_readable_migration_entry(entry) && is_cow_mapping(vm_flags)) { From eabafaaa957553142cdafc8ae804fb679e5a5f5e Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:27 +0000 Subject: [PATCH 027/435] mm: convert to should_zap_page() to should_zap_folio() Make should_zap_page() take a folio and rename it to should_zap_folio() as preparation for converting mm counter functions to take a folio. Saves a call to compound_head() hidden inside PageAnon(). [wangkefeng.wang@huawei.com: fix used-uninitialized warning] Link: https://lkml.kernel.org/r/962a7993-fce9-4de8-85cd-25e290f25736@huawei.com Link: https://lkml.kernel.org/r/20240111152429.3374566-9-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/memory.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index c8dd24941914..4bed82009ea7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1369,19 +1369,20 @@ static inline bool should_zap_cows(struct zap_details *details) return details->even_cows; } -/* Decides whether we should zap this page with the page pointer specified */ -static inline bool should_zap_page(struct zap_details *details, struct page *page) +/* Decides whether we should zap this folio with the folio pointer specified */ +static inline bool should_zap_folio(struct zap_details *details, + struct folio *folio) { - /* If we can make a decision without *page.. */ + /* If we can make a decision without *folio.. */ if (should_zap_cows(details)) return true; - /* E.g. the caller passes NULL for the case of a zero page */ - if (!page) + /* E.g. the caller passes NULL for the case of a zero folio */ + if (!folio) return true; - /* Otherwise we should only zap non-anon pages */ - return !PageAnon(page); + /* Otherwise we should only zap non-anon folios */ + return !folio_test_anon(folio); } static inline bool zap_drop_file_uffd_wp(struct zap_details *details) @@ -1434,7 +1435,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, arch_enter_lazy_mmu_mode(); do { pte_t ptent = ptep_get(pte); - struct folio *folio; + struct folio *folio = NULL; struct page *page; if (pte_none(ptent)) @@ -1447,7 +1448,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, unsigned int delay_rmap; page = vm_normal_page(vma, addr, ptent); - if (unlikely(!should_zap_page(details, page))) + if (page) + folio = page_folio(page); + + if (unlikely(!should_zap_folio(details, folio))) continue; ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); @@ -1460,7 +1464,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, continue; } - folio = page_folio(page); delay_rmap = 0; if (!folio_test_anon(folio)) { if (pte_dirty(ptent)) { @@ -1492,7 +1495,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, is_device_exclusive_entry(entry)) { page = pfn_swap_entry_to_page(entry); folio = page_folio(page); - if (unlikely(!should_zap_page(details, page))) + if (unlikely(!should_zap_folio(details, folio))) continue; /* * Both device private/exclusive mappings should only @@ -1513,10 +1516,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (unlikely(!free_swap_and_cache(entry))) print_bad_pte(vma, addr, ptent, NULL); } else if (is_migration_entry(entry)) { - page = pfn_swap_entry_to_page(entry); - if (!should_zap_page(details, page)) + folio = pfn_swap_entry_folio(entry); + if (!should_zap_folio(details, folio)) continue; - rss[mm_counter(page)]--; + rss[mm_counter(&folio->page)]--; } else if (pte_marker_entry_uffd_wp(entry)) { /* * For anon: always drop the marker; for file: only From a23f517b0e1554467b0eb3bc1ebcb4d626217302 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:28 +0000 Subject: [PATCH 028/435] mm: convert mm_counter() to take a folio Now all callers of mm_counter() have a folio, convert mm_counter() to take a folio. Saves a call to compound_head() hidden inside PageAnon(). Link: https://lkml.kernel.org/r/20240111152429.3374566-10-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- arch/s390/mm/pgtable.c | 2 +- include/linux/mm.h | 6 +++--- mm/memory.c | 10 +++++----- mm/rmap.c | 8 ++++---- mm/userfaultfd.c | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 7e5dd4b17664..b71432b15d66 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -723,7 +723,7 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) else if (is_migration_entry(entry)) { struct folio *folio = pfn_swap_entry_folio(entry); - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); } free_swap_and_cache(entry); } diff --git a/include/linux/mm.h b/include/linux/mm.h index f5a97dec5169..22e597b36b38 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2603,11 +2603,11 @@ static inline int mm_counter_file(struct page *page) return MM_FILEPAGES; } -static inline int mm_counter(struct page *page) +static inline int mm_counter(struct folio *folio) { - if (PageAnon(page)) + if (folio_test_anon(folio)) return MM_ANONPAGES; - return mm_counter_file(page); + return mm_counter_file(&folio->page); } static inline unsigned long get_mm_rss(struct mm_struct *mm) diff --git a/mm/memory.c b/mm/memory.c index 4bed82009ea7..87ef98099847 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -808,7 +808,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, } else if (is_migration_entry(entry)) { folio = pfn_swap_entry_folio(entry); - rss[mm_counter(&folio->page)]++; + rss[mm_counter(folio)]++; if (!is_readable_migration_entry(entry) && is_cow_mapping(vm_flags)) { @@ -840,7 +840,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * keep things as they are. */ folio_get(folio); - rss[mm_counter(page)]++; + rss[mm_counter(folio)]++; /* Cannot fail as these pages cannot get pinned. */ folio_try_dup_anon_rmap_pte(folio, page, src_vma); @@ -1476,7 +1476,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (pte_young(ptent) && likely(vma_has_recency(vma))) folio_mark_accessed(folio); } - rss[mm_counter(page)]--; + rss[mm_counter(folio)]--; if (!delay_rmap) { folio_remove_rmap_pte(folio, page, vma); if (unlikely(page_mapcount(page) < 0)) @@ -1504,7 +1504,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, * see zap_install_uffd_wp_if_needed(). */ WARN_ON_ONCE(!vma_is_anonymous(vma)); - rss[mm_counter(page)]--; + rss[mm_counter(folio)]--; if (is_device_private_entry(entry)) folio_remove_rmap_pte(folio, page, vma); folio_put(folio); @@ -1519,7 +1519,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, folio = pfn_swap_entry_folio(entry); if (!should_zap_folio(details, folio)) continue; - rss[mm_counter(&folio->page)]--; + rss[mm_counter(folio)]--; } else if (pte_marker_entry_uffd_wp(entry)) { /* * For anon: always drop the marker; for file: only diff --git a/mm/rmap.c b/mm/rmap.c index f5d43edad529..4648cf1d8178 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1780,7 +1780,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); } else { - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } @@ -1795,7 +1795,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * migration) will not expect userfaults on already * copied pages. */ - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); } else if (folio_test_anon(folio)) { swp_entry_t entry = page_swap_entry(subpage); pte_t swp_pte; @@ -2181,7 +2181,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); } else { - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } @@ -2196,7 +2196,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * migration) will not expect userfaults on already * copied pages. */ - dec_mm_counter(mm, mm_counter(&folio->page)); + dec_mm_counter(mm, mm_counter(folio)); } else { swp_entry_t entry; pte_t swp_pte; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 7cf7d4384259..ae80c3714829 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -124,7 +124,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, * Must happen after rmap, as mm_counter() checks mapping (via * PageAnon()), which is set by __page_set_anon_rmap(). */ - inc_mm_counter(dst_mm, mm_counter(page)); + inc_mm_counter(dst_mm, mm_counter(folio)); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); From 6b27cc6c66abf0f0b091a95ca1ad4e0fc68c11fd Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:29 +0000 Subject: [PATCH 029/435] mm: convert mm_counter_file() to take a folio Now all callers of mm_counter_file() have a folio, convert mm_counter_file() to take a folio. Saves a call to compound_head() hidden inside PageSwapBacked(). Link: https://lkml.kernel.org/r/20240111152429.3374566-11-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 ++++---- kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 4 ++-- mm/khugepaged.c | 4 ++-- mm/memory.c | 10 +++++----- mm/rmap.c | 2 +- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 22e597b36b38..ac6b71cbdffb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2595,10 +2595,10 @@ static inline void dec_mm_counter(struct mm_struct *mm, int member) mm_trace_rss_stat(mm, member); } -/* Optimized variant when page is already known not to be PageAnon */ -static inline int mm_counter_file(struct page *page) +/* Optimized variant when folio is already known not to be anon */ +static inline int mm_counter_file(struct folio *folio) { - if (PageSwapBacked(page)) + if (folio_test_swapbacked(folio)) return MM_SHMEMPAGES; return MM_FILEPAGES; } @@ -2607,7 +2607,7 @@ static inline int mm_counter(struct folio *folio) { if (folio_test_anon(folio)) return MM_ANONPAGES; - return mm_counter_file(&folio->page); + return mm_counter_file(folio); } static inline unsigned long get_mm_rss(struct mm_struct *mm) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 929e98c62965..e4834d23e1d1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -188,7 +188,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, dec_mm_counter(mm, MM_ANONPAGES); if (!folio_test_anon(old_folio)) { - dec_mm_counter(mm, mm_counter_file(old_page)); + dec_mm_counter(mm, mm_counter_file(old_folio)); inc_mm_counter(mm, MM_ANONPAGES); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7a28a7db08ea..f005f0424735 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1931,7 +1931,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, } else { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); - add_mm_counter(tlb->mm, mm_counter_file(&folio->page), + add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PMD_NR); } @@ -2456,7 +2456,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, folio_remove_rmap_pmd(folio, page, vma); folio_put(folio); } - add_mm_counter(mm, mm_counter_file(&folio->page), -HPAGE_PMD_NR); + add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); return; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 2b219acb528e..fe43fbc44525 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1634,7 +1634,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, /* step 3: set proper refcount and mm_counters. */ if (nr_ptes) { folio_ref_sub(folio, nr_ptes); - add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes); + add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); } /* step 4: remove empty page table */ @@ -1665,7 +1665,7 @@ abort: if (nr_ptes) { flush_tlb_mm(mm); folio_ref_sub(folio, nr_ptes); - add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes); + add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); } if (start_pte) pte_unmap_unlock(start_pte, ptl); diff --git a/mm/memory.c b/mm/memory.c index 87ef98099847..5e608edfe330 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -966,7 +966,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, } else if (page) { folio_get(folio); folio_dup_file_rmap_pte(folio, page); - rss[mm_counter_file(page)]++; + rss[mm_counter_file(folio)]++; } /* @@ -1873,7 +1873,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, return -EBUSY; /* Ok, finally just insert the thing.. */ folio_get(folio); - inc_mm_counter(vma->vm_mm, mm_counter_file(page)); + inc_mm_counter(vma->vm_mm, mm_counter_file(folio)); folio_add_file_rmap_pte(folio, page, vma); set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); return 0; @@ -3178,7 +3178,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { if (old_folio) { if (!folio_test_anon(old_folio)) { - dec_mm_counter(mm, mm_counter_file(&old_folio->page)); + dec_mm_counter(mm, mm_counter_file(old_folio)); inc_mm_counter(mm, MM_ANONPAGES); } } else { @@ -4483,7 +4483,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); + add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR); folio_add_file_rmap_pmd(folio, page, vma); /* @@ -4546,7 +4546,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, folio_add_new_anon_rmap(folio, vma, addr); folio_add_lru_vma(folio, vma); } else { - add_mm_counter(vma->vm_mm, mm_counter_file(page), nr); + add_mm_counter(vma->vm_mm, mm_counter_file(folio), nr); folio_add_file_rmap_ptes(folio, page, nr, vma); } set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr); diff --git a/mm/rmap.c b/mm/rmap.c index 4648cf1d8178..1cf2bffa48ed 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1903,7 +1903,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * * See Documentation/mm/mmu_notifier.rst */ - dec_mm_counter(mm, mm_counter_file(&folio->page)); + dec_mm_counter(mm, mm_counter_file(folio)); } discard: if (unlikely(folio_test_hugetlb(folio))) From cabbb6d51e2af4fc2f3c763f58a12c628f228987 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Thu, 11 Jan 2024 08:45:33 +0000 Subject: [PATCH 030/435] fs/proc/task_mmu.c: add_to_pagemap: remove useless parameter addr Function parameter addr of add_to_pagemap() is useless. Remove it. Link: https://lkml.kernel.org/r/20240111084533.40038-1-teawaterz@linux.alibaba.com Signed-off-by: Hui Zhu Reviewed-by: Muhammad Usama Anjum Tested-by: Muhammad Usama Anjum Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kefeng Wang Cc: Liam R. Howlett Cc: Peter Xu Cc: Ryan Roberts Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ac6ea2cc2ee8..23fbab954c20 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1352,8 +1352,7 @@ static inline pagemap_entry_t make_pme(u64 frame, u64 flags) return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; } -static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, - struct pagemapread *pm) +static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) { pm->buffer[pm->pos++] = *pme; if (pm->pos >= pm->len) @@ -1380,7 +1379,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, hole_end = end; for (; addr < hole_end; addr += PAGE_SIZE) { - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) goto out; } @@ -1392,7 +1391,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, if (vma->vm_flags & VM_SOFTDIRTY) pme = make_pme(0, PM_SOFT_DIRTY); for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) goto out; } @@ -1519,7 +1518,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, for (; addr != end; addr += PAGE_SIZE) { pagemap_entry_t pme = make_pme(frame, flags); - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) break; if (pm->show_pfn) { @@ -1547,7 +1546,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, pagemap_entry_t pme; pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) break; } @@ -1597,7 +1596,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, for (; addr != end; addr += PAGE_SIZE) { pagemap_entry_t pme = make_pme(frame, flags); - err = add_to_pagemap(addr, &pme, pm); + err = add_to_pagemap(&pme, pm); if (err) return err; if (pm->show_pfn && (flags & PM_PRESENT)) From e03c16fb4af1dfc615a4e1f51be0d5fe5840b904 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 15 Jan 2024 11:25:22 +0100 Subject: [PATCH 031/435] readahead: use ilog2 instead of a while loop in page_cache_ra_order() A while loop is used to adjust the new_order to be lower than the ra->size. ilog2 could be used to do the same instead of using a loop. ilog2 typically resolves to a bit scan reverse instruction. This is particularly useful when ra->size is smaller than the 2^new_order as it resolves in one instruction instead of looping to find the new_order. No functional changes. Link: https://lkml.kernel.org/r/20240115102523.2336742-1-kernel@pankajraghav.com Signed-off-by: Pankaj Raghav Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/readahead.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 2648ec4f0494..1e74455f908e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -500,10 +500,8 @@ void page_cache_ra_order(struct readahead_control *ractl, if (new_order < MAX_PAGECACHE_ORDER) { new_order += 2; - if (new_order > MAX_PAGECACHE_ORDER) - new_order = MAX_PAGECACHE_ORDER; - while ((1 << new_order) > ra->size) - new_order--; + new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); + new_order = min_t(unsigned int, new_order, ilog2(ra->size)); } filemap_invalidate_lock_shared(mapping); From 8409a385a6b41b7042aae1a4e4053963da997ffe Mon Sep 17 00:00:00 2001 From: Ronald Monthero Date: Tue, 16 Jan 2024 23:31:45 +1000 Subject: [PATCH 032/435] mm/zswap: improve with alloc_workqueue() call The core-api create_workqueue is deprecated, this patch replaces the create_workqueue with alloc_workqueue. The previous implementation workqueue of zswap was a bounded workqueue, this patch uses alloc_workqueue() to create an unbounded workqueue. The WQ_UNBOUND attribute is desirable making the workqueue not localized to a specific cpu so that the scheduler is free to exercise improvisations in any demanding scenarios for offloading cpu time slices for workqueues. For example if any other workqueues of the same primary cpu had to be served which are WQ_HIGHPRI and WQ_CPU_INTENSIVE. Also Unbound workqueue happens to be more efficient in a system during memory pressure scenarios in comparison to a bounded workqueue. shrink_wq = alloc_workqueue("zswap-shrink", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); Overall the change suggested in this patch should be seamless and does not alter the existing behavior, other than the improvisation to be an unbounded workqueue. Link: https://lkml.kernel.org/r/20240116133145.12454-1-debug.penguin32@gmail.com Signed-off-by: Ronald Monthero Acked-by: Nhat Pham Acked-by: Johannes Weiner Cc: Chris Li Cc: Dan Streetman Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/zswap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index db4625af65fb..2c3d77c6fe72 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1887,7 +1887,8 @@ static int zswap_setup(void) zswap_enabled = false; } - shrink_wq = create_workqueue("zswap-shrink"); + shrink_wq = alloc_workqueue("zswap-shrink", + WQ_UNBOUND|WQ_MEM_RECLAIM, 1); if (!shrink_wq) goto fallback_fail; From 2444172cfde45a3d6e655f50c620727c76bab4a2 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 16 Jan 2024 14:12:35 +0000 Subject: [PATCH 033/435] tools/mm: add thpmaps script to dump THP usage info With the proliferation of large folios for file-backed memory, and more recently the introduction of multi-size THP for anonymous memory, it is becoming useful to be able to see exactly how large folios are mapped into processes. For some architectures (e.g. arm64), if most memory is mapped using contpte-sized and -aligned blocks, TLB usage can be optimized so it's useful to see where these requirements are and are not being met. thpmaps is a Python utility that reads /proc//smaps, /proc//pagemap and /proc/kpageflags to print information about how transparent huge pages (both file and anon) are mapped to a specified process or cgroup. It aims to help users debug and optimize their workloads. In future we may wish to introduce stats directly into the kernel (e.g. smaps or similar), but for now this provides a short term solution without the need to introduce any new ABI. Run with help option for a full listing of the arguments: # ./thpmaps --help --8<-- usage: thpmaps [-h] [--pid pid | --cgroup path] [--rollup] [--cont size[KMG]] [--inc-smaps] [--inc-empty] [--periodic sleep_ms] Prints information about how transparent huge pages are mapped, either system-wide, or for a specified process or cgroup. When run with --pid, the user explicitly specifies the set of pids to scan. e.g. "--pid 10 [--pid 134 ...]". When run with --cgroup, the user passes either a v1 or v2 cgroup and all pids that belong to the cgroup subtree are scanned. When run with neither --pid nor --cgroup, the full set of pids on the system is gathered from /proc and scanned as if the user had provided "--pid 1 --pid 2 ...". A default set of statistics is always generated for THP mappings. However, it is also possible to generate additional statistics for "contiguous block mappings" where the block size is user-defined. Statistics are maintained independently for anonymous and file-backed (pagecache) memory and are shown both in kB and as a percentage of either total anonymous or total file-backed memory as appropriate. THP Statistics -------------- Statistics are always generated for fully- and contiguously-mapped THPs whose mapping address is aligned to their size, for each supported by the system. Separate counters describe THPs mapped by PTE vs those mapped by PMD. (Although note a THP can only be mapped by PMD if it is PMD-sized): - anon-thp-pte-aligned-kB - file-thp-pte-aligned-kB - anon-thp-pmd-aligned-kB - file-thp-pmd-aligned-kB Similarly, statistics are always generated for fully- and contiguously- mapped THPs whose mapping address is *not* aligned to their size, for each supported by the system. Due to the unaligned mapping, it is impossible to map by PMD, so there are only PTE counters for this case: - anon-thp-pte-unaligned-kB - file-thp-pte-unaligned-kB Statistics are also always generated for mapped pages that belong to a THP but where the is THP is *not* fully- and contiguously- mapped. These "partial" mappings are all counted in the same counter regardless of the size of the THP that is partially mapped: - anon-thp-pte-partial - file-thp-pte-partial Contiguous Block Statistics --------------------------- An optional, additional set of statistics is generated for every contiguous block size specified with `--cont `. These statistics show how much memory is mapped in contiguous blocks of and also aligned to . A given contiguous block must all belong to the same THP, but there is no requirement for it to be the *whole* THP. Separate counters describe contiguous blocks mapped by PTE vs those mapped by PMD: - anon-cont-pte-aligned-kB - file-cont-pte-aligned-kB - anon-cont-pmd-aligned-kB - file-cont-pmd-aligned-kB As an example, if monitoring 64K contiguous blocks (--cont 64K), there are a number of sources that could provide such blocks: a fully- and contiguously-mapped 64K THP that is aligned to a 64K boundary would provide 1 block. A fully- and contiguously-mapped 128K THP that is aligned to at least a 64K boundary would provide 2 blocks. Or a 128K THP that maps its first 100K, but contiguously and starting at a 64K boundary would provide 1 block. A fully- and contiguously-mapped 2M THP would provide 32 blocks. There are many other possible permutations. options: -h, --help show this help message and exit --pid pid Process id of the target process. Maybe issued multiple times to scan multiple processes. --pid and --cgroup are mutually exclusive. If neither are provided, all processes are scanned to provide system-wide information. --cgroup path Path to the target cgroup in sysfs. Iterates over every pid in the cgroup and its children. --pid and --cgroup are mutually exclusive. If neither are provided, all processes are scanned to provide system-wide information. --rollup Sum the per-vma statistics to provide a summary over the whole system, process or cgroup. --cont size[KMG] Adds stats for memory that is mapped in contiguous blocks of and also aligned to . May be issued multiple times to track multiple sized blocks. Useful to infer e.g. arm64 contpte and hpa mappings. Size must be a power-of-2 number of pages. --inc-smaps Include all numerical, additive /proc//smaps stats in the output. --inc-empty Show all statistics including those whose value is 0. --periodic sleep_ms Run in a loop, polling every sleep_ms milliseconds. Requires root privilege to access pagemap and kpageflags. --8<-- Example command to summarise fully and partially mapped THPs and 64K contiguous blocks over all VMAs in all processes in the system (--inc-empty forces printing stats that are 0): # ./thpmaps --cont 64K --rollup --inc-empty --8<-- anon-thp-pmd-aligned-2048kB: 139264 kB ( 6%) file-thp-pmd-aligned-2048kB: 0 kB ( 0%) anon-thp-pte-aligned-16kB: 0 kB ( 0%) anon-thp-pte-aligned-32kB: 0 kB ( 0%) anon-thp-pte-aligned-64kB: 72256 kB ( 3%) anon-thp-pte-aligned-128kB: 0 kB ( 0%) anon-thp-pte-aligned-256kB: 0 kB ( 0%) anon-thp-pte-aligned-512kB: 0 kB ( 0%) anon-thp-pte-aligned-1024kB: 0 kB ( 0%) anon-thp-pte-aligned-2048kB: 0 kB ( 0%) anon-thp-pte-unaligned-16kB: 0 kB ( 0%) anon-thp-pte-unaligned-32kB: 0 kB ( 0%) anon-thp-pte-unaligned-64kB: 0 kB ( 0%) anon-thp-pte-unaligned-128kB: 0 kB ( 0%) anon-thp-pte-unaligned-256kB: 0 kB ( 0%) anon-thp-pte-unaligned-512kB: 0 kB ( 0%) anon-thp-pte-unaligned-1024kB: 0 kB ( 0%) anon-thp-pte-unaligned-2048kB: 0 kB ( 0%) anon-thp-pte-partial: 63232 kB ( 3%) file-thp-pte-aligned-16kB: 809024 kB (47%) file-thp-pte-aligned-32kB: 43168 kB ( 3%) file-thp-pte-aligned-64kB: 98496 kB ( 6%) file-thp-pte-aligned-128kB: 17536 kB ( 1%) file-thp-pte-aligned-256kB: 0 kB ( 0%) file-thp-pte-aligned-512kB: 0 kB ( 0%) file-thp-pte-aligned-1024kB: 0 kB ( 0%) file-thp-pte-aligned-2048kB: 0 kB ( 0%) file-thp-pte-unaligned-16kB: 21712 kB ( 1%) file-thp-pte-unaligned-32kB: 704 kB ( 0%) file-thp-pte-unaligned-64kB: 896 kB ( 0%) file-thp-pte-unaligned-128kB: 44928 kB ( 3%) file-thp-pte-unaligned-256kB: 0 kB ( 0%) file-thp-pte-unaligned-512kB: 0 kB ( 0%) file-thp-pte-unaligned-1024kB: 0 kB ( 0%) file-thp-pte-unaligned-2048kB: 0 kB ( 0%) file-thp-pte-partial: 9252 kB ( 1%) anon-cont-pmd-aligned-64kB: 139264 kB ( 6%) file-cont-pmd-aligned-64kB: 0 kB ( 0%) anon-cont-pte-aligned-64kB: 100672 kB ( 4%) file-cont-pte-aligned-64kB: 161856 kB ( 9%) --8<-- Link: https://lkml.kernel.org/r/20240116141235.960842-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: Barry Song Cc: Alistair Popple Cc: David Hildenbrand Cc: John Hubbard Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: William Kucharski Cc: Zenghui Yu Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/mm/Makefile | 9 +- tools/mm/thpmaps | 675 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 680 insertions(+), 4 deletions(-) create mode 100644 tools/mm/thpmaps diff --git a/tools/mm/Makefile b/tools/mm/Makefile index 1c5606cc3334..7bb03606b9ea 100644 --- a/tools/mm/Makefile +++ b/tools/mm/Makefile @@ -3,7 +3,8 @@ # include ../scripts/Makefile.include -TARGETS=page-types slabinfo page_owner_sort +BUILD_TARGETS=page-types slabinfo page_owner_sort +INSTALL_TARGETS = $(BUILD_TARGETS) thpmaps LIB_DIR = ../lib/api LIBS = $(LIB_DIR)/libapi.a @@ -11,9 +12,9 @@ LIBS = $(LIB_DIR)/libapi.a CFLAGS += -Wall -Wextra -I../lib/ -pthread LDFLAGS += $(LIBS) -pthread -all: $(TARGETS) +all: $(BUILD_TARGETS) -$(TARGETS): $(LIBS) +$(BUILD_TARGETS): $(LIBS) $(LIBS): make -C $(LIB_DIR) @@ -29,4 +30,4 @@ sbindir ?= /usr/sbin install: all install -d $(DESTDIR)$(sbindir) - install -m 755 -p $(TARGETS) $(DESTDIR)$(sbindir) + install -m 755 -p $(INSTALL_TARGETS) $(DESTDIR)$(sbindir) diff --git a/tools/mm/thpmaps b/tools/mm/thpmaps new file mode 100644 index 000000000000..803e0318f2fe --- /dev/null +++ b/tools/mm/thpmaps @@ -0,0 +1,675 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only +# Copyright (C) 2024 ARM Ltd. +# +# Utility providing smaps-like output detailing transparent hugepage usage. +# For more info, run: +# ./thpmaps --help +# +# Requires numpy: +# pip3 install numpy + + +import argparse +import collections +import math +import os +import re +import resource +import shutil +import sys +import textwrap +import time +import numpy as np + + +with open('/sys/kernel/mm/transparent_hugepage/hpage_pmd_size') as f: + PAGE_SIZE = resource.getpagesize() + PAGE_SHIFT = int(math.log2(PAGE_SIZE)) + PMD_SIZE = int(f.read()) + PMD_ORDER = int(math.log2(PMD_SIZE / PAGE_SIZE)) + + +def align_forward(v, a): + return (v + (a - 1)) & ~(a - 1) + + +def align_offset(v, a): + return v & (a - 1) + + +def kbnr(kb): + # Convert KB to number of pages. + return (kb << 10) >> PAGE_SHIFT + + +def nrkb(nr): + # Convert number of pages to KB. + return (nr << PAGE_SHIFT) >> 10 + + +def odkb(order): + # Convert page order to KB. + return (PAGE_SIZE << order) >> 10 + + +def cont_ranges_all(search, index): + # Given a list of arrays, find the ranges for which values are monotonically + # incrementing in all arrays. all arrays in search and index must be the + # same size. + sz = len(search[0]) + r = np.full(sz, 2) + d = np.diff(search[0]) == 1 + for dd in [np.diff(arr) == 1 for arr in search[1:]]: + d &= dd + r[1:] -= d + r[:-1] -= d + return [np.repeat(arr, r).reshape(-1, 2) for arr in index] + + +class ArgException(Exception): + pass + + +class FileIOException(Exception): + pass + + +class BinArrayFile: + # Base class used to read /proc//pagemap and /proc/kpageflags into a + # numpy array. Use inherrited class in a with clause to ensure file is + # closed when it goes out of scope. + def __init__(self, filename, element_size): + self.element_size = element_size + self.filename = filename + self.fd = os.open(self.filename, os.O_RDONLY) + + def cleanup(self): + os.close(self.fd) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.cleanup() + + def _readin(self, offset, buffer): + length = os.preadv(self.fd, (buffer,), offset) + if len(buffer) != length: + raise FileIOException('error: {} failed to read {} bytes at {:x}' + .format(self.filename, len(buffer), offset)) + + def _toarray(self, buf): + assert(self.element_size == 8) + return np.frombuffer(buf, dtype=np.uint64) + + def getv(self, vec): + vec *= self.element_size + offsets = vec[:, 0] + lengths = (np.diff(vec) + self.element_size).reshape(len(vec)) + buf = bytearray(int(np.sum(lengths))) + view = memoryview(buf) + pos = 0 + for offset, length in zip(offsets, lengths): + offset = int(offset) + length = int(length) + self._readin(offset, view[pos:pos+length]) + pos += length + return self._toarray(buf) + + def get(self, index, nr=1): + offset = index * self.element_size + length = nr * self.element_size + buf = bytearray(length) + self._readin(offset, buf) + return self._toarray(buf) + + +PM_PAGE_PRESENT = 1 << 63 +PM_PFN_MASK = (1 << 55) - 1 + +class PageMap(BinArrayFile): + # Read ranges of a given pid's pagemap into a numpy array. + def __init__(self, pid='self'): + super().__init__(f'/proc/{pid}/pagemap', 8) + + +KPF_ANON = 1 << 12 +KPF_COMPOUND_HEAD = 1 << 15 +KPF_COMPOUND_TAIL = 1 << 16 +KPF_THP = 1 << 22 + +class KPageFlags(BinArrayFile): + # Read ranges of /proc/kpageflags into a numpy array. + def __init__(self): + super().__init__(f'/proc/kpageflags', 8) + + +vma_all_stats = set([ + "Size", + "Rss", + "Pss", + "Pss_Dirty", + "Shared_Clean", + "Shared_Dirty", + "Private_Clean", + "Private_Dirty", + "Referenced", + "Anonymous", + "KSM", + "LazyFree", + "AnonHugePages", + "ShmemPmdMapped", + "FilePmdMapped", + "Shared_Hugetlb", + "Private_Hugetlb", + "Swap", + "SwapPss", + "Locked", +]) + +vma_min_stats = set([ + "Rss", + "Anonymous", + "AnonHugePages", + "ShmemPmdMapped", + "FilePmdMapped", +]) + +VMA = collections.namedtuple('VMA', [ + 'name', + 'start', + 'end', + 'read', + 'write', + 'execute', + 'private', + 'pgoff', + 'major', + 'minor', + 'inode', + 'stats', +]) + +class VMAList: + # A container for VMAs, parsed from /proc//smaps. Iterate over the + # instance to receive VMAs. + def __init__(self, pid='self', stats=[]): + self.vmas = [] + with open(f'/proc/{pid}/smaps', 'r') as file: + for line in file: + elements = line.split() + if '-' in elements[0]: + start, end = map(lambda x: int(x, 16), elements[0].split('-')) + major, minor = map(lambda x: int(x, 16), elements[3].split(':')) + self.vmas.append(VMA( + name=elements[5] if len(elements) == 6 else '', + start=start, + end=end, + read=elements[1][0] == 'r', + write=elements[1][1] == 'w', + execute=elements[1][2] == 'x', + private=elements[1][3] == 'p', + pgoff=int(elements[2], 16), + major=major, + minor=minor, + inode=int(elements[4], 16), + stats={}, + )) + else: + param = elements[0][:-1] + if param in stats: + value = int(elements[1]) + self.vmas[-1].stats[param] = {'type': None, 'value': value} + + def __iter__(self): + yield from self.vmas + + +def thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads): + # Given 4 same-sized arrays representing a range within a page table backed + # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons: + # True if page is anonymous, heads: True if page is head of a THP), return a + # dictionary of statistics describing the mapped THPs. + stats = { + 'file': { + 'partial': 0, + 'aligned': [0] * (PMD_ORDER + 1), + 'unaligned': [0] * (PMD_ORDER + 1), + }, + 'anon': { + 'partial': 0, + 'aligned': [0] * (PMD_ORDER + 1), + 'unaligned': [0] * (PMD_ORDER + 1), + }, + } + + for rindex, rpfn in zip(ranges[0], ranges[2]): + index_next = int(rindex[0]) + index_end = int(rindex[1]) + 1 + pfn_end = int(rpfn[1]) + 1 + + folios = indexes[index_next:index_end][heads[index_next:index_end]] + + # Account pages for any partially mapped THP at the front. In that case, + # the first page of the range is a tail. + nr = (int(folios[0]) if len(folios) else index_end) - index_next + stats['anon' if anons[index_next] else 'file']['partial'] += nr + + # Account pages for any partially mapped THP at the back. In that case, + # the next page after the range is a tail. + if len(folios): + flags = int(kpageflags.get(pfn_end)[0]) + if flags & KPF_COMPOUND_TAIL: + nr = index_end - int(folios[-1]) + folios = folios[:-1] + index_end -= nr + stats['anon' if anons[index_end - 1] else 'file']['partial'] += nr + + # Account fully mapped THPs in the middle of the range. + if len(folios): + folio_nrs = np.append(np.diff(folios), np.uint64(index_end - folios[-1])) + folio_orders = np.log2(folio_nrs).astype(np.uint64) + for index, order in zip(folios, folio_orders): + index = int(index) + order = int(order) + nr = 1 << order + vfn = int(vfns[index]) + align = 'aligned' if align_forward(vfn, nr) == vfn else 'unaligned' + anon = 'anon' if anons[index] else 'file' + stats[anon][align][order] += nr + + # Account PMD-mapped THPs spearately, so filter out of the stats. There is a + # race between acquiring the smaps stats and reading pagemap, where memory + # could be deallocated. So clamp to zero incase it would have gone negative. + anon_pmd_mapped = vma.stats['AnonHugePages']['value'] + file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \ + vma.stats['FilePmdMapped']['value'] + stats['anon']['aligned'][PMD_ORDER] = max(0, stats['anon']['aligned'][PMD_ORDER] - kbnr(anon_pmd_mapped)) + stats['file']['aligned'][PMD_ORDER] = max(0, stats['file']['aligned'][PMD_ORDER] - kbnr(file_pmd_mapped)) + + rstats = { + f"anon-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'anon', 'value': anon_pmd_mapped}, + f"file-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'file', 'value': file_pmd_mapped}, + } + + def flatten_sub(type, subtype, stats): + param = f"{type}-thp-pte-{subtype}-{{}}kB" + for od, nr in enumerate(stats[2:], 2): + rstats[param.format(odkb(od))] = {'type': type, 'value': nrkb(nr)} + + def flatten_type(type, stats): + flatten_sub(type, 'aligned', stats['aligned']) + flatten_sub(type, 'unaligned', stats['unaligned']) + rstats[f"{type}-thp-pte-partial"] = {'type': type, 'value': nrkb(stats['partial'])} + + flatten_type('anon', stats['anon']) + flatten_type('file', stats['file']) + + return rstats + + +def cont_parse(vma, order, ranges, anons, heads): + # Given 4 same-sized arrays representing a range within a page table backed + # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons: + # True if page is anonymous, heads: True if page is head of a THP), return a + # dictionary of statistics describing the contiguous blocks. + nr_cont = 1 << order + nr_anon = 0 + nr_file = 0 + + for rindex, rvfn, rpfn in zip(*ranges): + index_next = int(rindex[0]) + index_end = int(rindex[1]) + 1 + vfn_start = int(rvfn[0]) + pfn_start = int(rpfn[0]) + + if align_offset(pfn_start, nr_cont) != align_offset(vfn_start, nr_cont): + continue + + off = align_forward(vfn_start, nr_cont) - vfn_start + index_next += off + + while index_next + nr_cont <= index_end: + folio_boundary = heads[index_next+1:index_next+nr_cont].any() + if not folio_boundary: + if anons[index_next]: + nr_anon += nr_cont + else: + nr_file += nr_cont + index_next += nr_cont + + # Account blocks that are PMD-mapped spearately, so filter out of the stats. + # There is a race between acquiring the smaps stats and reading pagemap, + # where memory could be deallocated. So clamp to zero incase it would have + # gone negative. + anon_pmd_mapped = vma.stats['AnonHugePages']['value'] + file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \ + vma.stats['FilePmdMapped']['value'] + nr_anon = max(0, nr_anon - kbnr(anon_pmd_mapped)) + nr_file = max(0, nr_file - kbnr(file_pmd_mapped)) + + rstats = { + f"anon-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'anon', 'value': anon_pmd_mapped}, + f"file-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'file', 'value': file_pmd_mapped}, + } + + rstats[f"anon-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'anon', 'value': nrkb(nr_anon)} + rstats[f"file-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'file', 'value': nrkb(nr_file)} + + return rstats + + +def vma_print(vma, pid): + # Prints a VMA instance in a format similar to smaps. The main difference is + # that the pid is included as the first value. + print("{:010d}: {:016x}-{:016x} {}{}{}{} {:08x} {:02x}:{:02x} {:08x} {}" + .format( + pid, vma.start, vma.end, + 'r' if vma.read else '-', 'w' if vma.write else '-', + 'x' if vma.execute else '-', 'p' if vma.private else 's', + vma.pgoff, vma.major, vma.minor, vma.inode, vma.name + )) + + +def stats_print(stats, tot_anon, tot_file, inc_empty): + # Print a statistics dictionary. + label_field = 32 + for label, stat in stats.items(): + type = stat['type'] + value = stat['value'] + if value or inc_empty: + pad = max(0, label_field - len(label) - 1) + if type == 'anon' and tot_anon > 0: + percent = f' ({value / tot_anon:3.0%})' + elif type == 'file' and tot_file > 0: + percent = f' ({value / tot_file:3.0%})' + else: + percent = '' + print(f"{label}:{' ' * pad}{value:8} kB{percent}") + + +def vma_parse(vma, pagemap, kpageflags, contorders): + # Generate thp and cont statistics for a single VMA. + start = vma.start >> PAGE_SHIFT + end = vma.end >> PAGE_SHIFT + + pmes = pagemap.get(start, end - start) + present = pmes & PM_PAGE_PRESENT != 0 + pfns = pmes & PM_PFN_MASK + pfns = pfns[present] + vfns = np.arange(start, end, dtype=np.uint64) + vfns = vfns[present] + + pfn_vec = cont_ranges_all([pfns], [pfns])[0] + flags = kpageflags.getv(pfn_vec) + anons = flags & KPF_ANON != 0 + heads = flags & KPF_COMPOUND_HEAD != 0 + thps = flags & KPF_THP != 0 + + vfns = vfns[thps] + pfns = pfns[thps] + anons = anons[thps] + heads = heads[thps] + + indexes = np.arange(len(vfns), dtype=np.uint64) + ranges = cont_ranges_all([vfns, pfns], [indexes, vfns, pfns]) + + thpstats = thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads) + contstats = [cont_parse(vma, order, ranges, anons, heads) for order in contorders] + + tot_anon = vma.stats['Anonymous']['value'] + tot_file = vma.stats['Rss']['value'] - tot_anon + + return { + **thpstats, + **{k: v for s in contstats for k, v in s.items()} + }, tot_anon, tot_file + + +def do_main(args): + pids = set() + rollup = {} + rollup_anon = 0 + rollup_file = 0 + + if args.cgroup: + strict = False + for walk_info in os.walk(args.cgroup): + cgroup = walk_info[0] + with open(f'{cgroup}/cgroup.procs') as pidfile: + for line in pidfile.readlines(): + pids.add(int(line.strip())) + elif args.pid: + strict = True + pids = pids.union(args.pid) + else: + strict = False + for pid in os.listdir('/proc'): + if pid.isdigit(): + pids.add(int(pid)) + + if not args.rollup: + print(" PID START END PROT OFFSET DEV INODE OBJECT") + + for pid in pids: + try: + with PageMap(pid) as pagemap: + with KPageFlags() as kpageflags: + for vma in VMAList(pid, vma_all_stats if args.inc_smaps else vma_min_stats): + if (vma.read or vma.write or vma.execute) and vma.stats['Rss']['value'] > 0: + stats, vma_anon, vma_file = vma_parse(vma, pagemap, kpageflags, args.cont) + else: + stats = {} + vma_anon = 0 + vma_file = 0 + if args.inc_smaps: + stats = {**vma.stats, **stats} + if args.rollup: + for k, v in stats.items(): + if k in rollup: + assert(rollup[k]['type'] == v['type']) + rollup[k]['value'] += v['value'] + else: + rollup[k] = v + rollup_anon += vma_anon + rollup_file += vma_file + else: + vma_print(vma, pid) + stats_print(stats, vma_anon, vma_file, args.inc_empty) + except (FileNotFoundError, ProcessLookupError, FileIOException): + if strict: + raise + + if args.rollup: + stats_print(rollup, rollup_anon, rollup_file, args.inc_empty) + + +def main(): + docs_width = shutil.get_terminal_size().columns + docs_width -= 2 + docs_width = min(80, docs_width) + + def format(string): + text = re.sub(r'\s+', ' ', string) + text = re.sub(r'\s*\\n\s*', '\n', text) + paras = text.split('\n') + paras = [textwrap.fill(p, width=docs_width) for p in paras] + return '\n'.join(paras) + + def formatter(prog): + return argparse.RawDescriptionHelpFormatter(prog, width=docs_width) + + def size2order(human): + units = { + "K": 2**10, "M": 2**20, "G": 2**30, + "k": 2**10, "m": 2**20, "g": 2**30, + } + unit = 1 + if human[-1] in units: + unit = units[human[-1]] + human = human[:-1] + try: + size = int(human) + except ValueError: + raise ArgException('error: --cont value must be integer size with optional KMG unit') + size *= unit + order = int(math.log2(size / PAGE_SIZE)) + if order < 1: + raise ArgException('error: --cont value must be size of at least 2 pages') + if (1 << order) * PAGE_SIZE != size: + raise ArgException('error: --cont value must be size of power-of-2 pages') + if order > PMD_ORDER: + raise ArgException('error: --cont value must be less than or equal to PMD order') + return order + + parser = argparse.ArgumentParser(formatter_class=formatter, + description=format("""Prints information about how transparent huge + pages are mapped, either system-wide, or for a specified + process or cgroup.\\n + \\n + When run with --pid, the user explicitly specifies the set + of pids to scan. e.g. "--pid 10 [--pid 134 ...]". When run + with --cgroup, the user passes either a v1 or v2 cgroup and + all pids that belong to the cgroup subtree are scanned. When + run with neither --pid nor --cgroup, the full set of pids on + the system is gathered from /proc and scanned as if the user + had provided "--pid 1 --pid 2 ...".\\n + \\n + A default set of statistics is always generated for THP + mappings. However, it is also possible to generate + additional statistics for "contiguous block mappings" where + the block size is user-defined.\\n + \\n + Statistics are maintained independently for anonymous and + file-backed (pagecache) memory and are shown both in kB and + as a percentage of either total anonymous or total + file-backed memory as appropriate.\\n + \\n + THP Statistics\\n + --------------\\n + \\n + Statistics are always generated for fully- and + contiguously-mapped THPs whose mapping address is aligned to + their size, for each supported by the system. + Separate counters describe THPs mapped by PTE vs those + mapped by PMD. (Although note a THP can only be mapped by + PMD if it is PMD-sized):\\n + \\n + - anon-thp-pte-aligned-kB\\n + - file-thp-pte-aligned-kB\\n + - anon-thp-pmd-aligned-kB\\n + - file-thp-pmd-aligned-kB\\n + \\n + Similarly, statistics are always generated for fully- and + contiguously-mapped THPs whose mapping address is *not* + aligned to their size, for each supported by the + system. Due to the unaligned mapping, it is impossible to + map by PMD, so there are only PTE counters for this case:\\n + \\n + - anon-thp-pte-unaligned-kB\\n + - file-thp-pte-unaligned-kB\\n + \\n + Statistics are also always generated for mapped pages that + belong to a THP but where the is THP is *not* fully- and + contiguously- mapped. These "partial" mappings are all + counted in the same counter regardless of the size of the + THP that is partially mapped:\\n + \\n + - anon-thp-pte-partial\\n + - file-thp-pte-partial\\n + \\n + Contiguous Block Statistics\\n + ---------------------------\\n + \\n + An optional, additional set of statistics is generated for + every contiguous block size specified with `--cont `. + These statistics show how much memory is mapped in + contiguous blocks of and also aligned to . A + given contiguous block must all belong to the same THP, but + there is no requirement for it to be the *whole* THP. + Separate counters describe contiguous blocks mapped by PTE + vs those mapped by PMD:\\n + \\n + - anon-cont-pte-aligned-kB\\n + - file-cont-pte-aligned-kB\\n + - anon-cont-pmd-aligned-kB\\n + - file-cont-pmd-aligned-kB\\n + \\n + As an example, if monitoring 64K contiguous blocks (--cont + 64K), there are a number of sources that could provide such + blocks: a fully- and contiguously-mapped 64K THP that is + aligned to a 64K boundary would provide 1 block. A fully- + and contiguously-mapped 128K THP that is aligned to at least + a 64K boundary would provide 2 blocks. Or a 128K THP that + maps its first 100K, but contiguously and starting at a 64K + boundary would provide 1 block. A fully- and + contiguously-mapped 2M THP would provide 32 blocks. There + are many other possible permutations.\\n"""), + epilog=format("""Requires root privilege to access pagemap and + kpageflags.""")) + + group = parser.add_mutually_exclusive_group(required=False) + group.add_argument('--pid', + metavar='pid', required=False, type=int, default=[], action='append', + help="""Process id of the target process. Maybe issued multiple times to + scan multiple processes. --pid and --cgroup are mutually exclusive. + If neither are provided, all processes are scanned to provide + system-wide information.""") + + group.add_argument('--cgroup', + metavar='path', required=False, + help="""Path to the target cgroup in sysfs. Iterates over every pid in + the cgroup and its children. --pid and --cgroup are mutually + exclusive. If neither are provided, all processes are scanned to + provide system-wide information.""") + + parser.add_argument('--rollup', + required=False, default=False, action='store_true', + help="""Sum the per-vma statistics to provide a summary over the whole + system, process or cgroup.""") + + parser.add_argument('--cont', + metavar='size[KMG]', required=False, default=[], action='append', + help="""Adds stats for memory that is mapped in contiguous blocks of + and also aligned to . May be issued multiple times to + track multiple sized blocks. Useful to infer e.g. arm64 contpte and + hpa mappings. Size must be a power-of-2 number of pages.""") + + parser.add_argument('--inc-smaps', + required=False, default=False, action='store_true', + help="""Include all numerical, additive /proc//smaps stats in the + output.""") + + parser.add_argument('--inc-empty', + required=False, default=False, action='store_true', + help="""Show all statistics including those whose value is 0.""") + + parser.add_argument('--periodic', + metavar='sleep_ms', required=False, type=int, + help="""Run in a loop, polling every sleep_ms milliseconds.""") + + args = parser.parse_args() + + try: + args.cont = [size2order(cont) for cont in args.cont] + except ArgException as e: + parser.print_usage() + raise + + if args.periodic: + while True: + do_main(args) + print() + time.sleep(args.periodic / 1000) + else: + do_main(args) + + +if __name__ == "__main__": + try: + main() + except Exception as e: + prog = os.path.basename(sys.argv[0]) + print(f'{prog}: {e}') + exit(1) From 085ff35e76368455c629b194bf3cb62dd82eadf6 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 17 Jan 2024 18:39:54 +0800 Subject: [PATCH 034/435] mm: memory: move mem_cgroup_charge() into alloc_anon_folio() The GFP flags from vma_thp_gfp_mask() according to user configuration only used for large folio allocation but not for memory cgroup charge, and GFP_KERNEL is used for both order-0 and large order folio when memory cgroup charge at present. However, mem_cgroup_charge() uses the GFP flags in a fairly sophisticated way. In addition to checking gfpflags_allow_blocking(), it pays attention to __GFP_NORETRY and __GFP_RETRY_MAYFAIL to ensure that processes within this memcg do not exceed their quotas. So we'd better to move mem_cgroup_charge() into alloc_anon_folio(), 1) it will make us to allocate as much as possible large order folio, because we could try the next order if mem_cgroup_charge() fails, although the memcg's memory usage is close to its limits. 2) using same GFP flags for allocation and charge is to be consistent with PMD THP firstly, in addition, according to GFP flag returned from vma_thp_gfp_mask(), GFP_TRANSHUGE_LIGHT could make us skip direct reclaim, _GFP_NORETRY will make us skip mem_cgroup_oom() and won't trigger memory cgroup oom from large order(order <= COSTLY_ORDER) folio charging. Link: https://lkml.kernel.org/r/20240122011612.501029-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20240117103954.2756050-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memory.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 5e608edfe330..00f3f4fbd131 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4173,8 +4173,8 @@ static bool pte_range_none(pte_t *pte, int nr_pages) static struct folio *alloc_anon_folio(struct vm_fault *vmf) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE struct vm_area_struct *vma = vmf->vma; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long orders; struct folio *folio; unsigned long addr; @@ -4226,15 +4226,21 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); folio = vma_alloc_folio(gfp, order, vma, addr, true); if (folio) { + if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { + folio_put(folio); + goto next; + } + folio_throttle_swaprate(folio, gfp); clear_huge_page(&folio->page, vmf->address, 1 << order); return folio; } +next: order = next_order(&orders, order); } fallback: #endif - return vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address); + return folio_prealloc(vma->vm_mm, vma, vmf->address, true); } /* @@ -4301,10 +4307,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) nr_pages = folio_nr_pages(folio); addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); - if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) - goto oom_free_page; - folio_throttle_swaprate(folio, GFP_KERNEL); - /* * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before @@ -4358,8 +4360,6 @@ unlock: release: folio_put(folio); goto unlock; -oom_free_page: - folio_put(folio); oom: return VM_FAULT_OOM; } From d9b3ce8769e371554a669f262bbc61c02a40efcc Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 18 Jan 2024 18:42:35 +0000 Subject: [PATCH 035/435] mm: writeback: ratelimit stat flush from mem_cgroup_wb_stats One of our workloads (Postgres 14) has regressed when migrated from 5.10 to 6.1 upstream kernel. The regression can be reproduced by sysbench's oltp_write_only benchmark. It seems like the always on rstat flush in mem_cgroup_wb_stats() is causing the regression. So, rate limit that specific rstat flush. One potential consequence would be the dirty throttling might be decided on stale memcg stats. However from our benchmarks and production traffic we have not observed any change in the dirty throttling behavior of the application. Link: https://lkml.kernel.org/r/20240118184235.618164-1-shakeelb@google.com Fixes: 2d146aa3aa84 ("mm: memcontrol: switch to rstat") Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Jan Kara Cc: Jens Axboe Cc: Michal Hocko Cc: Muchun Song Cc: Tejun Heo Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index db92401257f7..df11d6d19ee3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4800,7 +4800,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - mem_cgroup_flush_stats(memcg); + mem_cgroup_flush_stats_ratelimited(memcg); *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); From a9117b4d7f178ea36e8d256f8ab3752839e245b2 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 18 Jan 2024 01:50:57 -0800 Subject: [PATCH 036/435] selftests/memfd: delete unused declarations Commit 32d118ad50a5 ("selftests/memfd: add tests for F_SEAL_EXEC"): - added several unused 'nbytes' local variables Commit 6469b66e3f5a ("selftests: improve vm.memfd_noexec sysctl tests"): - orphaned 'newpid_thread_fn2()' forward declaration - orphaned 'join_newpid_thread()' forward declaration - added unused 'pid' local in sysctl_simple_child() - orphaned 'fd' local in sysctl_simple_child() - added unused 'fd' in sysctl_nested_child() Delete the unused locals and forward declarations. Link: https://lkml.kernel.org/r/20240118095057.677544-1-gthelen@google.com Signed-off-by: Greg Thelen Cc: Aleksa Sarai Cc: Daniel Verkamp Cc: Jeff Xu Cc: Kees Cook Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/memfd_test.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 3df008677239..18f585684e20 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -44,8 +44,6 @@ */ static size_t mfd_def_size = MFD_DEF_SIZE; static const char *memfd_str = MEMFD_STR; -static int newpid_thread_fn2(void *arg); -static void join_newpid_thread(pid_t pid); static ssize_t fd2name(int fd, char *buf, size_t bufsize) { @@ -194,7 +192,6 @@ static unsigned int mfd_assert_get_seals(int fd) static void mfd_assert_has_seals(int fd, unsigned int seals) { char buf[PATH_MAX]; - int nbytes; unsigned int s; fd2name(fd, buf, PATH_MAX); @@ -696,7 +693,6 @@ static void mfd_assert_mode(int fd, int mode) { struct stat st; char buf[PATH_MAX]; - int nbytes; fd2name(fd, buf, PATH_MAX); @@ -715,7 +711,6 @@ static void mfd_assert_mode(int fd, int mode) static void mfd_assert_chmod(int fd, int mode) { char buf[PATH_MAX]; - int nbytes; fd2name(fd, buf, PATH_MAX); @@ -731,7 +726,6 @@ static void mfd_fail_chmod(int fd, int mode) { struct stat st; char buf[PATH_MAX]; - int nbytes; fd2name(fd, buf, PATH_MAX); @@ -1254,9 +1248,6 @@ static void test_sysctl_set_sysctl2(void) static int sysctl_simple_child(void *arg) { - int fd; - int pid; - printf("%s sysctl 0\n", memfd_str); test_sysctl_set_sysctl0(); @@ -1321,7 +1312,6 @@ static void test_sysctl_sysctl2_failset(void) static int sysctl_nested_child(void *arg) { - int fd; int pid; printf("%s nested sysctl 0\n", memfd_str); From 6ca03f1bb5a7427a66df62c954b3500a4255cdb9 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Wed, 17 Jan 2024 14:39:21 -0800 Subject: [PATCH 037/435] userfaultfd: fix return error if mmap_changing is non-zero in MOVE ioctl To be consistent with other uffd ioctl's returning EAGAIN when mmap_changing is detected, we should change UFFDIO_MOVE to do the same. Link: https://lkml.kernel.org/r/20240117223922.1445327-1-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Acked-by: Suren Baghdasaryan Acked-by: Mike Rapoport (IBM) Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: David Hildenbrand Cc: Jann Horn Cc: Kalesh Singh Cc: Matthew Wilcox (Oracle) Cc: Nicolas Geoffray Cc: Peter Xu Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 959551ff9a95..05c8e8a05427 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -2047,7 +2047,7 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx, ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src, uffdio_move.len, uffdio_move.mode); else - ret = -EINVAL; + ret = -EAGAIN; mmap_read_unlock(mm); mmput(mm); From b433ffa8dbacbc5fddd6a110c4a23d0d711a6e74 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 17 Jan 2024 11:00:37 -0700 Subject: [PATCH 038/435] selftests: mm: perform some system cleanup before using hugepages When running with CATEGORY= (thp | hugetlb) we see a large numbers of tests failing. These failures are due to not being able to allocate a hugepage and normally occur on memory contrainted systems or when using large page sizes. drop_cache and compact_memory before the tests for a higher chance at a successful hugepage allocation. Link: https://lkml.kernel.org/r/20240117180037.15734-1-npache@redhat.com Signed-off-by: Nico Pache Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 246d53a5d7f2..040f27e21f47 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -206,6 +206,15 @@ pretty_name() { # Usage: run_test [test binary] [arbitrary test arguments...] run_test() { if test_selected ${CATEGORY}; then + # On memory constrainted systems some tests can fail to allocate hugepages. + # perform some cleanup before the test for a higher success rate. + if [ ${CATEGORY} == "thp" ] | [ ${CATEGORY} == "hugetlb" ]; then + echo 3 > /proc/sys/vm/drop_caches + sleep 2 + echo 1 > /proc/sys/vm/compact_memory + sleep 2 + fi + local test=$(pretty_name "$*") local title="running $*" local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -) From 8689d750006bbd811423dd41ed5efcd8a029862c Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 22 Jan 2024 11:20:00 +0100 Subject: [PATCH 039/435] maple_tree: avoid duplicate variable init in mast_spanning_rebalance() The local variables r_tmp and l_tmp in mast_spanning_rebalance() are already initialized at its declaration; there is no need to assign the value again. Remove the duplicate initialization of {r,l}_tmp. No functional change. Due to common compiler optimizations, also no change to object code. This issue was identified with clang-analyzer's dead stores analysis. Link: https://lkml.kernel.org/r/20240122102000.29558-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 7b161802860b..82fb5195c235 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2271,8 +2271,6 @@ bool mast_spanning_rebalance(struct maple_subtree_state *mast) struct ma_state l_tmp = *mast->orig_l; unsigned char depth = 0; - r_tmp = *mast->orig_r; - l_tmp = *mast->orig_l; do { mas_ascend(mast->orig_r); mas_ascend(mast->orig_l); From 3efbe13e361acfd163fd1a2466e4fb9bed7dc1b0 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 22 Jan 2024 10:25:04 +0100 Subject: [PATCH 040/435] mempolicy: clean up minor dead code in queue_pages_test_walk() Commit 2cafb582173f ("mempolicy: remove confusing MPOL_MF_LAZY dead code") removes MPOL_MF_LAZY handling in queue_pages_test_walk(), and with that, there is no effective use of the local variable endvma in that function remaining. Remove the local variable endvma and its dead code. No functional change. This issue was identified with clang-analyzer's dead stores analysis. Link: https://lkml.kernel.org/r/20240122092504.18377-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/mempolicy.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 10a590ee1c89..5e519163c4dc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -654,7 +654,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, { struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; - unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; /* range check first */ @@ -682,9 +681,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, !(flags & MPOL_MF_STRICT)) return 1; - if (endvma > end) - endvma = end; - /* * Check page nodes, and queue pages to move, in the current vma. * But if no moving, and no strict checking, the scan can be skipped. From bb29fd7760ae39905127afd31fc83294625ff704 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 19 Jan 2024 11:22:22 +0000 Subject: [PATCH 041/435] mm/zswap: make sure each swapfile always have zswap rb-tree Patch series "mm/zswap: optimize the scalability of zswap rb-tree", v2. When testing the zswap performance by using kernel build -j32 in a tmpfs directory, I found the scalability of zswap rb-tree is not good, which is protected by the only spinlock. That would cause heavy lock contention if multiple tasks zswap_store/load concurrently. So a simple solution is to split the only one zswap rb-tree into multiple rb-trees, each corresponds to SWAP_ADDRESS_SPACE_PAGES (64M). This idea is from the commit 4b3ef9daa4fc ("mm/swap: split swap cache into 64MB trunks"). Although this method can't solve the spinlock contention completely, it can mitigate much of that contention. Below is the results of kernel build in tmpfs with zswap shrinker enabled: linux-next zswap-lock-optimize real 1m9.181s 1m3.820s user 17m44.036s 17m40.100s sys 7m37.297s 4m54.622s So there are clearly improvements. And it's complementary with the ongoing zswap xarray conversion by Chris. Anyway, I think we can also merge this first, it's complementary IMHO. So I just refresh and resend this for further discussion. This patch (of 2): Not all zswap interfaces can handle the absence of the zswap rb-tree, actually only zswap_store() has handled it for now. To make things simple, we make sure each swapfile always have the zswap rb-tree prepared before being enabled and used. The preparation is unlikely to fail in practice, this patch just make it explicit. Link: https://lkml.kernel.org/r/20240117-b4-zswap-lock-optimize-v2-0-b5cc55479090@bytedance.com Link: https://lkml.kernel.org/r/20240117-b4-zswap-lock-optimize-v2-1-b5cc55479090@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Nhat Pham Acked-by: Johannes Weiner Acked-by: Yosry Ahmed Cc: Chris Li Signed-off-by: Andrew Morton --- include/linux/zswap.h | 7 +++++-- mm/swapfile.c | 10 +++++++--- mm/zswap.c | 8 +++----- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 0b709f5bc65f..eca388229d9a 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -30,7 +30,7 @@ struct zswap_lruvec_state { bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); -void zswap_swapon(int type); +int zswap_swapon(int type); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); @@ -51,7 +51,10 @@ static inline bool zswap_load(struct folio *folio) } static inline void zswap_invalidate(int type, pgoff_t offset) {} -static inline void zswap_swapon(int type) {} +static inline int zswap_swapon(int type) +{ + return 0; +} static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} diff --git a/mm/swapfile.c b/mm/swapfile.c index 746aa9da5302..b3a83c5dcbb8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2348,8 +2348,6 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { - zswap_swapon(p->type); - spin_lock(&swap_lock); spin_lock(&p->lock); setup_swap_info(p, prio, swap_map, cluster_info); @@ -3167,6 +3165,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error) goto bad_swap_unlock_inode; + error = zswap_swapon(p->type); + if (error) + goto free_swap_address_space; + /* * Flush any pending IO and dirty mappings before we start using this * swap device. @@ -3175,7 +3177,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = inode_drain_writes(inode); if (error) { inode->i_flags &= ~S_SWAPFILE; - goto free_swap_address_space; + goto free_swap_zswap; } mutex_lock(&swapon_mutex); @@ -3199,6 +3201,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = 0; goto out; +free_swap_zswap: + zswap_swapoff(p->type); free_swap_address_space: exit_swap_address_space(p->type); bad_swap_unlock_inode: diff --git a/mm/zswap.c b/mm/zswap.c index 2c3d77c6fe72..5a40a7b4bae8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1518,9 +1518,6 @@ bool zswap_store(struct folio *folio) if (folio_test_large(folio)) return false; - if (!tree) - return false; - /* * If this is a duplicate, it must be removed before attempting to store * it, otherwise, if the store fails the old page won't be removed from @@ -1775,19 +1772,20 @@ void zswap_invalidate(int type, pgoff_t offset) spin_unlock(&tree->lock); } -void zswap_swapon(int type) +int zswap_swapon(int type) { struct zswap_tree *tree; tree = kzalloc(sizeof(*tree), GFP_KERNEL); if (!tree) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); - return; + return -ENOMEM; } tree->rbroot = RB_ROOT; spin_lock_init(&tree->lock); zswap_trees[type] = tree; + return 0; } void zswap_swapoff(int type) From 44c7c734a5132fc02f5584c7207f1d0c483f3ccd Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 19 Jan 2024 11:22:23 +0000 Subject: [PATCH 042/435] mm/zswap: split zswap rb-tree Each swapfile has one rb-tree to search the mapping of swp_entry_t to zswap_entry, that use a spinlock to protect, which can cause heavy lock contention if multiple tasks zswap_store/load concurrently. Optimize the scalability problem by splitting the zswap rb-tree into multiple rb-trees, each corresponds to SWAP_ADDRESS_SPACE_PAGES (64M), just like we did in the swap cache address_space splitting. Although this method can't solve the spinlock contention completely, it can mitigate much of that contention. Below is the results of kernel build in tmpfs with zswap shrinker enabled: linux-next zswap-lock-optimize real 1m9.181s 1m3.820s user 17m44.036s 17m40.100s sys 7m37.297s 4m54.622s So there are clearly improvements. Link: https://lkml.kernel.org/r/20240117-b4-zswap-lock-optimize-v2-2-b5cc55479090@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Acked-by: Nhat Pham Acked-by: Yosry Ahmed Cc: Chris Li Signed-off-by: Andrew Morton --- include/linux/zswap.h | 4 +-- mm/swapfile.c | 2 +- mm/zswap.c | 71 ++++++++++++++++++++++++++++--------------- 3 files changed, 49 insertions(+), 28 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index eca388229d9a..91895ce1fdbc 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -30,7 +30,7 @@ struct zswap_lruvec_state { bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); -int zswap_swapon(int type); +int zswap_swapon(int type, unsigned long nr_pages); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); @@ -51,7 +51,7 @@ static inline bool zswap_load(struct folio *folio) } static inline void zswap_invalidate(int type, pgoff_t offset) {} -static inline int zswap_swapon(int type) +static inline int zswap_swapon(int type, unsigned long nr_pages) { return 0; } diff --git a/mm/swapfile.c b/mm/swapfile.c index b3a83c5dcbb8..0c6dde8b8604 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3165,7 +3165,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error) goto bad_swap_unlock_inode; - error = zswap_swapon(p->type); + error = zswap_swapon(p->type, maxpages); if (error) goto free_swap_address_space; diff --git a/mm/zswap.c b/mm/zswap.c index 5a40a7b4bae8..464179d43399 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -239,6 +239,7 @@ struct zswap_tree { }; static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; +static unsigned int nr_zswap_trees[MAX_SWAPFILES]; /* RCU-protected iteration */ static LIST_HEAD(zswap_pools); @@ -265,6 +266,12 @@ static bool zswap_has_pool; * helpers and fwd declarations **********************************/ +static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) +{ + return &zswap_trees[swp_type(swp)][swp_offset(swp) + >> SWAP_ADDRESS_SPACE_SHIFT]; +} + #define zswap_pool_debug(msg, p) \ pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ zpool_get_type((p)->zpools[0])) @@ -864,7 +871,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o * until the entry is verified to still be alive in the tree. */ swpoffset = swp_offset(entry->swpentry); - tree = zswap_trees[swp_type(entry->swpentry)]; + tree = swap_zswap_tree(entry->swpentry); list_lru_isolate(l, item); /* * It's safe to drop the lock here because we return either @@ -1493,10 +1500,9 @@ static void zswap_fill_page(void *ptr, unsigned long value) bool zswap_store(struct folio *folio) { swp_entry_t swp = folio->swap; - int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; - struct zswap_tree *tree = zswap_trees[type]; + struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry, *dupentry; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; @@ -1569,7 +1575,7 @@ bool zswap_store(struct folio *folio) src = kmap_local_page(page); if (zswap_is_page_same_filled(src, &value)) { kunmap_local(src); - entry->swpentry = swp_entry(type, offset); + entry->swpentry = swp; entry->length = 0; entry->value = value; atomic_inc(&zswap_same_filled_pages); @@ -1651,7 +1657,7 @@ bool zswap_store(struct folio *folio) mutex_unlock(&acomp_ctx->mutex); /* populate entry */ - entry->swpentry = swp_entry(type, offset); + entry->swpentry = swp; entry->handle = handle; entry->length = dlen; @@ -1711,10 +1717,9 @@ shrink: bool zswap_load(struct folio *folio) { swp_entry_t swp = folio->swap; - int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; - struct zswap_tree *tree = zswap_trees[type]; + struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry; u8 *dst; @@ -1757,7 +1762,7 @@ bool zswap_load(struct folio *folio) void zswap_invalidate(int type, pgoff_t offset) { - struct zswap_tree *tree = zswap_trees[type]; + struct zswap_tree *tree = swap_zswap_tree(swp_entry(type, offset)); struct zswap_entry *entry; /* find */ @@ -1772,37 +1777,53 @@ void zswap_invalidate(int type, pgoff_t offset) spin_unlock(&tree->lock); } -int zswap_swapon(int type) +int zswap_swapon(int type, unsigned long nr_pages) { - struct zswap_tree *tree; + struct zswap_tree *trees, *tree; + unsigned int nr, i; - tree = kzalloc(sizeof(*tree), GFP_KERNEL); - if (!tree) { + nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); + trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL); + if (!trees) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); return -ENOMEM; } - tree->rbroot = RB_ROOT; - spin_lock_init(&tree->lock); - zswap_trees[type] = tree; + for (i = 0; i < nr; i++) { + tree = trees + i; + tree->rbroot = RB_ROOT; + spin_lock_init(&tree->lock); + } + + nr_zswap_trees[type] = nr; + zswap_trees[type] = trees; return 0; } void zswap_swapoff(int type) { - struct zswap_tree *tree = zswap_trees[type]; - struct zswap_entry *entry, *n; + struct zswap_tree *trees = zswap_trees[type]; + unsigned int i; - if (!tree) + if (!trees) return; - /* walk the tree and free everything */ - spin_lock(&tree->lock); - rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) - zswap_free_entry(entry); - tree->rbroot = RB_ROOT; - spin_unlock(&tree->lock); - kfree(tree); + for (i = 0; i < nr_zswap_trees[type]; i++) { + struct zswap_tree *tree = trees + i; + struct zswap_entry *entry, *n; + + /* walk the tree and free everything */ + spin_lock(&tree->lock); + rbtree_postorder_for_each_entry_safe(entry, n, + &tree->rbroot, + rbnode) + zswap_free_entry(entry); + tree->rbroot = RB_ROOT; + spin_unlock(&tree->lock); + } + + kvfree(trees); + nr_zswap_trees[type] = 0; zswap_trees[type] = NULL; } From 64cf264c8fefd013208904e2d1b5ba4ee26ca079 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 24 Jan 2024 04:51:11 +0000 Subject: [PATCH 043/435] mm: swap: enforce updating inuse_pages at the end of swap_range_free() Patch series "mm: zswap: simplify zswap_swapoff()", v2. These patches aim to simplify zswap_swapoff() by removing the unnecessary trees cleanup code. Patch 1 makes sure that the order of operations during swapoff is enforced correctly, making sure the simplification in patch 2 is correct in a future-proof manner. This patch (of 2): In swap_range_free(), we update inuse_pages then do some cleanups (arch invalidation, zswap invalidation, swap cache cleanups, etc). During swapoff, try_to_unuse() checks that inuse_pages is 0 to make sure all swap entries are freed. Make sure we only update inuse_pages after we are done with the cleanups in swap_range_free(), and use the proper memory barriers to enforce it. This makes sure that code following try_to_unuse() can safely assume that swap_range_free() ran for all entries in thr swapfile (e.g. swap cache cleanup, zswap_swapoff()). In practice, this currently isn't a problem because swap_range_free() is called with the swap info lock held, and the swapoff code happens to spin for that after try_to_unuse(). However, this seems fragile and unintentional, so make it more relable and future-proof. This also facilitates a following simplification of zswap_swapoff(). Link: https://lkml.kernel.org/r/20240124045113.415378-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20240124045113.415378-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: "Huang, Ying" Cc: Chengming Zhou Cc: Chris Li Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/swapfile.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 0c6dde8b8604..a8edaf4e5b8a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -737,8 +737,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, if (was_full && (si->flags & SWP_WRITEOK)) add_to_avail_list(si); } - atomic_long_add(nr_entries, &nr_swap_pages); - WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); if (si->flags & SWP_BLKDEV) swap_slot_free_notify = si->bdev->bd_disk->fops->swap_slot_free_notify; @@ -752,6 +750,14 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, offset++; } clear_shadow_from_swap_cache(si->type, begin, end); + + /* + * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 + * only after the above cleanups are done. + */ + smp_wmb(); + atomic_long_add(nr_entries, &nr_swap_pages); + WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); } static void set_cluster_next(struct swap_info_struct *si, unsigned long next) @@ -2049,7 +2055,7 @@ static int try_to_unuse(unsigned int type) unsigned int i; if (!READ_ONCE(si->inuse_pages)) - return 0; + goto success; retry: retval = shmem_unuse(type); @@ -2130,6 +2136,12 @@ retry: return -EINTR; } +success: + /* + * Make sure that further cleanups after try_to_unuse() returns happen + * after swap_range_free() reduces si->inuse_pages to 0. + */ + smp_mb(); return 0; } From 83e68f25decdc8d133dbfedd18b93f5267430049 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 24 Jan 2024 04:51:12 +0000 Subject: [PATCH 044/435] mm: zswap: remove unnecessary trees cleanups in zswap_swapoff() During swapoff, try_to_unuse() makes sure that zswap_invalidate() is called for all swap entries before zswap_swapoff() is called. This means that all zswap entries should already be removed from the tree. Simplify zswap_swapoff() by removing the trees cleanup code, and leave an assertion in its place. Link: https://lkml.kernel.org/r/20240124045113.415378-3-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Chengming Zhou Cc: Chris Li Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/zswap.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 464179d43399..0e4a869b6fd8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1808,19 +1808,9 @@ void zswap_swapoff(int type) if (!trees) return; - for (i = 0; i < nr_zswap_trees[type]; i++) { - struct zswap_tree *tree = trees + i; - struct zswap_entry *entry, *n; - - /* walk the tree and free everything */ - spin_lock(&tree->lock); - rbtree_postorder_for_each_entry_safe(entry, n, - &tree->rbroot, - rbnode) - zswap_free_entry(entry); - tree->rbroot = RB_ROOT; - spin_unlock(&tree->lock); - } + /* try_to_unuse() invalidated all the entries already */ + for (i = 0; i < nr_zswap_trees[type]; i++) + WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot)); kvfree(trees); nr_zswap_trees[type] = 0; From 412c6ef98624129dda6395ca691391b1da16d834 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Wed, 24 Jan 2024 11:57:19 +0800 Subject: [PATCH 045/435] mm/mmap: introduce vma_set_range() There is a lot of code needs to set the range of vma in mmap.c, introduce vma_set_range() to simplify the code. Link: https://lkml.kernel.org/r/20240124035719.3685193-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/internal.h | 9 +++++++++ mm/mmap.c | 29 +++++++---------------------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index f309a010d50f..1e29c5821a1d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1114,6 +1114,15 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma, extern bool mirrored_kernelcore; extern bool memblock_has_mirror(void); +static __always_inline void vma_set_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + pgoff_t pgoff) +{ + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; +} + static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) { /* diff --git a/mm/mmap.c b/mm/mmap.c index 66f534ec90a5..476de5daf598 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -663,9 +663,7 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, 0); - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; + vma_set_range(vma, start, end, pgoff); vma_iter_store(vmi, vma); vma_complete(&vp, vmi, vma->vm_mm); @@ -708,9 +706,7 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_adjust_trans_huge(vma, start, end, 0); vma_iter_clear(vmi); - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; + vma_set_range(vma, start, end, pgoff); vma_complete(&vp, vmi, vma->vm_mm); return 0; } @@ -1015,10 +1011,7 @@ static struct vm_area_struct vma_prepare(&vp); vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); - - vma->vm_start = vma_start; - vma->vm_end = vma_end; - vma->vm_pgoff = vma_pgoff; + vma_set_range(vma, vma_start, vma_end, vma_pgoff); if (vma_expanded) vma_iter_store(vmi, vma); @@ -2811,11 +2804,9 @@ cannot_expand: } vma_iter_config(&vmi, addr, end); - vma->vm_start = addr; - vma->vm_end = end; + vma_set_range(vma, addr, end, pgoff); vm_flags_init(vma, vm_flags); vma->vm_page_prot = vm_get_page_prot(vm_flags); - vma->vm_pgoff = pgoff; if (file) { vma->vm_file = get_file(file); @@ -3165,9 +3156,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, goto unacct_fail; vma_set_anonymous(vma); - vma->vm_start = addr; - vma->vm_end = addr + len; - vma->vm_pgoff = addr >> PAGE_SHIFT; + vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); vm_flags_init(vma, flags); vma->vm_page_prot = vm_get_page_prot(flags); vma_start_write(vma); @@ -3404,9 +3393,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma = vm_area_dup(vma); if (!new_vma) goto out; - new_vma->vm_start = addr; - new_vma->vm_end = addr + len; - new_vma->vm_pgoff = pgoff; + vma_set_range(new_vma, addr, addr + len, pgoff); if (vma_dup_policy(vma, new_vma)) goto out_free_vma; if (anon_vma_clone(new_vma, vma)) @@ -3574,9 +3561,7 @@ static struct vm_area_struct *__install_special_mapping( if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); - vma->vm_start = addr; - vma->vm_end = addr + len; - + vma_set_range(vma, addr, addr + len, 0); vm_flags_init(vma, (vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); From db128f5fdee9d9298c19d7137add7f7af840d9f7 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 25 Jan 2024 08:14:23 +0000 Subject: [PATCH 046/435] mm: zswap: remove unused tree argument in zswap_entry_put() Commit 7310895779624 ("mm: zswap: tighten up entry invalidation") removed the usage of tree argument, delete it. Link: https://lkml.kernel.org/r/20240125081423.1200336-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Signed-off-by: Andrew Morton --- mm/zswap.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 0e4a869b6fd8..bccef2af43cc 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -568,8 +568,7 @@ static void zswap_entry_get(struct zswap_entry *entry) /* caller must hold the tree lock * remove from the tree and free it, if nobody reference the entry */ -static void zswap_entry_put(struct zswap_tree *tree, - struct zswap_entry *entry) +static void zswap_entry_put(struct zswap_entry *entry) { int refcount = --entry->refcount; @@ -852,7 +851,7 @@ static void zswap_invalidate_entry(struct zswap_tree *tree, struct zswap_entry *entry) { if (zswap_rb_erase(&tree->rbroot, entry)) - zswap_entry_put(tree, entry); + zswap_entry_put(entry); } static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, @@ -921,7 +920,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o put_unlock: /* Drop local reference */ - zswap_entry_put(tree, entry); + zswap_entry_put(entry); unlock: spin_unlock(&tree->lock); spin_lock(lock); @@ -1754,7 +1753,7 @@ bool zswap_load(struct folio *folio) zswap_lru_del(&entry->pool->list_lru, entry); zswap_lru_add(&entry->pool->list_lru, entry); } - zswap_entry_put(tree, entry); + zswap_entry_put(entry); spin_unlock(&tree->lock); return true; From c05ae9d85b47211edb187b854b62ec68fb2a1e93 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 24 Jan 2024 12:03:46 -0800 Subject: [PATCH 047/435] dax/bus.c: replace driver-core lock usage by a local rwsem Patch series "Add DAX ABI for memmap_on_memory", v7. This series adds sysfs ABI to control memmap_on_memory behavior for DAX devices. Patch 1 replaces incorrect device_lock() usage with a local rwsem - this was identified during review. Patch 2 is also a preparatory patch that replaces sprintf() for sysfs operations with sysfs_emit() Patch 3 adds the missing documentation for the sysfs ABI for DAX regions and Dax devices. Patch 4 exports mhp_supports_memmap_on_memory(). Patch 5 adds the new ABI for toggling memmap_on_memory semantics for dax devices. This patch (of 5): The dax driver incorrectly used driver-core device locks to protect internal dax region and dax device configuration structures. Replace the device lock usage with a local rwsem, one each for dax region configuration and dax device configuration. As a result of this conversion, no device_lock() usage remains in dax/bus.c. Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-0-20d16cb8d23d@intel.com Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-1-20d16cb8d23d@intel.com Signed-off-by: Vishal Verma Reported-by: Greg Kroah-Hartman Reviewed-by: Alison Schofield Cc: Dan Williams Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Huang Ying Cc: Jonathan Cameron Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton --- drivers/dax/bus.c | 220 +++++++++++++++++++++++++++++++++------------- 1 file changed, 157 insertions(+), 63 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 1ff1ab5fa105..cb148f74ceda 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -12,6 +12,18 @@ static DEFINE_MUTEX(dax_bus_lock); +/* + * All changes to the dax region configuration occur with this lock held + * for write. + */ +DECLARE_RWSEM(dax_region_rwsem); + +/* + * All changes to the dax device configuration occur with this lock held + * for write. + */ +DECLARE_RWSEM(dax_dev_rwsem); + #define DAX_NAME_LEN 30 struct dax_id { struct list_head list; @@ -180,7 +192,7 @@ static u64 dev_dax_size(struct dev_dax *dev_dax) u64 size = 0; int i; - device_lock_assert(&dev_dax->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_dev_rwsem)); for (i = 0; i < dev_dax->nr_range; i++) size += range_len(&dev_dax->ranges[i].range); @@ -194,8 +206,15 @@ static int dax_bus_probe(struct device *dev) struct dev_dax *dev_dax = to_dev_dax(dev); struct dax_region *dax_region = dev_dax->region; int rc; + u64 size; - if (dev_dax_size(dev_dax) == 0 || dev_dax->id < 0) + rc = down_read_interruptible(&dax_dev_rwsem); + if (rc) + return rc; + size = dev_dax_size(dev_dax); + up_read(&dax_dev_rwsem); + + if (size == 0 || dev_dax->id < 0) return -ENXIO; rc = dax_drv->probe(dev_dax); @@ -283,7 +302,7 @@ static unsigned long long dax_region_avail_size(struct dax_region *dax_region) resource_size_t size = resource_size(&dax_region->res); struct resource *res; - device_lock_assert(dax_region->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem)); for_each_dax_region_resource(dax_region, res) size -= resource_size(res); @@ -295,10 +314,13 @@ static ssize_t available_size_show(struct device *dev, { struct dax_region *dax_region = dev_get_drvdata(dev); unsigned long long size; + int rc; - device_lock(dev); + rc = down_read_interruptible(&dax_region_rwsem); + if (rc) + return rc; size = dax_region_avail_size(dax_region); - device_unlock(dev); + up_read(&dax_region_rwsem); return sprintf(buf, "%llu\n", size); } @@ -314,10 +336,12 @@ static ssize_t seed_show(struct device *dev, if (is_static(dax_region)) return -EINVAL; - device_lock(dev); + rc = down_read_interruptible(&dax_region_rwsem); + if (rc) + return rc; seed = dax_region->seed; rc = sprintf(buf, "%s\n", seed ? dev_name(seed) : ""); - device_unlock(dev); + up_read(&dax_region_rwsem); return rc; } @@ -333,14 +357,18 @@ static ssize_t create_show(struct device *dev, if (is_static(dax_region)) return -EINVAL; - device_lock(dev); + rc = down_read_interruptible(&dax_region_rwsem); + if (rc) + return rc; youngest = dax_region->youngest; rc = sprintf(buf, "%s\n", youngest ? dev_name(youngest) : ""); - device_unlock(dev); + up_read(&dax_region_rwsem); return rc; } +static struct dev_dax *__devm_create_dev_dax(struct dev_dax_data *data); + static ssize_t create_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -358,7 +386,9 @@ static ssize_t create_store(struct device *dev, struct device_attribute *attr, if (val != 1) return -EINVAL; - device_lock(dev); + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return rc; avail = dax_region_avail_size(dax_region); if (avail == 0) rc = -ENOSPC; @@ -369,7 +399,7 @@ static ssize_t create_store(struct device *dev, struct device_attribute *attr, .id = -1, .memmap_on_memory = false, }; - struct dev_dax *dev_dax = devm_create_dev_dax(&data); + struct dev_dax *dev_dax = __devm_create_dev_dax(&data); if (IS_ERR(dev_dax)) rc = PTR_ERR(dev_dax); @@ -387,7 +417,7 @@ static ssize_t create_store(struct device *dev, struct device_attribute *attr, rc = len; } } - device_unlock(dev); + up_write(&dax_region_rwsem); return rc; } @@ -417,7 +447,7 @@ static void trim_dev_dax_range(struct dev_dax *dev_dax) struct range *range = &dev_dax->ranges[i].range; struct dax_region *dax_region = dev_dax->region; - device_lock_assert(dax_region->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem)); dev_dbg(&dev_dax->dev, "delete range[%d]: %#llx:%#llx\n", i, (unsigned long long)range->start, (unsigned long long)range->end); @@ -435,7 +465,7 @@ static void free_dev_dax_ranges(struct dev_dax *dev_dax) trim_dev_dax_range(dev_dax); } -static void unregister_dev_dax(void *dev) +static void __unregister_dev_dax(void *dev) { struct dev_dax *dev_dax = to_dev_dax(dev); @@ -447,6 +477,17 @@ static void unregister_dev_dax(void *dev) put_device(dev); } +static void unregister_dev_dax(void *dev) +{ + if (rwsem_is_locked(&dax_region_rwsem)) + return __unregister_dev_dax(dev); + + if (WARN_ON_ONCE(down_write_killable(&dax_region_rwsem) != 0)) + return; + __unregister_dev_dax(dev); + up_write(&dax_region_rwsem); +} + static void dax_region_free(struct kref *kref) { struct dax_region *dax_region; @@ -463,11 +504,10 @@ static void dax_region_put(struct dax_region *dax_region) /* a return value >= 0 indicates this invocation invalidated the id */ static int __free_dev_dax_id(struct dev_dax *dev_dax) { - struct device *dev = &dev_dax->dev; struct dax_region *dax_region; int rc = dev_dax->id; - device_lock_assert(dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_dev_rwsem)); if (!dev_dax->dyn_id || dev_dax->id < 0) return -1; @@ -480,12 +520,13 @@ static int __free_dev_dax_id(struct dev_dax *dev_dax) static int free_dev_dax_id(struct dev_dax *dev_dax) { - struct device *dev = &dev_dax->dev; int rc; - device_lock(dev); + rc = down_write_killable(&dax_dev_rwsem); + if (rc) + return rc; rc = __free_dev_dax_id(dev_dax); - device_unlock(dev); + up_write(&dax_dev_rwsem); return rc; } @@ -519,8 +560,14 @@ static ssize_t delete_store(struct device *dev, struct device_attribute *attr, if (!victim) return -ENXIO; - device_lock(dev); - device_lock(victim); + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return rc; + rc = down_write_killable(&dax_dev_rwsem); + if (rc) { + up_write(&dax_region_rwsem); + return rc; + } dev_dax = to_dev_dax(victim); if (victim->driver || dev_dax_size(dev_dax)) rc = -EBUSY; @@ -541,12 +588,12 @@ static ssize_t delete_store(struct device *dev, struct device_attribute *attr, } else rc = -EBUSY; } - device_unlock(victim); + up_write(&dax_dev_rwsem); /* won the race to invalidate the device, clean it up */ if (do_del) devm_release_action(dev, unregister_dev_dax, victim); - device_unlock(dev); + up_write(&dax_region_rwsem); put_device(victim); return rc; @@ -658,16 +705,15 @@ static void dax_mapping_release(struct device *dev) put_device(parent); } -static void unregister_dax_mapping(void *data) +static void __unregister_dax_mapping(void *data) { struct device *dev = data; struct dax_mapping *mapping = to_dax_mapping(dev); struct dev_dax *dev_dax = to_dev_dax(dev->parent); - struct dax_region *dax_region = dev_dax->region; dev_dbg(dev, "%s\n", __func__); - device_lock_assert(dax_region->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem)); dev_dax->ranges[mapping->range_id].mapping = NULL; mapping->range_id = -1; @@ -675,28 +721,37 @@ static void unregister_dax_mapping(void *data) device_unregister(dev); } +static void unregister_dax_mapping(void *data) +{ + if (rwsem_is_locked(&dax_region_rwsem)) + return __unregister_dax_mapping(data); + + if (WARN_ON_ONCE(down_write_killable(&dax_region_rwsem) != 0)) + return; + __unregister_dax_mapping(data); + up_write(&dax_region_rwsem); +} + static struct dev_dax_range *get_dax_range(struct device *dev) { struct dax_mapping *mapping = to_dax_mapping(dev); struct dev_dax *dev_dax = to_dev_dax(dev->parent); - struct dax_region *dax_region = dev_dax->region; + int rc; - device_lock(dax_region->dev); + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return NULL; if (mapping->range_id < 0) { - device_unlock(dax_region->dev); + up_write(&dax_region_rwsem); return NULL; } return &dev_dax->ranges[mapping->range_id]; } -static void put_dax_range(struct dev_dax_range *dax_range) +static void put_dax_range(void) { - struct dax_mapping *mapping = dax_range->mapping; - struct dev_dax *dev_dax = to_dev_dax(mapping->dev.parent); - struct dax_region *dax_region = dev_dax->region; - - device_unlock(dax_region->dev); + up_write(&dax_region_rwsem); } static ssize_t start_show(struct device *dev, @@ -709,7 +764,7 @@ static ssize_t start_show(struct device *dev, if (!dax_range) return -ENXIO; rc = sprintf(buf, "%#llx\n", dax_range->range.start); - put_dax_range(dax_range); + put_dax_range(); return rc; } @@ -725,7 +780,7 @@ static ssize_t end_show(struct device *dev, if (!dax_range) return -ENXIO; rc = sprintf(buf, "%#llx\n", dax_range->range.end); - put_dax_range(dax_range); + put_dax_range(); return rc; } @@ -741,7 +796,7 @@ static ssize_t pgoff_show(struct device *dev, if (!dax_range) return -ENXIO; rc = sprintf(buf, "%#lx\n", dax_range->pgoff); - put_dax_range(dax_range); + put_dax_range(); return rc; } @@ -775,7 +830,7 @@ static int devm_register_dax_mapping(struct dev_dax *dev_dax, int range_id) struct device *dev; int rc; - device_lock_assert(dax_region->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem)); if (dev_WARN_ONCE(&dev_dax->dev, !dax_region->dev->driver, "region disabled\n")) @@ -821,7 +876,7 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start, struct resource *alloc; int i, rc; - device_lock_assert(dax_region->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem)); /* handle the seed alloc special case */ if (!size) { @@ -875,13 +930,12 @@ static int adjust_dev_dax_range(struct dev_dax *dev_dax, struct resource *res, r { int last_range = dev_dax->nr_range - 1; struct dev_dax_range *dax_range = &dev_dax->ranges[last_range]; - struct dax_region *dax_region = dev_dax->region; bool is_shrink = resource_size(res) > size; struct range *range = &dax_range->range; struct device *dev = &dev_dax->dev; int rc; - device_lock_assert(dax_region->dev); + WARN_ON_ONCE(!rwsem_is_locked(&dax_region_rwsem)); if (dev_WARN_ONCE(dev, !size, "deletion is handled by dev_dax_shrink\n")) return -EINVAL; @@ -907,10 +961,13 @@ static ssize_t size_show(struct device *dev, { struct dev_dax *dev_dax = to_dev_dax(dev); unsigned long long size; + int rc; - device_lock(dev); + rc = down_write_killable(&dax_dev_rwsem); + if (rc) + return rc; size = dev_dax_size(dev_dax); - device_unlock(dev); + up_write(&dax_dev_rwsem); return sprintf(buf, "%llu\n", size); } @@ -1080,17 +1137,27 @@ static ssize_t size_store(struct device *dev, struct device_attribute *attr, return -EINVAL; } - device_lock(dax_region->dev); + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return rc; if (!dax_region->dev->driver) { - device_unlock(dax_region->dev); - return -ENXIO; + rc = -ENXIO; + goto err_region; } - device_lock(dev); - rc = dev_dax_resize(dax_region, dev_dax, val); - device_unlock(dev); - device_unlock(dax_region->dev); + rc = down_write_killable(&dax_dev_rwsem); + if (rc) + goto err_dev; - return rc == 0 ? len : rc; + rc = dev_dax_resize(dax_region, dev_dax, val); + +err_dev: + up_write(&dax_dev_rwsem); +err_region: + up_write(&dax_region_rwsem); + + if (rc == 0) + return len; + return rc; } static DEVICE_ATTR_RW(size); @@ -1138,18 +1205,24 @@ static ssize_t mapping_store(struct device *dev, struct device_attribute *attr, return rc; rc = -ENXIO; - device_lock(dax_region->dev); + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return rc; if (!dax_region->dev->driver) { - device_unlock(dax_region->dev); + up_write(&dax_region_rwsem); + return rc; + } + rc = down_write_killable(&dax_dev_rwsem); + if (rc) { + up_write(&dax_region_rwsem); return rc; } - device_lock(dev); to_alloc = range_len(&r); if (alloc_is_aligned(dev_dax, to_alloc)) rc = alloc_dev_dax_range(dev_dax, r.start, to_alloc); - device_unlock(dev); - device_unlock(dax_region->dev); + up_write(&dax_dev_rwsem); + up_write(&dax_region_rwsem); return rc == 0 ? len : rc; } @@ -1196,13 +1269,19 @@ static ssize_t align_store(struct device *dev, struct device_attribute *attr, if (!dax_align_valid(val)) return -EINVAL; - device_lock(dax_region->dev); + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return rc; if (!dax_region->dev->driver) { - device_unlock(dax_region->dev); + up_write(&dax_region_rwsem); return -ENXIO; } - device_lock(dev); + rc = down_write_killable(&dax_dev_rwsem); + if (rc) { + up_write(&dax_region_rwsem); + return rc; + } if (dev->driver) { rc = -EBUSY; goto out_unlock; @@ -1214,8 +1293,8 @@ static ssize_t align_store(struct device *dev, struct device_attribute *attr, if (rc) dev_dax->align = align_save; out_unlock: - device_unlock(dev); - device_unlock(dax_region->dev); + up_write(&dax_dev_rwsem); + up_write(&dax_region_rwsem); return rc == 0 ? len : rc; } static DEVICE_ATTR_RW(align); @@ -1325,7 +1404,7 @@ static const struct device_type dev_dax_type = { .groups = dax_attribute_groups, }; -struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data) +static struct dev_dax *__devm_create_dev_dax(struct dev_dax_data *data) { struct dax_region *dax_region = data->dax_region; struct device *parent = dax_region->dev; @@ -1440,6 +1519,21 @@ err_id: return ERR_PTR(rc); } + +struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data) +{ + struct dev_dax *dev_dax; + int rc; + + rc = down_write_killable(&dax_region_rwsem); + if (rc) + return ERR_PTR(rc); + + dev_dax = __devm_create_dev_dax(data); + up_write(&dax_region_rwsem); + + return dev_dax; +} EXPORT_SYMBOL_GPL(devm_create_dev_dax); int __dax_driver_register(struct dax_device_driver *dax_drv, From 6ebed0007f082cd80a63a8849c54e7cad0069469 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 24 Jan 2024 12:03:47 -0800 Subject: [PATCH 048/435] dax/bus.c: replace several sprintf() with sysfs_emit() There were several places where drivers/dax/bus.c uses 'sprintf' to print sysfs data. Since a sysfs_emit() helper is available specifically for this purpose, replace all the sprintf() usage for sysfs with sysfs_emit() in this file. Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-2-20d16cb8d23d@intel.com Signed-off-by: Vishal Verma Reported-by: Greg Kroah-Hartman Reviewed-by: Alison Schofield Cc: Dan Williams Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Huang Ying Cc: Jonathan Cameron Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton --- drivers/dax/bus.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index cb148f74ceda..0fd948a4443e 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -269,7 +269,7 @@ static ssize_t id_show(struct device *dev, { struct dax_region *dax_region = dev_get_drvdata(dev); - return sprintf(buf, "%d\n", dax_region->id); + return sysfs_emit(buf, "%d\n", dax_region->id); } static DEVICE_ATTR_RO(id); @@ -278,8 +278,8 @@ static ssize_t region_size_show(struct device *dev, { struct dax_region *dax_region = dev_get_drvdata(dev); - return sprintf(buf, "%llu\n", (unsigned long long) - resource_size(&dax_region->res)); + return sysfs_emit(buf, "%llu\n", + (unsigned long long)resource_size(&dax_region->res)); } static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, region_size_show, NULL); @@ -289,7 +289,7 @@ static ssize_t region_align_show(struct device *dev, { struct dax_region *dax_region = dev_get_drvdata(dev); - return sprintf(buf, "%u\n", dax_region->align); + return sysfs_emit(buf, "%u\n", dax_region->align); } static struct device_attribute dev_attr_region_align = __ATTR(align, 0400, region_align_show, NULL); @@ -322,7 +322,7 @@ static ssize_t available_size_show(struct device *dev, size = dax_region_avail_size(dax_region); up_read(&dax_region_rwsem); - return sprintf(buf, "%llu\n", size); + return sysfs_emit(buf, "%llu\n", size); } static DEVICE_ATTR_RO(available_size); @@ -340,7 +340,7 @@ static ssize_t seed_show(struct device *dev, if (rc) return rc; seed = dax_region->seed; - rc = sprintf(buf, "%s\n", seed ? dev_name(seed) : ""); + rc = sysfs_emit(buf, "%s\n", seed ? dev_name(seed) : ""); up_read(&dax_region_rwsem); return rc; @@ -361,7 +361,7 @@ static ssize_t create_show(struct device *dev, if (rc) return rc; youngest = dax_region->youngest; - rc = sprintf(buf, "%s\n", youngest ? dev_name(youngest) : ""); + rc = sysfs_emit(buf, "%s\n", youngest ? dev_name(youngest) : ""); up_read(&dax_region_rwsem); return rc; @@ -763,7 +763,7 @@ static ssize_t start_show(struct device *dev, dax_range = get_dax_range(dev); if (!dax_range) return -ENXIO; - rc = sprintf(buf, "%#llx\n", dax_range->range.start); + rc = sysfs_emit(buf, "%#llx\n", dax_range->range.start); put_dax_range(); return rc; @@ -779,7 +779,7 @@ static ssize_t end_show(struct device *dev, dax_range = get_dax_range(dev); if (!dax_range) return -ENXIO; - rc = sprintf(buf, "%#llx\n", dax_range->range.end); + rc = sysfs_emit(buf, "%#llx\n", dax_range->range.end); put_dax_range(); return rc; @@ -795,7 +795,7 @@ static ssize_t pgoff_show(struct device *dev, dax_range = get_dax_range(dev); if (!dax_range) return -ENXIO; - rc = sprintf(buf, "%#lx\n", dax_range->pgoff); + rc = sysfs_emit(buf, "%#lx\n", dax_range->pgoff); put_dax_range(); return rc; @@ -969,7 +969,7 @@ static ssize_t size_show(struct device *dev, size = dev_dax_size(dev_dax); up_write(&dax_dev_rwsem); - return sprintf(buf, "%llu\n", size); + return sysfs_emit(buf, "%llu\n", size); } static bool alloc_is_aligned(struct dev_dax *dev_dax, resource_size_t size) @@ -1233,7 +1233,7 @@ static ssize_t align_show(struct device *dev, { struct dev_dax *dev_dax = to_dev_dax(dev); - return sprintf(buf, "%d\n", dev_dax->align); + return sysfs_emit(buf, "%d\n", dev_dax->align); } static ssize_t dev_dax_validate_align(struct dev_dax *dev_dax) @@ -1311,7 +1311,7 @@ static ssize_t target_node_show(struct device *dev, { struct dev_dax *dev_dax = to_dev_dax(dev); - return sprintf(buf, "%d\n", dev_dax_target_node(dev_dax)); + return sysfs_emit(buf, "%d\n", dev_dax_target_node(dev_dax)); } static DEVICE_ATTR_RO(target_node); @@ -1327,7 +1327,7 @@ static ssize_t resource_show(struct device *dev, else start = dev_dax->ranges[0].range.start; - return sprintf(buf, "%#llx\n", start); + return sysfs_emit(buf, "%#llx\n", start); } static DEVICE_ATTR(resource, 0400, resource_show, NULL); @@ -1338,14 +1338,14 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, * We only ever expect to handle device-dax instances, i.e. the * @type argument to MODULE_ALIAS_DAX_DEVICE() is always zero */ - return sprintf(buf, DAX_DEVICE_MODALIAS_FMT "\n", 0); + return sysfs_emit(buf, DAX_DEVICE_MODALIAS_FMT "\n", 0); } static DEVICE_ATTR_RO(modalias); static ssize_t numa_node_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", dev_to_node(dev)); + return sysfs_emit(buf, "%d\n", dev_to_node(dev)); } static DEVICE_ATTR_RO(numa_node); From 51e7849cd6e4050599ef4e255e85195788841bb4 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 24 Jan 2024 12:03:48 -0800 Subject: [PATCH 049/435] Documentatiion/ABI: add ABI documentation for sys-bus-dax Add the missing sysfs ABI documentation for the device DAX subsystem. Various ABI attributes under this have been present since v5.1, and more have been added over time. In preparation for adding a new attribute, add this file with the historical details. Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-3-20d16cb8d23d@intel.com Signed-off-by: Vishal Verma Cc: Dan Williams Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Huang Ying Cc: Jonathan Cameron Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-bus-dax | 136 ++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-bus-dax diff --git a/Documentation/ABI/testing/sysfs-bus-dax b/Documentation/ABI/testing/sysfs-bus-dax new file mode 100644 index 000000000000..6359f7bc9bf4 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-dax @@ -0,0 +1,136 @@ +What: /sys/bus/dax/devices/daxX.Y/align +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RW) Provides a way to specify an alignment for a dax device. + Values allowed are constrained by the physical address ranges + that back the dax device, and also by arch requirements. + +What: /sys/bus/dax/devices/daxX.Y/mapping +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (WO) Provides a way to allocate a mapping range under a dax + device. Specified in the format -. + +What: /sys/bus/dax/devices/daxX.Y/mapping[0..N]/start +What: /sys/bus/dax/devices/daxX.Y/mapping[0..N]/end +What: /sys/bus/dax/devices/daxX.Y/mapping[0..N]/page_offset +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RO) A dax device may have multiple constituent discontiguous + address ranges. These are represented by the different + 'mappingX' subdirectories. The 'start' attribute indicates the + start physical address for the given range. The 'end' attribute + indicates the end physical address for the given range. The + 'page_offset' attribute indicates the offset of the current + range in the dax device. + +What: /sys/bus/dax/devices/daxX.Y/resource +Date: June, 2019 +KernelVersion: v5.3 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The resource attribute indicates the starting physical + address of a dax device. In case of a device with multiple + constituent ranges, it indicates the starting address of the + first range. + +What: /sys/bus/dax/devices/daxX.Y/size +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RW) The size attribute indicates the total size of a dax + device. For creating subdivided dax devices, or for resizing + an existing device, the new size can be written to this as + part of the reconfiguration process. + +What: /sys/bus/dax/devices/daxX.Y/numa_node +Date: November, 2019 +KernelVersion: v5.5 +Contact: nvdimm@lists.linux.dev +Description: + (RO) If NUMA is enabled and the platform has affinitized the + backing device for this dax device, emit the CPU node + affinity for this device. + +What: /sys/bus/dax/devices/daxX.Y/target_node +Date: February, 2019 +KernelVersion: v5.1 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The target-node attribute is the Linux numa-node that a + device-dax instance may create when it is online. Prior to + being online the device's 'numa_node' property reflects the + closest online cpu node which is the typical expectation of a + device 'numa_node'. Once it is online it becomes its own + distinct numa node. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/available_size +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The available_size attribute tracks available dax region + capacity. This only applies to volatile hmem devices, not pmem + devices, since pmem devices are defined by nvdimm namespace + boundaries. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/size +Date: July, 2017 +KernelVersion: v5.1 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The size attribute indicates the size of a given dax region + in bytes. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/align +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The align attribute indicates alignment of the dax region. + Changes on align may not always be valid, when say certain + mappings were created with 2M and then we switch to 1G. This + validates all ranges against the new value being attempted, post + resizing. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/seed +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The seed device is a concept for dynamic dax regions to be + able to split the region amongst multiple sub-instances. The + seed device, similar to libnvdimm seed devices, is a device + that starts with zero capacity allocated and unbound to a + driver. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/create +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (RW) The create interface to the dax region provides a way to + create a new unconfigured dax device under the given region, which + can then be configured (with a size etc.) and then probed. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/delete +Date: October, 2020 +KernelVersion: v5.10 +Contact: nvdimm@lists.linux.dev +Description: + (WO) The delete interface for a dax region provides for deletion + of any 0-sized and idle dax devices. + +What: $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/id +Date: July, 2017 +KernelVersion: v5.1 +Contact: nvdimm@lists.linux.dev +Description: + (RO) The id attribute indicates the region id of a dax region. From 42d9358252e5d055223487d9f653c2a2ac859a2a Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 24 Jan 2024 12:03:49 -0800 Subject: [PATCH 050/435] mm/memory_hotplug: export mhp_supports_memmap_on_memory() In preparation for adding sysfs ABI to toggle memmap_on_memory semantics for drivers adding memory, export the mhp_supports_memmap_on_memory() helper. This allows drivers to check if memmap_on_memory support is available before trying to request it, and display an appropriate message if it isn't available. As part of this, remove the size argument to this - with recent updates to allow memmap_on_memory for larger ranges, and the internal splitting of altmaps into respective memory blocks, the size argument is meaningless. [akpm@linux-foundation.org: fix build] Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-4-20d16cb8d23d@intel.com Signed-off-by: Vishal Verma Acked-by: David Hildenbrand Suggested-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Jonathan Cameron Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Cc: Dan Williams Cc: Dave Jiang Cc: Dave Hansen Cc: Huang Ying Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 6 ++++++ mm/memory_hotplug.c | 17 ++++++----------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ee00015575aa..7a9ff464608d 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -137,6 +137,7 @@ struct mhp_params { bool mhp_range_allowed(u64 start, u64 size, bool need_mapping); struct range mhp_get_pluggable_range(bool need_mapping); +bool mhp_supports_memmap_on_memory(void); /* * Zone resizing functions @@ -278,6 +279,11 @@ static inline bool movable_node_is_enabled(void) return false; } +static inline bool mhp_supports_memmap_on_memory(void) +{ + return false; +} + static inline void pgdat_kswapd_lock(pg_data_t *pgdat) {} static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {} static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {} diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 707027f69150..a444e2d7dd2b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1337,7 +1337,7 @@ static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) } #endif -static bool mhp_supports_memmap_on_memory(unsigned long size) +bool mhp_supports_memmap_on_memory(void) { unsigned long vmemmap_size = memory_block_memmap_size(); unsigned long memmap_pages = memory_block_memmap_on_memory_pages(); @@ -1346,17 +1346,11 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) * Besides having arch support and the feature enabled at runtime, we * need a few more assumptions to hold true: * - * a) We span a single memory block: memory onlining/offlinin;g happens - * in memory block granularity. We don't want the vmemmap of online - * memory blocks to reside on offline memory blocks. In the future, - * we might want to support variable-sized memory blocks to make the - * feature more versatile. - * - * b) The vmemmap pages span complete PMDs: We don't want vmemmap code + * a) The vmemmap pages span complete PMDs: We don't want vmemmap code * to populate memory from the altmap for unrelated parts (i.e., * other memory blocks) * - * c) The vmemmap pages (and thereby the pages that will be exposed to + * b) The vmemmap pages (and thereby the pages that will be exposed to * the buddy) have to cover full pageblocks: memory onlining/offlining * code requires applicable ranges to be page-aligned, for example, to * set the migratetypes properly. @@ -1368,7 +1362,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) * altmap as an alternative source of memory, and we do not exactly * populate a single PMD. */ - if (!mhp_memmap_on_memory() || size != memory_block_size_bytes()) + if (!mhp_memmap_on_memory()) return false; /* @@ -1391,6 +1385,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) return arch_supports_memmap_on_memory(vmemmap_size); } +EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory); static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size) { @@ -1526,7 +1521,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) * Self hosted memmap array */ if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) && - mhp_supports_memmap_on_memory(memory_block_size_bytes())) { + mhp_supports_memmap_on_memory()) { ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags); if (ret) goto error; From 73954d379efd176e9b011142c55aa8da93ac740a Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 24 Jan 2024 12:03:50 -0800 Subject: [PATCH 051/435] dax: add a sysfs knob to control memmap_on_memory behavior Add a sysfs knob for dax devices to control the memmap_on_memory setting if the dax device were to be hotplugged as system memory. The default memmap_on_memory setting for dax devices originating via pmem or hmem is set to 'false' - i.e. no memmap_on_memory semantics, to preserve legacy behavior. For dax devices via CXL, the default is on. The sysfs control allows the administrator to override the above defaults if needed. Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-5-20d16cb8d23d@intel.com Signed-off-by: Vishal Verma Tested-by: Li Zhijian Reviewed-by: Jonathan Cameron Reviewed-by: David Hildenbrand Reviewed-by: Huang, Ying Reviewed-by: Alison Schofield Cc: Dan Williams Cc: Dave Jiang Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-bus-dax | 17 ++++++++++ drivers/dax/bus.c | 43 +++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-dax b/Documentation/ABI/testing/sysfs-bus-dax index 6359f7bc9bf4..b34266bfae49 100644 --- a/Documentation/ABI/testing/sysfs-bus-dax +++ b/Documentation/ABI/testing/sysfs-bus-dax @@ -134,3 +134,20 @@ KernelVersion: v5.1 Contact: nvdimm@lists.linux.dev Description: (RO) The id attribute indicates the region id of a dax region. + +What: /sys/bus/dax/devices/daxX.Y/memmap_on_memory +Date: January, 2024 +KernelVersion: v6.8 +Contact: nvdimm@lists.linux.dev +Description: + (RW) Control the memmap_on_memory setting if the dax device + were to be hotplugged as system memory. This determines whether + the 'altmap' for the hotplugged memory will be placed on the + device being hotplugged (memmap_on_memory=1) or if it will be + placed on regular memory (memmap_on_memory=0). This attribute + must be set before the device is handed over to the 'kmem' + driver (i.e. hotplugged into system-ram). Additionally, this + depends on CONFIG_MHP_MEMMAP_ON_MEMORY, and a globally enabled + memmap_on_memory parameter for memory_hotplug. This is + typically set on the kernel command line - + memory_hotplug.memmap_on_memory set to 'true' or 'force'." diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 0fd948a4443e..27c86d0ca711 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -1349,6 +1349,48 @@ static ssize_t numa_node_show(struct device *dev, } static DEVICE_ATTR_RO(numa_node); +static ssize_t memmap_on_memory_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + + return sysfs_emit(buf, "%d\n", dev_dax->memmap_on_memory); +} + +static ssize_t memmap_on_memory_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + bool val; + int rc; + + rc = kstrtobool(buf, &val); + if (rc) + return rc; + + if (val == true && !mhp_supports_memmap_on_memory()) { + dev_dbg(dev, "memmap_on_memory is not available\n"); + return -EOPNOTSUPP; + } + + rc = down_write_killable(&dax_dev_rwsem); + if (rc) + return rc; + + if (dev_dax->memmap_on_memory != val && dev->driver && + to_dax_drv(dev->driver)->type == DAXDRV_KMEM_TYPE) { + up_write(&dax_dev_rwsem); + return -EBUSY; + } + + dev_dax->memmap_on_memory = val; + up_write(&dax_dev_rwsem); + + return len; +} +static DEVICE_ATTR_RW(memmap_on_memory); + static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, struct device, kobj); @@ -1375,6 +1417,7 @@ static struct attribute *dev_dax_attributes[] = { &dev_attr_align.attr, &dev_attr_resource.attr, &dev_attr_numa_node.attr, + &dev_attr_memmap_on_memory.attr, NULL, }; From 9af47276ed83cc346263e56243756543a2a33c9d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 18:12:15 +0000 Subject: [PATCH 052/435] highmem: add kernel-doc for memcpy_*_folio() This was inadvertently skipped when adding the new functions. Link: https://lkml.kernel.org/r/20240124181217.1761674-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/highmem.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 451c1dff0e87..00341b56d291 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -439,6 +439,13 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len) kunmap_local(addr); } +/** + * memcpy_from_folio - Copy a range of bytes from a folio. + * @to: The memory to copy to. + * @folio: The folio to read from. + * @offset: The first byte in the folio to read. + * @len: The number of bytes to copy. + */ static inline void memcpy_from_folio(char *to, struct folio *folio, size_t offset, size_t len) { @@ -460,6 +467,13 @@ static inline void memcpy_from_folio(char *to, struct folio *folio, } while (len > 0); } +/** + * memcpy_to_folio - Copy a range of bytes to a folio. + * @folio: The folio to write to. + * @offset: The first byte in the folio to store to. + * @from: The memory to copy from. + * @len: The number of bytes to copy. + */ static inline void memcpy_to_folio(struct folio *folio, size_t offset, const char *from, size_t len) { From d749cc7547bb46be73da848576ea155daf8186f3 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 24 Jan 2024 18:31:34 +0100 Subject: [PATCH 053/435] mm: kmsan: remove runtime checks from kmsan_unpoison_memory() Similarly to what's been done in commit 85716a80c16d ("kmsan: allow using __msan_instrument_asm_store() inside runtime"), it should be safe to call kmsan_unpoison_memory() from within the runtime, as it does not allocate memory or take locks. Remove the redundant runtime checks. This should fix false positives seen with CONFIG_DEBUG_LIST=y when the non-instrumented lib/stackdepot.c failed to unpoison the memory chunks later checked by the instrumented lib/list_debug.c Also replace the implementation of kmsan_unpoison_entry_regs() with a call to kmsan_unpoison_memory(). Link: https://lkml.kernel.org/r/20240124173134.1165747-1-glider@google.com Fixes: f80be4571b19 ("kmsan: add KMSAN runtime core") Signed-off-by: Alexander Potapenko Tested-by: Marco Elver Cc: Dmitry Vyukov Cc: Ilya Leoshkevich Cc: Nicholas Miehlbradt Signed-off-by: Andrew Morton --- mm/kmsan/hooks.c | 50 +++++++++++++++++++----------------------------- 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 5d6e2dee5692..0b09daa188ef 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -359,6 +359,12 @@ void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, } /* Functions from kmsan-checks.h follow. */ + +/* + * To create an origin, kmsan_poison_memory() unwinds the stacks and stores it + * into the stack depot. This may cause deadlocks if done from within KMSAN + * runtime, therefore we bail out if kmsan_in_runtime(). + */ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) { if (!kmsan_enabled || kmsan_in_runtime()) @@ -371,37 +377,11 @@ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) } EXPORT_SYMBOL(kmsan_poison_memory); -void kmsan_unpoison_memory(const void *address, size_t size) -{ - unsigned long ua_flags; - - if (!kmsan_enabled || kmsan_in_runtime()) - return; - - ua_flags = user_access_save(); - kmsan_enter_runtime(); - /* The users may want to poison/unpoison random memory. */ - kmsan_internal_unpoison_memory((void *)address, size, - KMSAN_POISON_NOCHECK); - kmsan_leave_runtime(); - user_access_restore(ua_flags); -} -EXPORT_SYMBOL(kmsan_unpoison_memory); - /* - * Version of kmsan_unpoison_memory() that can be called from within the KMSAN - * runtime. - * - * Non-instrumented IRQ entry functions receive struct pt_regs from assembly - * code. Those regs need to be unpoisoned, otherwise using them will result in - * false positives. - * Using kmsan_unpoison_memory() is not an option in entry code, because the - * return value of in_task() is inconsistent - as a result, certain calls to - * kmsan_unpoison_memory() are ignored. kmsan_unpoison_entry_regs() ensures that - * the registers are unpoisoned even if kmsan_in_runtime() is true in the early - * entry code. + * Unlike kmsan_poison_memory(), this function can be used from within KMSAN + * runtime, because it does not trigger allocations or call instrumented code. */ -void kmsan_unpoison_entry_regs(const struct pt_regs *regs) +void kmsan_unpoison_memory(const void *address, size_t size) { unsigned long ua_flags; @@ -409,10 +389,20 @@ void kmsan_unpoison_entry_regs(const struct pt_regs *regs) return; ua_flags = user_access_save(); - kmsan_internal_unpoison_memory((void *)regs, sizeof(*regs), + /* The users may want to poison/unpoison random memory. */ + kmsan_internal_unpoison_memory((void *)address, size, KMSAN_POISON_NOCHECK); user_access_restore(ua_flags); } +EXPORT_SYMBOL(kmsan_unpoison_memory); + +/* + * Version of kmsan_unpoison_memory() called from IRQ entry functions. + */ +void kmsan_unpoison_entry_regs(const struct pt_regs *regs) +{ + kmsan_unpoison_memory((void *)regs, sizeof(*regs)); +} void kmsan_check_memory(const void *addr, size_t size) { From 8c407e05a9b3bd851c5731387033edfb362b8b98 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 5 Jan 2024 07:54:19 -0800 Subject: [PATCH 054/435] selftests/mm: new test that steals pages This test stresses the race between of madvise(DONTNEED), a page fault and a parallel huge page mmap, which should fail due to lack of available page available for mapping. This test case must run on a system with one and only one huge page available. # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages During setup, the test allocates the only available page, and starts three threads: - thread 1: * madvise(MADV_DONTNEED) on the allocated huge page - thread 2: * Write to the allocated huge page - thread 3: * Tries to allocated (steal) an extra huge page (which is not available) thread 3 should never succeed in the allocation, since the only huge page was never unmapped, and should be reserved. Touching the old page after thread3 allocation will raise a SIGBUS. Link: https://lkml.kernel.org/r/20240105155419.1939484-2-leitao@debian.org Signed-off-by: Breno Leitao Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Rik van Riel Cc: Shuah Khan Cc: Vegard Nossum Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 + tools/testing/selftests/mm/Makefile | 1 + .../selftests/mm/hugetlb_madv_vs_map.c | 124 ++++++++++++++++++ 3 files changed, 126 insertions(+) create mode 100644 tools/testing/selftests/mm/hugetlb_madv_vs_map.c diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index 4ff10ea61461..d26e962f2ac4 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -46,3 +46,4 @@ gup_longterm mkdirty va_high_addr_switch hugetlb_fault_after_madv +hugetlb_madv_vs_map diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 2453add65d12..990e9bb112c5 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -70,6 +70,7 @@ TEST_GEN_FILES += ksm_tests TEST_GEN_FILES += ksm_functional_tests TEST_GEN_FILES += mdwe_test TEST_GEN_FILES += hugetlb_fault_after_madv +TEST_GEN_FILES += hugetlb_madv_vs_map ifneq ($(ARCH),arm64) TEST_GEN_FILES += soft-dirty diff --git a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c new file mode 100644 index 000000000000..d01e8d4901d0 --- /dev/null +++ b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test case that must run on a system with one and only one huge page available. + * # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + * + * During setup, the test allocates the only available page, and starts three threads: + * - thread1: + * * madvise(MADV_DONTNEED) on the allocated huge page + * - thread 2: + * * Write to the allocated huge page + * - thread 3: + * * Try to allocated an extra huge page (which must not available) + * + * The test fails if thread3 is able to allocate a page. + * + * Touching the first page after thread3's allocation will raise a SIGBUS + * + * Author: Breno Leitao + */ +#include +#include +#include +#include +#include +#include + +#include "vm_util.h" +#include "../kselftest.h" + +#define MMAP_SIZE (1 << 21) +#define INLOOP_ITER 100 + +char *huge_ptr; + +/* Touch the memory while it is being madvised() */ +void *touch(void *unused) +{ + for (int i = 0; i < INLOOP_ITER; i++) + huge_ptr[0] = '.'; + + return NULL; +} + +void *madv(void *unused) +{ + for (int i = 0; i < INLOOP_ITER; i++) + madvise(huge_ptr, MMAP_SIZE, MADV_DONTNEED); + + return NULL; +} + +/* + * We got here, and there must be no huge page available for mapping + * The other hugepage should be flipping from used <-> reserved, because + * of madvise(DONTNEED). + */ +void *map_extra(void *unused) +{ + void *ptr; + + for (int i = 0; i < INLOOP_ITER; i++) { + ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + + if ((long)ptr != -1) { + /* Touching the other page now will cause a SIGBUG + * huge_ptr[0] = '1'; + */ + return ptr; + } + } + + return NULL; +} + +int main(void) +{ + pthread_t thread1, thread2, thread3; + unsigned long free_hugepages; + void *ret; + + /* + * On kernel 6.7, we are able to reproduce the problem with ~10 + * interactions + */ + int max = 10; + + free_hugepages = get_free_hugepages(); + + if (free_hugepages != 1) { + ksft_exit_skip("This test needs one and only one page to execute. Got %lu\n", + free_hugepages); + } + + while (max--) { + huge_ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + + if ((unsigned long)huge_ptr == -1) { + ksft_exit_skip("Failed to allocated huge page\n"); + return KSFT_SKIP; + } + + pthread_create(&thread1, NULL, madv, NULL); + pthread_create(&thread2, NULL, touch, NULL); + pthread_create(&thread3, NULL, map_extra, NULL); + + pthread_join(thread1, NULL); + pthread_join(thread2, NULL); + pthread_join(thread3, &ret); + + if (ret) { + ksft_test_result_fail("Unexpected huge page allocation\n"); + return KSFT_FAIL; + } + + /* Unmap and restart */ + munmap(huge_ptr, MMAP_SIZE); + } + + return KSFT_PASS; +} From 13ef7424577ff9b663aeaa4fda67e186d9856df3 Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Fri, 26 Jan 2024 21:19:25 +0000 Subject: [PATCH 055/435] mm: memcg: don't periodically flush stats when memcg is disabled The root memcg is onlined even when memcg is disabled. When it's onlined a 2 second periodic stat flush is started, but no stat flushing is required when memcg is disabled because there can be no child memcgs. Most calls to flush memcg stats are avoided when memcg is disabled as a result of the mem_cgroup_disabled check added in 7d7ef0a4686a ("mm: memcg: restore subtree stats flushing"), but the periodic flushing started in mem_cgroup_css_online is not. Skip it. Link: https://lkml.kernel.org/r/20240126211927.1171338-1-tjmercier@google.com Fixes: aa48e47e3906 ("memcg: infrastructure to flush memcg stats") Signed-off-by: T.J. Mercier Acked-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Chris Li Reported-by: Minchan Kim Reviewed-by: Yosry Ahmed Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Michal Koutn Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index df11d6d19ee3..484a9d2862d4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5621,7 +5621,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (alloc_shrinker_info(memcg)) goto offline_kmem; - if (unlikely(mem_cgroup_is_root(memcg))) + if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled()) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); lru_gen_online_memcg(memcg); From 96200c915040c2ec6f6c173a29caf9f7c9c15a7e Mon Sep 17 00:00:00 2001 From: Levi Yun Date: Fri, 26 Jan 2024 15:25:54 +0000 Subject: [PATCH 056/435] kswapd: replace try_to_freeze() with kthread_freezable_should_stop() Instead of using try_to_freeze, use kthread_freezable_should_stop in kswapd. By this, we can avoid unnecessary freezing when kswapd should stop. Link: https://lkml.kernel.org/r/20240126152556.58791-1-ppbuk5246@gmail.com Signed-off-by: Levi Yun Signed-off-by: Andrew Morton --- mm/vmscan.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4f9c854ce6cc..1f139830b26f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6796,6 +6796,7 @@ restart: bool raise_priority = true; bool balanced; bool ret; + bool was_frozen; sc.reclaim_idx = highest_zoneidx; @@ -6894,9 +6895,9 @@ restart: /* Check if kswapd should be suspending */ __fs_reclaim_release(_THIS_IP_); - ret = try_to_freeze(); + ret = kthread_freezable_should_stop(&was_frozen); __fs_reclaim_acquire(_THIS_IP_); - if (ret || kthread_should_stop()) + if (was_frozen || ret) break; /* @@ -7102,7 +7103,7 @@ static int kswapd(void *p) WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); atomic_set(&pgdat->nr_writeback_throttled, 0); for ( ; ; ) { - bool ret; + bool was_frozen; alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); highest_zoneidx = kswapd_highest_zoneidx(pgdat, @@ -7119,15 +7120,14 @@ kswapd_try_sleep: WRITE_ONCE(pgdat->kswapd_order, 0); WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); - ret = try_to_freeze(); - if (kthread_should_stop()) + if (kthread_freezable_should_stop(&was_frozen)) break; /* * We can speed up thawing tasks if we don't call balance_pgdat * after returning from the refrigerator */ - if (ret) + if (was_frozen) continue; /* From 5cec4eb7fad6fb1e9a3dd8403b558d1eff7490ff Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 26 Jan 2024 16:19:44 +0800 Subject: [PATCH 057/435] mm and cache_info: remove unnecessary CPU cache info update For each CPU hotplug event, we will update per-CPU data slice size and corresponding PCP configuration for every online CPU to make the implementation simple. But, Kyle reported that this takes tens seconds during boot on a machine with 34 zones and 3840 CPUs. So, in this patch, for each CPU hotplug event, we only update per-CPU data slice size and corresponding PCP configuration for the CPUs that share caches with the hotplugged CPU. With the patch, the system boot time reduces 67 seconds on the machine. Link: https://lkml.kernel.org/r/20240126081944.414520-1-ying.huang@intel.com Fixes: 362d37a106dd ("mm, pcp: reduce lock contention for draining high-order pages") Signed-off-by: "Huang, Ying" Originally-by: Kyle Meyer Reported-and-tested-by: Kyle Meyer Cc: Sudeep Holla Cc: Mel Gorman Signed-off-by: Andrew Morton --- drivers/base/cacheinfo.c | 50 +++++++++++++++++++++++++++++++++++----- include/linux/gfp.h | 2 +- mm/page_alloc.c | 39 +++++++++++++++---------------- 3 files changed, 63 insertions(+), 28 deletions(-) diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index f1e79263fe61..23b8cba4a2a3 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -898,6 +898,37 @@ err: return rc; } +static unsigned int cpu_map_shared_cache(bool online, unsigned int cpu, + cpumask_t **map) +{ + struct cacheinfo *llc, *sib_llc; + unsigned int sibling; + + if (!last_level_cache_is_valid(cpu)) + return 0; + + llc = per_cpu_cacheinfo_idx(cpu, cache_leaves(cpu) - 1); + + if (llc->type != CACHE_TYPE_DATA && llc->type != CACHE_TYPE_UNIFIED) + return 0; + + if (online) { + *map = &llc->shared_cpu_map; + return cpumask_weight(*map); + } + + /* shared_cpu_map of offlined CPU will be cleared, so use sibling map */ + for_each_cpu(sibling, &llc->shared_cpu_map) { + if (sibling == cpu || !last_level_cache_is_valid(sibling)) + continue; + sib_llc = per_cpu_cacheinfo_idx(sibling, cache_leaves(sibling) - 1); + *map = &sib_llc->shared_cpu_map; + return cpumask_weight(*map); + } + + return 0; +} + /* * Calculate the size of the per-CPU data cache slice. This can be * used to estimate the size of the data cache slice that can be used @@ -929,28 +960,31 @@ static void update_per_cpu_data_slice_size_cpu(unsigned int cpu) ci->per_cpu_data_slice_size = llc->size / nr_shared; } -static void update_per_cpu_data_slice_size(bool cpu_online, unsigned int cpu) +static void update_per_cpu_data_slice_size(bool cpu_online, unsigned int cpu, + cpumask_t *cpu_map) { unsigned int icpu; - for_each_online_cpu(icpu) { + for_each_cpu(icpu, cpu_map) { if (!cpu_online && icpu == cpu) continue; update_per_cpu_data_slice_size_cpu(icpu); + setup_pcp_cacheinfo(icpu); } } static int cacheinfo_cpu_online(unsigned int cpu) { int rc = detect_cache_attributes(cpu); + cpumask_t *cpu_map; if (rc) return rc; rc = cache_add_dev(cpu); if (rc) goto err; - update_per_cpu_data_slice_size(true, cpu); - setup_pcp_cacheinfo(); + if (cpu_map_shared_cache(true, cpu, &cpu_map)) + update_per_cpu_data_slice_size(true, cpu, cpu_map); return 0; err: free_cache_attributes(cpu); @@ -959,12 +993,16 @@ err: static int cacheinfo_cpu_pre_down(unsigned int cpu) { + cpumask_t *cpu_map; + unsigned int nr_shared; + + nr_shared = cpu_map_shared_cache(false, cpu, &cpu_map); if (cpumask_test_and_clear_cpu(cpu, &cache_dev_map)) cpu_cache_sysfs_exit(cpu); free_cache_attributes(cpu); - update_per_cpu_data_slice_size(false, cpu); - setup_pcp_cacheinfo(); + if (nr_shared > 1) + update_per_cpu_data_slice_size(false, cpu, cpu_map); return 0; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index de292a007138..09e22091f1b0 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -334,7 +334,7 @@ void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); void page_alloc_init_late(void); -void setup_pcp_cacheinfo(void); +void setup_pcp_cacheinfo(unsigned int cpu); /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 150d4f23b010..9faca05d124e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5572,37 +5572,34 @@ static void zone_pcp_update(struct zone *zone, int cpu_online) mutex_unlock(&pcp_batch_high_lock); } -static void zone_pcp_update_cacheinfo(struct zone *zone) +static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu) { - int cpu; struct per_cpu_pages *pcp; struct cpu_cacheinfo *cci; - for_each_online_cpu(cpu) { - pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - cci = get_cpu_cacheinfo(cpu); - /* - * If data cache slice of CPU is large enough, "pcp->batch" - * pages can be preserved in PCP before draining PCP for - * consecutive high-order pages freeing without allocation. - * This can reduce zone lock contention without hurting - * cache-hot pages sharing. - */ - spin_lock(&pcp->lock); - if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) - pcp->flags |= PCPF_FREE_HIGH_BATCH; - else - pcp->flags &= ~PCPF_FREE_HIGH_BATCH; - spin_unlock(&pcp->lock); - } + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + cci = get_cpu_cacheinfo(cpu); + /* + * If data cache slice of CPU is large enough, "pcp->batch" + * pages can be preserved in PCP before draining PCP for + * consecutive high-order pages freeing without allocation. + * This can reduce zone lock contention without hurting + * cache-hot pages sharing. + */ + spin_lock(&pcp->lock); + if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) + pcp->flags |= PCPF_FREE_HIGH_BATCH; + else + pcp->flags &= ~PCPF_FREE_HIGH_BATCH; + spin_unlock(&pcp->lock); } -void setup_pcp_cacheinfo(void) +void setup_pcp_cacheinfo(unsigned int cpu) { struct zone *zone; for_each_populated_zone(zone) - zone_pcp_update_cacheinfo(zone); + zone_pcp_update_cacheinfo(zone, cpu); } /* From 7dbbc8f57d4ba3c369b692db9fd2c9653abf0bb5 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 26 Jan 2024 08:06:43 +0000 Subject: [PATCH 058/435] x86/mm: delete unused cpu argument to leave_mm() The argument is unused since commit 3d28ebceaffa ("x86/mm: Rework lazy TLB to track the actual loaded mm"), delete it. Link: https://lkml.kernel.org/r/20240126080644.1714297-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Cc: Andy Lutomirski Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/include/asm/mmu.h | 2 +- arch/x86/kernel/alternative.c | 2 +- arch/x86/mm/tlb.c | 2 +- arch/x86/xen/mmu_pv.c | 2 +- drivers/cpuidle/cpuidle.c | 2 +- include/linux/mmu_context.h | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 0da5c227f490..ce4677b8b735 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -75,7 +75,7 @@ typedef struct { .lock = __MUTEX_INITIALIZER(mm.context.lock), \ } -void leave_mm(int cpu); +void leave_mm(void); #define leave_mm leave_mm #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1d85cb7071cb..21108d8e6f6b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1805,7 +1805,7 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) * restoring the previous mm. */ if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) - leave_mm(smp_processor_id()); + leave_mm(); temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); switch_mm_irqs_off(NULL, mm, current); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5768d386efab..80b0caa82a91 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -299,7 +299,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam, write_cr3(new_mm_cr3); } -void leave_mm(int cpu) +void leave_mm(void) { struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 72af496a160c..218773cfb009 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -913,7 +913,7 @@ static void drop_mm_ref_this_cpu(void *info) struct mm_struct *mm = info; if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) - leave_mm(smp_processor_id()); + leave_mm(); /* * If this cpu still has a stale cr3 reference, then make sure diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 737a026ef58a..02e40fd7d948 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -237,7 +237,7 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev, } if (target_state->flags & CPUIDLE_FLAG_TLB_FLUSHED) - leave_mm(dev->cpu); + leave_mm(); /* Take note of the planned idle state. */ sched_idle_set_state(target_state); diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index f2b7a3f04099..bbaec80c78c5 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -11,7 +11,7 @@ #endif #ifndef leave_mm -static inline void leave_mm(int cpu) { } +static inline void leave_mm(void) { } #endif /* From 3cfd6625a6cf838137f3bf7a1635332cffe7ec63 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 26 Jan 2024 08:06:44 +0000 Subject: [PATCH 059/435] x86/mm: clarify "prev" usage in switch_mm_irqs_off() In the x86 implementation of switch_mm_irqs_off(), we do not use the "prev" argument passed in by the caller, we use exclusively use "real_prev", which is cpu_tlbstate.loaded_mm. This is not obvious at the first sight. Furthermore, a comment describes a condition that happens when called with prev == next, but this should not affect the function in any way since prev is unused. Apparently, the comment is intended to clarify why we don't rely on prev == next to decide whether we need to update CR3, but again, it is not obvious. The comment also references the fact that leave_mm() calls with prev == NULL and tsk == NULL, but this also shouldn't matter because prev is unused and tsk is only used in one function which has a NULL check. Clarify things by renaming (prev -> unused) and (real_prev -> prev), also move and rewrite the comment as an explanation for why we don't rely on "prev" supplied by the caller in x86 code and use our own. Hopefully this makes reading the code easier. Link: https://lkml.kernel.org/r/20240126080644.1714297-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Cc: Andy Lutomirski Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/mm/tlb.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 80b0caa82a91..bf9605caf24f 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -492,10 +492,16 @@ void cr4_update_pce(void *ignored) static inline void cr4_update_pce_mm(struct mm_struct *mm) { } #endif -void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, +/* + * The "prev" argument passed by the caller does not always match CR3. For + * example, the scheduler passes in active_mm when switching from lazy TLB mode + * to normal mode, but switch_mm_irqs_off() can be called from x86 code without + * updating active_mm. Use cpu_tlbstate.loaded_mm instead. + */ +void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, struct task_struct *tsk) { - struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); + struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); unsigned long new_lam = mm_lam_cr3_mask(next); bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy); @@ -504,15 +510,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, bool need_flush; u16 new_asid; - /* - * NB: The scheduler will call us with prev == next when switching - * from lazy TLB mode to normal mode if active_mm isn't changing. - * When this happens, we don't assume that CR3 (and hence - * cpu_tlbstate.loaded_mm) matches next. - * - * NB: leave_mm() calls us with prev == NULL and tsk == NULL. - */ - /* We don't want flush_tlb_func() to run concurrently with us. */ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(!irqs_disabled()); @@ -527,7 +524,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * isn't free. */ #ifdef CONFIG_DEBUG_VM - if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid, + if (WARN_ON_ONCE(__read_cr3() != build_cr3(prev->pgd, prev_asid, tlbstate_lam_cr3_mask()))) { /* * If we were to BUG here, we'd be very likely to kill @@ -559,7 +556,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * provides that full memory barrier and core serializing * instruction. */ - if (real_prev == next) { + if (prev == next) { /* Not actually switching mm's */ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != next->context.ctx_id); @@ -574,7 +571,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * mm_cpumask. The TLB shootdown code can figure out from * cpu_tlbstate_shared.is_lazy whether or not to send an IPI. */ - if (WARN_ON_ONCE(real_prev != &init_mm && + if (WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); @@ -616,10 +613,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * Skip kernel threads; we never send init_mm TLB flushing IPIs, * but the bitmap manipulation can cause cache line contention. */ - if (real_prev != &init_mm) { + if (prev != &init_mm) { VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, - mm_cpumask(real_prev))); - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + mm_cpumask(prev))); + cpumask_clear_cpu(cpu, mm_cpumask(prev)); } /* @@ -656,9 +653,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm, next); this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); - if (next != real_prev) { + if (next != prev) { cr4_update_pce_mm(next); - switch_ldt(real_prev, next); + switch_ldt(prev, next); } } From 5878303c5353bdd9b7949db9edbaded53c150173 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 28 Jan 2024 13:28:50 +0000 Subject: [PATCH 060/435] mm/zswap: fix race between lru writeback and swapoff LRU writeback has race problem with swapoff, as spotted by Yosry [1]: CPU1 CPU2 shrink_memcg_cb swap_off list_lru_isolate zswap_invalidate zswap_swapoff kfree(tree) // UAF spin_lock(&tree->lock) The problem is that the entry in lru list can't protect the tree from being swapoff and freed, and the entry also can be invalidated and freed concurrently after we unlock the lru lock. We can fix it by moving the swap cache allocation ahead before referencing the tree, then check invalidate race with tree lock, only after that we can safely deref the entry. Note we couldn't deref entry or tree anymore after we unlock the folio, since we depend on this to hold on swapoff. So this patch moves all tree and entry usage to zswap_writeback_entry(), we only use the copied swpentry on the stack to allocate swap cache and if returned with folio locked we can reference the tree safely. Then we can check invalidate race with tree lock, the following things is much the same like zswap_load(). Since we can't deref the entry after zswap_writeback_entry(), we can't use zswap_lru_putback() anymore, instead we rotate the entry in the beginning. And it will be unlinked and freed when invalidated if writeback success. Another change is we don't update the memcg nr_zswap_protected in the -ENOMEM and -EEXIST cases anymore. -EEXIST case means we raced with swapin or concurrent shrinker action, since swapin already have memcg nr_zswap_protected updated, don't need double counts here. For concurrent shrinker, the folio will be writeback and freed anyway. -ENOMEM case is extremely rare and doesn't happen spuriously either, so don't bother distinguishing this case. [1] https://lore.kernel.org/all/CAJD7tkasHsRnT_75-TXsEe58V9_OW6m3g6CF7Kmsvz8CKRG_EA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20240126-zswap-writeback-race-v2-2-b10479847099@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Chris Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 114 +++++++++++++++++++++++------------------------------ 1 file changed, 49 insertions(+), 65 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index bccef2af43cc..ddc8f930d343 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -277,7 +277,7 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) zpool_get_type((p)->zpools[0])) static int zswap_writeback_entry(struct zswap_entry *entry, - struct zswap_tree *tree); + swp_entry_t swpentry); static int zswap_pool_get(struct zswap_pool *pool); static void zswap_pool_put(struct zswap_pool *pool); @@ -444,27 +444,6 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) rcu_read_unlock(); } -static void zswap_lru_putback(struct list_lru *list_lru, - struct zswap_entry *entry) -{ - int nid = entry_to_nid(entry); - spinlock_t *lock = &list_lru->node[nid].lock; - struct mem_cgroup *memcg; - struct lruvec *lruvec; - - rcu_read_lock(); - memcg = mem_cgroup_from_entry(entry); - spin_lock(lock); - /* we cannot use list_lru_add here, because it increments node's lru count */ - list_lru_putback(list_lru, &entry->lru, nid, memcg); - spin_unlock(lock); - - lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry))); - /* increment the protection area to account for the LRU rotation. */ - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); - rcu_read_unlock(); -} - /********************************* * rbtree functions **********************************/ @@ -859,40 +838,47 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o { struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); bool *encountered_page_in_swapcache = (bool *)arg; - struct zswap_tree *tree; - pgoff_t swpoffset; + swp_entry_t swpentry; enum lru_status ret = LRU_REMOVED_RETRY; int writeback_result; + /* + * Rotate the entry to the tail before unlocking the LRU, + * so that in case of an invalidation race concurrent + * reclaimers don't waste their time on it. + * + * If writeback succeeds, or failure is due to the entry + * being invalidated by the swap subsystem, the invalidation + * will unlink and free it. + * + * Temporary failures, where the same entry should be tried + * again immediately, almost never happen for this shrinker. + * We don't do any trylocking; -ENOMEM comes closest, + * but that's extremely rare and doesn't happen spuriously + * either. Don't bother distinguishing this case. + * + * But since they do exist in theory, the entry cannot just + * be unlinked, or we could leak it. Hence, rotate. + */ + list_move_tail(item, &l->list); + /* * Once the lru lock is dropped, the entry might get freed. The - * swpoffset is copied to the stack, and entry isn't deref'd again + * swpentry is copied to the stack, and entry isn't deref'd again * until the entry is verified to still be alive in the tree. */ - swpoffset = swp_offset(entry->swpentry); - tree = swap_zswap_tree(entry->swpentry); - list_lru_isolate(l, item); + swpentry = entry->swpentry; + /* * It's safe to drop the lock here because we return either * LRU_REMOVED_RETRY or LRU_RETRY. */ spin_unlock(lock); - /* Check for invalidate() race */ - spin_lock(&tree->lock); - if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) - goto unlock; + writeback_result = zswap_writeback_entry(entry, swpentry); - /* Hold a reference to prevent a free during writeback */ - zswap_entry_get(entry); - spin_unlock(&tree->lock); - - writeback_result = zswap_writeback_entry(entry, tree); - - spin_lock(&tree->lock); if (writeback_result) { zswap_reject_reclaim_fail++; - zswap_lru_putback(&entry->pool->list_lru, entry); ret = LRU_RETRY; /* @@ -902,27 +888,10 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o */ if (writeback_result == -EEXIST && encountered_page_in_swapcache) *encountered_page_in_swapcache = true; - - goto put_unlock; + } else { + zswap_written_back_pages++; } - zswap_written_back_pages++; - if (entry->objcg) - count_objcg_event(entry->objcg, ZSWPWB); - - count_vm_event(ZSWPWB); - /* - * Writeback started successfully, the page now belongs to the - * swapcache. Drop the entry from zswap - unless invalidate already - * took it out while we had the tree->lock released for IO. - */ - zswap_invalidate_entry(tree, entry); - -put_unlock: - /* Drop local reference */ - zswap_entry_put(entry); -unlock: - spin_unlock(&tree->lock); spin_lock(lock); return ret; } @@ -1407,9 +1376,9 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page) * freed. */ static int zswap_writeback_entry(struct zswap_entry *entry, - struct zswap_tree *tree) + swp_entry_t swpentry) { - swp_entry_t swpentry = entry->swpentry; + struct zswap_tree *tree; struct folio *folio; struct mempolicy *mpol; bool folio_was_allocated; @@ -1425,9 +1394,11 @@ static int zswap_writeback_entry(struct zswap_entry *entry, return -ENOMEM; /* - * Found an existing folio, we raced with load/swapin. We generally - * writeback cold folios from zswap, and swapin means the folio just - * became hot. Skip this folio and let the caller find another one. + * Found an existing folio, we raced with swapin or concurrent + * shrinker. We generally writeback cold folios from zswap, and + * swapin means the folio just became hot, so skip this folio. + * For unlikely concurrent shrinker case, it will be unlinked + * and freed when invalidated by the concurrent shrinker anyway. */ if (!folio_was_allocated) { folio_put(folio); @@ -1441,18 +1412,31 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * backs (our zswap_entry reference doesn't prevent that), to * avoid overwriting a new swap folio with old compressed data. */ + tree = swap_zswap_tree(swpentry); spin_lock(&tree->lock); - if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { + if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { spin_unlock(&tree->lock); delete_from_swap_cache(folio); folio_unlock(folio); folio_put(folio); return -ENOMEM; } + + /* Safe to deref entry after the entry is verified above. */ + zswap_entry_get(entry); spin_unlock(&tree->lock); __zswap_load(entry, &folio->page); + count_vm_event(ZSWPWB); + if (entry->objcg) + count_objcg_event(entry->objcg, ZSWPWB); + + spin_lock(&tree->lock); + zswap_invalidate_entry(tree, entry); + zswap_entry_put(entry); + spin_unlock(&tree->lock); + /* folio is up to date */ folio_mark_uptodate(folio); From 3f798aa6121ab3eb572f96ab2d8558894d979a4c Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 28 Jan 2024 13:28:51 +0000 Subject: [PATCH 061/435] mm/list_lru: remove list_lru_putback() Since the only user zswap_lru_putback() has gone, remove list_lru_putback() too. Link: https://lkml.kernel.org/r/20240126-zswap-writeback-race-v2-3-b10479847099@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Yosry Ahmed Cc: Chris Li Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 16 ---------------- mm/list_lru.c | 14 -------------- mm/zswap.c | 2 +- 3 files changed, 1 insertion(+), 31 deletions(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index c679e6b293c4..f2882a820690 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -168,22 +168,6 @@ static inline unsigned long list_lru_count(struct list_lru *lru) void list_lru_isolate(struct list_lru_one *list, struct list_head *item); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head); -/** - * list_lru_putback: undo list_lru_isolate - * @lru: the lru pointer. - * @item: the item to put back. - * @nid: the node id of the sublist to put the item back to. - * @memcg: the cgroup of the sublist to put the item back to. - * - * Put back an isolated item into its original LRU. Note that unlike - * list_lru_add, this does not increment the node LRU count (as - * list_lru_isolate does not originally decrement this count). - * - * Since we might have dropped the LRU lock in between, recompute list_lru_one - * from the node's id and memcg. - */ -void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg); typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, spinlock_t *lock, void *cb_arg); diff --git a/mm/list_lru.c b/mm/list_lru.c index 158781d1d3c2..61f3b6b1134f 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -162,20 +162,6 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, } EXPORT_SYMBOL_GPL(list_lru_isolate_move); -void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg) -{ - struct list_lru_one *list = - list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); - - if (list_empty(item)) { - list_add_tail(item, &list->list); - if (!list->nr_items++) - set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); - } -} -EXPORT_SYMBOL_GPL(list_lru_putback); - unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg) { diff --git a/mm/zswap.c b/mm/zswap.c index ddc8f930d343..2d7f594e6d07 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -410,7 +410,7 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The * new entry will be added directly to memcg's parent's list_lru. * - * Similar reasoning holds for list_lru_del() and list_lru_putback(). + * Similar reasoning holds for list_lru_del(). */ rcu_read_lock(); memcg = mem_cgroup_from_entry(entry); From 42398be2adb12096500e61f9585f05d0a479bf57 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:37 -0500 Subject: [PATCH 062/435] mm: zswap: rename zswap_free_entry to zswap_entry_free There is a zswap_entry_ namespace with multiple functions already. Link: https://lkml.kernel.org/r/20240130014208.565554-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 2d7f594e6d07..5cb79cb497c0 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -519,7 +519,7 @@ static struct zpool *zswap_find_zpool(struct zswap_entry *entry) * Carries out the common pattern of freeing and entry's zpool allocation, * freeing the entry itself, and decrementing the number of stored pages. */ -static void zswap_free_entry(struct zswap_entry *entry) +static void zswap_entry_free(struct zswap_entry *entry) { if (!entry->length) atomic_dec(&zswap_same_filled_pages); @@ -554,7 +554,7 @@ static void zswap_entry_put(struct zswap_entry *entry) WARN_ON_ONCE(refcount < 0); if (refcount == 0) { WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); - zswap_free_entry(entry); + zswap_entry_free(entry); } } From 5b297f70bb26b9041d5badd0d51d9227e25fc1eb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:38 -0500 Subject: [PATCH 063/435] mm: zswap: inline and remove zswap_entry_find_get() There is only one caller and the function is trivial. Inline it. Link: https://lkml.kernel.org/r/20240130014208.565554-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 5cb79cb497c0..3df8b6329cf5 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -558,19 +558,6 @@ static void zswap_entry_put(struct zswap_entry *entry) } } -/* caller must hold the tree lock */ -static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, - pgoff_t offset) -{ - struct zswap_entry *entry; - - entry = zswap_rb_search(root, offset); - if (entry) - zswap_entry_get(entry); - - return entry; -} - /********************************* * shrinker functions **********************************/ @@ -1708,13 +1695,13 @@ bool zswap_load(struct folio *folio) VM_WARN_ON_ONCE(!folio_test_locked(folio)); - /* find */ spin_lock(&tree->lock); - entry = zswap_entry_find_get(&tree->rbroot, offset); + entry = zswap_rb_search(&tree->rbroot, offset); if (!entry) { spin_unlock(&tree->lock); return false; } + zswap_entry_get(entry); spin_unlock(&tree->lock); if (entry->length) From 7dd1f7f0fc1c98525cbadf954eb6b7672ad4acea Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:39 -0500 Subject: [PATCH 064/435] mm: zswap: move zswap_invalidate_entry() to related functions Move it up to the other tree and refcounting functions. Link: https://lkml.kernel.org/r/20240130014208.565554-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 3df8b6329cf5..0dfd410d1b3c 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -558,6 +558,18 @@ static void zswap_entry_put(struct zswap_entry *entry) } } +/* + * If the entry is still valid in the tree, drop the initial ref and remove it + * from the tree. This function must be called with an additional ref held, + * otherwise it may race with another invalidation freeing the entry. + */ +static void zswap_invalidate_entry(struct zswap_tree *tree, + struct zswap_entry *entry) +{ + if (zswap_rb_erase(&tree->rbroot, entry)) + zswap_entry_put(entry); +} + /********************************* * shrinker functions **********************************/ @@ -808,18 +820,6 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) return NULL; } -/* - * If the entry is still valid in the tree, drop the initial ref and remove it - * from the tree. This function must be called with an additional ref held, - * otherwise it may race with another invalidation freeing the entry. - */ -static void zswap_invalidate_entry(struct zswap_tree *tree, - struct zswap_entry *entry) -{ - if (zswap_rb_erase(&tree->rbroot, entry)) - zswap_entry_put(entry); -} - static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { From e477559ca602a033e3970ebdbbfb39773345d694 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:40 -0500 Subject: [PATCH 065/435] mm: zswap: warn when referencing a dead entry Put a standard sanity check on zswap_entry_get() for UAF scenario. Link: https://lkml.kernel.org/r/20240130014208.565554-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/zswap.c b/mm/zswap.c index 0dfd410d1b3c..70e409add32b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -541,6 +541,7 @@ static void zswap_entry_free(struct zswap_entry *entry) /* caller must hold the tree lock */ static void zswap_entry_get(struct zswap_entry *entry) { + WARN_ON_ONCE(!entry->refcount); entry->refcount++; } From dab7711fac6dd3c45471bfb3b6b5336e8fa900a2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:41 -0500 Subject: [PATCH 066/435] mm: zswap: clean up zswap_entry_put() Remove stale comment and unnecessary local variable. Link: https://lkml.kernel.org/r/20240130014208.565554-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Yosry Ahmed Reviewed-by: Nhat Pham Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 70e409add32b..32bcc291397f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -545,15 +545,11 @@ static void zswap_entry_get(struct zswap_entry *entry) entry->refcount++; } -/* caller must hold the tree lock -* remove from the tree and free it, if nobody reference the entry -*/ +/* caller must hold the tree lock */ static void zswap_entry_put(struct zswap_entry *entry) { - int refcount = --entry->refcount; - - WARN_ON_ONCE(refcount < 0); - if (refcount == 0) { + WARN_ON_ONCE(!entry->refcount); + if (--entry->refcount == 0) { WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); zswap_entry_free(entry); } From ff2972aa1b5d527d982af3925258be78d26c6625 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:42 -0500 Subject: [PATCH 067/435] mm: zswap: rename __zswap_load() to zswap_decompress() Link: https://lkml.kernel.org/r/20240130014208.565554-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 32bcc291397f..4a0849bf893d 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1315,7 +1315,7 @@ static int zswap_enabled_param_set(const char *val, return ret; } -static void __zswap_load(struct zswap_entry *entry, struct page *page) +static void zswap_decompress(struct zswap_entry *entry, struct page *page) { struct zpool *zpool = zswap_find_zpool(entry); struct scatterlist input, output; @@ -1410,7 +1410,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, zswap_entry_get(entry); spin_unlock(&tree->lock); - __zswap_load(entry, &folio->page); + zswap_decompress(entry, &folio->page); count_vm_event(ZSWPWB); if (entry->objcg) @@ -1702,7 +1702,7 @@ bool zswap_load(struct folio *folio) spin_unlock(&tree->lock); if (entry->length) - __zswap_load(entry, page); + zswap_decompress(entry, page); else { dst = kmap_local_page(page); zswap_fill_page(dst, entry->value); From fa9ad6e21003691905f6caa307f9e5bf27ac1c23 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:43 -0500 Subject: [PATCH 068/435] mm: zswap: break out zwap_compress() zswap_store() is long and mixes work at the zswap layer with work at the backend and compression layer. Move compression & backend work to zswap_compress(), mirroring zswap_decompress(). Link: https://lkml.kernel.org/r/20240130014208.565554-8-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 145 ++++++++++++++++++++++++++++------------------------- 1 file changed, 77 insertions(+), 68 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 4a0849bf893d..82f788a93a33 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1315,6 +1315,79 @@ static int zswap_enabled_param_set(const char *val, return ret; } +static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) +{ + struct crypto_acomp_ctx *acomp_ctx; + struct scatterlist input, output; + unsigned int dlen = PAGE_SIZE; + unsigned long handle; + struct zpool *zpool; + char *buf; + gfp_t gfp; + int ret; + u8 *dst; + + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + + mutex_lock(&acomp_ctx->mutex); + + dst = acomp_ctx->buffer; + sg_init_table(&input, 1); + sg_set_page(&input, &folio->page, PAGE_SIZE, 0); + + /* + * We need PAGE_SIZE * 2 here since there maybe over-compression case, + * and hardware-accelerators may won't check the dst buffer size, so + * giving the dst buffer with enough length to avoid buffer overflow. + */ + sg_init_one(&output, dst, PAGE_SIZE * 2); + acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); + + /* + * it maybe looks a little bit silly that we send an asynchronous request, + * then wait for its completion synchronously. This makes the process look + * synchronous in fact. + * Theoretically, acomp supports users send multiple acomp requests in one + * acomp instance, then get those requests done simultaneously. but in this + * case, zswap actually does store and load page by page, there is no + * existing method to send the second page before the first page is done + * in one thread doing zwap. + * but in different threads running on different cpu, we have different + * acomp instance, so multiple threads can do (de)compression in parallel. + */ + ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; + if (ret) { + zswap_reject_compress_fail++; + goto unlock; + } + + zpool = zswap_find_zpool(entry); + gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + if (zpool_malloc_support_movable(zpool)) + gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; + ret = zpool_malloc(zpool, dlen, gfp, &handle); + if (ret == -ENOSPC) { + zswap_reject_compress_poor++; + goto unlock; + } + if (ret) { + zswap_reject_alloc_fail++; + goto unlock; + } + + buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); + memcpy(buf, dst, dlen); + zpool_unmap_handle(zpool, handle); + + entry->handle = handle; + entry->length = dlen; + +unlock: + mutex_unlock(&acomp_ctx->mutex); + return ret == 0; +} + static void zswap_decompress(struct zswap_entry *entry, struct page *page) { struct zpool *zpool = zswap_find_zpool(entry); @@ -1471,18 +1544,11 @@ bool zswap_store(struct folio *folio) struct page *page = &folio->page; struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry, *dupentry; - struct scatterlist input, output; - struct crypto_acomp_ctx *acomp_ctx; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; struct zswap_pool *pool; - struct zpool *zpool; - unsigned int dlen = PAGE_SIZE; - unsigned long handle, value; - char *buf; - u8 *src, *dst; - gfp_t gfp; - int ret; + unsigned long value; + u8 *src; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1568,65 +1634,10 @@ bool zswap_store(struct folio *folio) mem_cgroup_put(memcg); } - /* compress */ - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + if (!zswap_compress(folio, entry)) + goto put_pool; - mutex_lock(&acomp_ctx->mutex); - - dst = acomp_ctx->buffer; - sg_init_table(&input, 1); - sg_set_page(&input, &folio->page, PAGE_SIZE, 0); - - /* - * We need PAGE_SIZE * 2 here since there maybe over-compression case, - * and hardware-accelerators may won't check the dst buffer size, so - * giving the dst buffer with enough length to avoid buffer overflow. - */ - sg_init_one(&output, dst, PAGE_SIZE * 2); - acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); - /* - * it maybe looks a little bit silly that we send an asynchronous request, - * then wait for its completion synchronously. This makes the process look - * synchronous in fact. - * Theoretically, acomp supports users send multiple acomp requests in one - * acomp instance, then get those requests done simultaneously. but in this - * case, zswap actually does store and load page by page, there is no - * existing method to send the second page before the first page is done - * in one thread doing zwap. - * but in different threads running on different cpu, we have different - * acomp instance, so multiple threads can do (de)compression in parallel. - */ - ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); - dlen = acomp_ctx->req->dlen; - - if (ret) { - zswap_reject_compress_fail++; - goto put_dstmem; - } - - /* store */ - zpool = zswap_find_zpool(entry); - gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - if (zpool_malloc_support_movable(zpool)) - gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - ret = zpool_malloc(zpool, dlen, gfp, &handle); - if (ret == -ENOSPC) { - zswap_reject_compress_poor++; - goto put_dstmem; - } - if (ret) { - zswap_reject_alloc_fail++; - goto put_dstmem; - } - buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); - memcpy(buf, dst, dlen); - zpool_unmap_handle(zpool, handle); - mutex_unlock(&acomp_ctx->mutex); - - /* populate entry */ entry->swpentry = swp; - entry->handle = handle; - entry->length = dlen; insert_entry: entry->objcg = objcg; @@ -1663,8 +1674,6 @@ insert_entry: return true; -put_dstmem: - mutex_unlock(&acomp_ctx->mutex); put_pool: zswap_pool_put(entry->pool); freepage: From be7fc97c52835b931682d79121530773b67e8307 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:44 -0500 Subject: [PATCH 069/435] mm: zswap: further cleanup zswap_store() - Remove dupentry, reusing entry works just fine. - Rename pool to shrink_pool, as this one actually is confusing. - Remove page, use folio_nid() and kmap_local_folio() directly. - Set entry->swpentry in a common path. - Move value and src to local scope of use. Link: https://lkml.kernel.org/r/20240130014208.565554-9-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 82f788a93a33..1a86659e1173 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1541,14 +1541,11 @@ bool zswap_store(struct folio *folio) { swp_entry_t swp = folio->swap; pgoff_t offset = swp_offset(swp); - struct page *page = &folio->page; struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry, *dupentry; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; - struct zswap_pool *pool; - unsigned long value; - u8 *src; + struct zswap_pool *shrink_pool; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1563,10 +1560,10 @@ bool zswap_store(struct folio *folio) * the tree, and it might be written back overriding the new data. */ spin_lock(&tree->lock); - dupentry = zswap_rb_search(&tree->rbroot, offset); - if (dupentry) { + entry = zswap_rb_search(&tree->rbroot, offset); + if (entry) { + zswap_invalidate_entry(tree, entry); zswap_duplicate_entry++; - zswap_invalidate_entry(tree, dupentry); } spin_unlock(&tree->lock); @@ -1598,17 +1595,19 @@ bool zswap_store(struct folio *folio) } /* allocate entry */ - entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); + entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio)); if (!entry) { zswap_reject_kmemcache_fail++; goto reject; } if (zswap_same_filled_pages_enabled) { - src = kmap_local_page(page); + unsigned long value; + u8 *src; + + src = kmap_local_folio(folio, 0); if (zswap_is_page_same_filled(src, &value)) { kunmap_local(src); - entry->swpentry = swp; entry->length = 0; entry->value = value; atomic_inc(&zswap_same_filled_pages); @@ -1637,9 +1636,8 @@ bool zswap_store(struct folio *folio) if (!zswap_compress(folio, entry)) goto put_pool; - entry->swpentry = swp; - insert_entry: + entry->swpentry = swp; entry->objcg = objcg; if (objcg) { obj_cgroup_charge_zswap(objcg, entry->length); @@ -1684,9 +1682,9 @@ reject: return false; shrink: - pool = zswap_pool_last_get(); - if (pool && !queue_work(shrink_wq, &pool->shrink_work)) - zswap_pool_put(pool); + shrink_pool = zswap_pool_last_get(); + if (shrink_pool && !queue_work(shrink_wq, &shrink_pool->shrink_work)) + zswap_pool_put(shrink_pool); goto reject; } From 06ed22890cf9749b0e45d59c5cdc6e46dbd7339d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:45 -0500 Subject: [PATCH 070/435] mm: zswap: simplify zswap_invalidate() The branching is awkward and duplicates code. The comment about writeback is also misleading: yes, the entry might have been written back. Or it might have never been stored in zswap to begin with due to a rejection - zswap_invalidate() is called on all exiting swap entries. Link: https://lkml.kernel.org/r/20240130014208.565554-10-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 1a86659e1173..732b0a701b77 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1739,15 +1739,10 @@ void zswap_invalidate(int type, pgoff_t offset) struct zswap_tree *tree = swap_zswap_tree(swp_entry(type, offset)); struct zswap_entry *entry; - /* find */ spin_lock(&tree->lock); entry = zswap_rb_search(&tree->rbroot, offset); - if (!entry) { - /* entry was written back */ - spin_unlock(&tree->lock); - return; - } - zswap_invalidate_entry(tree, entry); + if (entry) + zswap_invalidate_entry(tree, entry); spin_unlock(&tree->lock); } From a984649b5c1f34103638a960b255bd695c2a9445 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:46 -0500 Subject: [PATCH 071/435] mm: zswap: function ordering: pool alloc & free The function ordering in zswap.c is a little chaotic, which requires jumping in unexpected directions when following related code. This is a series of patches that brings the file into the following order: - pool functions - lru functions - rbtree functions - zswap entry functions - compression/backend functions - writeback & shrinking functions - store, load, invalidate, swapon, swapoff - debugfs - init But it has to be split up such the moving still produces halfway readable diffs. In this patch, move pool allocation and freeing functions. Link: https://lkml.kernel.org/r/20240130014208.565554-11-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 297 +++++++++++++++++++++++++++-------------------------- 1 file changed, 152 insertions(+), 145 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 732b0a701b77..5947eeb8d799 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -320,6 +320,158 @@ static void zswap_update_total_size(void) zswap_pool_total_size = total; } +/********************************* +* pool functions +**********************************/ + +static void zswap_alloc_shrinker(struct zswap_pool *pool); +static void shrink_worker(struct work_struct *w); + +static struct zswap_pool *zswap_pool_create(char *type, char *compressor) +{ + int i; + struct zswap_pool *pool; + char name[38]; /* 'zswap' + 32 char (max) num + \0 */ + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + int ret; + + if (!zswap_has_pool) { + /* if either are unset, pool initialization failed, and we + * need both params to be set correctly before trying to + * create a pool. + */ + if (!strcmp(type, ZSWAP_PARAM_UNSET)) + return NULL; + if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) + return NULL; + } + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { + /* unique name for each pool specifically required by zsmalloc */ + snprintf(name, 38, "zswap%x", + atomic_inc_return(&zswap_pools_count)); + + pool->zpools[i] = zpool_create_pool(type, name, gfp); + if (!pool->zpools[i]) { + pr_err("%s zpool not available\n", type); + goto error; + } + } + pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); + + strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); + + pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); + if (!pool->acomp_ctx) { + pr_err("percpu alloc failed\n"); + goto error; + } + + ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, + &pool->node); + if (ret) + goto error; + + zswap_alloc_shrinker(pool); + if (!pool->shrinker) + goto error; + + pr_debug("using %s compressor\n", pool->tfm_name); + + /* being the current pool takes 1 ref; this func expects the + * caller to always add the new pool as the current pool + */ + kref_init(&pool->kref); + INIT_LIST_HEAD(&pool->list); + if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) + goto lru_fail; + shrinker_register(pool->shrinker); + INIT_WORK(&pool->shrink_work, shrink_worker); + atomic_set(&pool->nr_stored, 0); + + zswap_pool_debug("created", pool); + + return pool; + +lru_fail: + list_lru_destroy(&pool->list_lru); + shrinker_free(pool->shrinker); +error: + if (pool->acomp_ctx) + free_percpu(pool->acomp_ctx); + while (i--) + zpool_destroy_pool(pool->zpools[i]); + kfree(pool); + return NULL; +} + +static struct zswap_pool *__zswap_pool_create_fallback(void) +{ + bool has_comp, has_zpool; + + has_comp = crypto_has_acomp(zswap_compressor, 0, 0); + if (!has_comp && strcmp(zswap_compressor, + CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { + pr_err("compressor %s not available, using default %s\n", + zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); + param_free_charp(&zswap_compressor); + zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; + has_comp = crypto_has_acomp(zswap_compressor, 0, 0); + } + if (!has_comp) { + pr_err("default compressor %s not available\n", + zswap_compressor); + param_free_charp(&zswap_compressor); + zswap_compressor = ZSWAP_PARAM_UNSET; + } + + has_zpool = zpool_has_pool(zswap_zpool_type); + if (!has_zpool && strcmp(zswap_zpool_type, + CONFIG_ZSWAP_ZPOOL_DEFAULT)) { + pr_err("zpool %s not available, using default %s\n", + zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); + param_free_charp(&zswap_zpool_type); + zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; + has_zpool = zpool_has_pool(zswap_zpool_type); + } + if (!has_zpool) { + pr_err("default zpool %s not available\n", + zswap_zpool_type); + param_free_charp(&zswap_zpool_type); + zswap_zpool_type = ZSWAP_PARAM_UNSET; + } + + if (!has_comp || !has_zpool) + return NULL; + + return zswap_pool_create(zswap_zpool_type, zswap_compressor); +} + +static void zswap_pool_destroy(struct zswap_pool *pool) +{ + int i; + + zswap_pool_debug("destroying", pool); + + shrinker_free(pool->shrinker); + cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); + free_percpu(pool->acomp_ctx); + list_lru_destroy(&pool->list_lru); + + spin_lock(&zswap_pools_lock); + mem_cgroup_iter_break(NULL, pool->next_shrink); + pool->next_shrink = NULL; + spin_unlock(&zswap_pools_lock); + + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) + zpool_destroy_pool(pool->zpools[i]); + kfree(pool); +} + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -969,151 +1121,6 @@ resched: zswap_pool_put(pool); } -static struct zswap_pool *zswap_pool_create(char *type, char *compressor) -{ - int i; - struct zswap_pool *pool; - char name[38]; /* 'zswap' + 32 char (max) num + \0 */ - gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - int ret; - - if (!zswap_has_pool) { - /* if either are unset, pool initialization failed, and we - * need both params to be set correctly before trying to - * create a pool. - */ - if (!strcmp(type, ZSWAP_PARAM_UNSET)) - return NULL; - if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) - return NULL; - } - - pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool) - return NULL; - - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { - /* unique name for each pool specifically required by zsmalloc */ - snprintf(name, 38, "zswap%x", - atomic_inc_return(&zswap_pools_count)); - - pool->zpools[i] = zpool_create_pool(type, name, gfp); - if (!pool->zpools[i]) { - pr_err("%s zpool not available\n", type); - goto error; - } - } - pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); - - strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); - - pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); - if (!pool->acomp_ctx) { - pr_err("percpu alloc failed\n"); - goto error; - } - - ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, - &pool->node); - if (ret) - goto error; - - zswap_alloc_shrinker(pool); - if (!pool->shrinker) - goto error; - - pr_debug("using %s compressor\n", pool->tfm_name); - - /* being the current pool takes 1 ref; this func expects the - * caller to always add the new pool as the current pool - */ - kref_init(&pool->kref); - INIT_LIST_HEAD(&pool->list); - if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) - goto lru_fail; - shrinker_register(pool->shrinker); - INIT_WORK(&pool->shrink_work, shrink_worker); - atomic_set(&pool->nr_stored, 0); - - zswap_pool_debug("created", pool); - - return pool; - -lru_fail: - list_lru_destroy(&pool->list_lru); - shrinker_free(pool->shrinker); -error: - if (pool->acomp_ctx) - free_percpu(pool->acomp_ctx); - while (i--) - zpool_destroy_pool(pool->zpools[i]); - kfree(pool); - return NULL; -} - -static struct zswap_pool *__zswap_pool_create_fallback(void) -{ - bool has_comp, has_zpool; - - has_comp = crypto_has_acomp(zswap_compressor, 0, 0); - if (!has_comp && strcmp(zswap_compressor, - CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { - pr_err("compressor %s not available, using default %s\n", - zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); - param_free_charp(&zswap_compressor); - zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; - has_comp = crypto_has_acomp(zswap_compressor, 0, 0); - } - if (!has_comp) { - pr_err("default compressor %s not available\n", - zswap_compressor); - param_free_charp(&zswap_compressor); - zswap_compressor = ZSWAP_PARAM_UNSET; - } - - has_zpool = zpool_has_pool(zswap_zpool_type); - if (!has_zpool && strcmp(zswap_zpool_type, - CONFIG_ZSWAP_ZPOOL_DEFAULT)) { - pr_err("zpool %s not available, using default %s\n", - zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); - param_free_charp(&zswap_zpool_type); - zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; - has_zpool = zpool_has_pool(zswap_zpool_type); - } - if (!has_zpool) { - pr_err("default zpool %s not available\n", - zswap_zpool_type); - param_free_charp(&zswap_zpool_type); - zswap_zpool_type = ZSWAP_PARAM_UNSET; - } - - if (!has_comp || !has_zpool) - return NULL; - - return zswap_pool_create(zswap_zpool_type, zswap_compressor); -} - -static void zswap_pool_destroy(struct zswap_pool *pool) -{ - int i; - - zswap_pool_debug("destroying", pool); - - shrinker_free(pool->shrinker); - cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); - free_percpu(pool->acomp_ctx); - list_lru_destroy(&pool->list_lru); - - spin_lock(&zswap_pools_lock); - mem_cgroup_iter_break(NULL, pool->next_shrink); - pool->next_shrink = NULL; - spin_unlock(&zswap_pools_lock); - - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - zpool_destroy_pool(pool->zpools[i]); - kfree(pool); -} - static int __must_check zswap_pool_get(struct zswap_pool *pool) { if (!pool) From 39f3ec8eaa6065873e592857627f18ae17b6c878 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:47 -0500 Subject: [PATCH 072/435] mm: zswap: function ordering: pool refcounting Move pool refcounting functions into the pool section. First the destroy functions, then the get and put which uses them. __zswap_pool_empty() has an upward reference to the global zswap_pools, to sanity check it's not the currently active pool that's being freed. That gets the forward decl for zswap_pool_current(). This puts the get and put function above all callers, so kill the forward decls as well. Link: https://lkml.kernel.org/r/20240130014208.565554-12-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 94 +++++++++++++++++++++++++++--------------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 5947eeb8d799..6f4e1c22a0de 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -278,8 +278,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) static int zswap_writeback_entry(struct zswap_entry *entry, swp_entry_t swpentry); -static int zswap_pool_get(struct zswap_pool *pool); -static void zswap_pool_put(struct zswap_pool *pool); static bool zswap_is_full(void) { @@ -472,6 +470,53 @@ static void zswap_pool_destroy(struct zswap_pool *pool) kfree(pool); } +static void __zswap_pool_release(struct work_struct *work) +{ + struct zswap_pool *pool = container_of(work, typeof(*pool), + release_work); + + synchronize_rcu(); + + /* nobody should have been able to get a kref... */ + WARN_ON(kref_get_unless_zero(&pool->kref)); + + /* pool is now off zswap_pools list and has no references. */ + zswap_pool_destroy(pool); +} + +static struct zswap_pool *zswap_pool_current(void); + +static void __zswap_pool_empty(struct kref *kref) +{ + struct zswap_pool *pool; + + pool = container_of(kref, typeof(*pool), kref); + + spin_lock(&zswap_pools_lock); + + WARN_ON(pool == zswap_pool_current()); + + list_del_rcu(&pool->list); + + INIT_WORK(&pool->release_work, __zswap_pool_release); + schedule_work(&pool->release_work); + + spin_unlock(&zswap_pools_lock); +} + +static int __must_check zswap_pool_get(struct zswap_pool *pool) +{ + if (!pool) + return 0; + + return kref_get_unless_zero(&pool->kref); +} + +static void zswap_pool_put(struct zswap_pool *pool) +{ + kref_put(&pool->kref, __zswap_pool_empty); +} + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -1121,51 +1166,6 @@ resched: zswap_pool_put(pool); } -static int __must_check zswap_pool_get(struct zswap_pool *pool) -{ - if (!pool) - return 0; - - return kref_get_unless_zero(&pool->kref); -} - -static void __zswap_pool_release(struct work_struct *work) -{ - struct zswap_pool *pool = container_of(work, typeof(*pool), - release_work); - - synchronize_rcu(); - - /* nobody should have been able to get a kref... */ - WARN_ON(kref_get_unless_zero(&pool->kref)); - - /* pool is now off zswap_pools list and has no references. */ - zswap_pool_destroy(pool); -} - -static void __zswap_pool_empty(struct kref *kref) -{ - struct zswap_pool *pool; - - pool = container_of(kref, typeof(*pool), kref); - - spin_lock(&zswap_pools_lock); - - WARN_ON(pool == zswap_pool_current()); - - list_del_rcu(&pool->list); - - INIT_WORK(&pool->release_work, __zswap_pool_release); - schedule_work(&pool->release_work); - - spin_unlock(&zswap_pools_lock); -} - -static void zswap_pool_put(struct zswap_pool *pool) -{ - kref_put(&pool->kref, __zswap_pool_empty); -} - /********************************* * param callbacks **********************************/ From c1a0ecb82bdc6e9e972b0adbd74eb56ff33776f9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:48 -0500 Subject: [PATCH 073/435] mm: zswap: function ordering: zswap_pools Move the operations against the global zswap_pools list (current pool, last, find) to the pool section. Link: https://lkml.kernel.org/r/20240130014208.565554-13-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 150 ++++++++++++++++++++++++++--------------------------- 1 file changed, 73 insertions(+), 77 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 6f4e1c22a0de..77104406649b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -517,6 +517,79 @@ static void zswap_pool_put(struct zswap_pool *pool) kref_put(&pool->kref, __zswap_pool_empty); } +static struct zswap_pool *__zswap_pool_current(void) +{ + struct zswap_pool *pool; + + pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); + WARN_ONCE(!pool && zswap_has_pool, + "%s: no page storage pool!\n", __func__); + + return pool; +} + +static struct zswap_pool *zswap_pool_current(void) +{ + assert_spin_locked(&zswap_pools_lock); + + return __zswap_pool_current(); +} + +static struct zswap_pool *zswap_pool_current_get(void) +{ + struct zswap_pool *pool; + + rcu_read_lock(); + + pool = __zswap_pool_current(); + if (!zswap_pool_get(pool)) + pool = NULL; + + rcu_read_unlock(); + + return pool; +} + +static struct zswap_pool *zswap_pool_last_get(void) +{ + struct zswap_pool *pool, *last = NULL; + + rcu_read_lock(); + + list_for_each_entry_rcu(pool, &zswap_pools, list) + last = pool; + WARN_ONCE(!last && zswap_has_pool, + "%s: no page storage pool!\n", __func__); + if (!zswap_pool_get(last)) + last = NULL; + + rcu_read_unlock(); + + return last; +} + +/* type and compressor must be null-terminated */ +static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) +{ + struct zswap_pool *pool; + + assert_spin_locked(&zswap_pools_lock); + + list_for_each_entry_rcu(pool, &zswap_pools, list) { + if (strcmp(pool->tfm_name, compressor)) + continue; + /* all zpools share the same type */ + if (strcmp(zpool_get_type(pool->zpools[0]), type)) + continue; + /* if we can't get it, it's about to be destroyed */ + if (!zswap_pool_get(pool)) + continue; + return pool; + } + + return NULL; +} + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -937,83 +1010,6 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) return 0; } -/********************************* -* pool functions -**********************************/ - -static struct zswap_pool *__zswap_pool_current(void) -{ - struct zswap_pool *pool; - - pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); - WARN_ONCE(!pool && zswap_has_pool, - "%s: no page storage pool!\n", __func__); - - return pool; -} - -static struct zswap_pool *zswap_pool_current(void) -{ - assert_spin_locked(&zswap_pools_lock); - - return __zswap_pool_current(); -} - -static struct zswap_pool *zswap_pool_current_get(void) -{ - struct zswap_pool *pool; - - rcu_read_lock(); - - pool = __zswap_pool_current(); - if (!zswap_pool_get(pool)) - pool = NULL; - - rcu_read_unlock(); - - return pool; -} - -static struct zswap_pool *zswap_pool_last_get(void) -{ - struct zswap_pool *pool, *last = NULL; - - rcu_read_lock(); - - list_for_each_entry_rcu(pool, &zswap_pools, list) - last = pool; - WARN_ONCE(!last && zswap_has_pool, - "%s: no page storage pool!\n", __func__); - if (!zswap_pool_get(last)) - last = NULL; - - rcu_read_unlock(); - - return last; -} - -/* type and compressor must be null-terminated */ -static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) -{ - struct zswap_pool *pool; - - assert_spin_locked(&zswap_pools_lock); - - list_for_each_entry_rcu(pool, &zswap_pools, list) { - if (strcmp(pool->tfm_name, compressor)) - continue; - /* all zpools share the same type */ - if (strcmp(zpool_get_type(pool->zpools[0]), type)) - continue; - /* if we can't get it, it's about to be destroyed */ - if (!zswap_pool_get(pool)) - continue; - return pool; - } - - return NULL; -} - static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { From abca07c04aa56f9eb41056f4d25aaf029afb00a3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:49 -0500 Subject: [PATCH 074/435] mm: zswap: function ordering: pool params Patch series "mm: zswap: cleanups". Cleanups and maintenance items that accumulated while reviewing zswap patches. This patch (of 20): The parameters primarily control pool attributes. Move those operations up to the pool section. Link: https://lkml.kernel.org/r/20240130014208.565554-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20240130014208.565554-14-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 312 ++++++++++++++++++++++++++--------------------------- 1 file changed, 156 insertions(+), 156 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 77104406649b..98a9cd0a827a 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -590,6 +590,162 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) return NULL; } +/********************************* +* param callbacks +**********************************/ + +static bool zswap_pool_changed(const char *s, const struct kernel_param *kp) +{ + /* no change required */ + if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) + return false; + return true; +} + +/* val must be a null-terminated string */ +static int __zswap_param_set(const char *val, const struct kernel_param *kp, + char *type, char *compressor) +{ + struct zswap_pool *pool, *put_pool = NULL; + char *s = strstrip((char *)val); + int ret = 0; + bool new_pool = false; + + mutex_lock(&zswap_init_lock); + switch (zswap_init_state) { + case ZSWAP_UNINIT: + /* if this is load-time (pre-init) param setting, + * don't create a pool; that's done during init. + */ + ret = param_set_charp(s, kp); + break; + case ZSWAP_INIT_SUCCEED: + new_pool = zswap_pool_changed(s, kp); + break; + case ZSWAP_INIT_FAILED: + pr_err("can't set param, initialization failed\n"); + ret = -ENODEV; + } + mutex_unlock(&zswap_init_lock); + + /* no need to create a new pool, return directly */ + if (!new_pool) + return ret; + + if (!type) { + if (!zpool_has_pool(s)) { + pr_err("zpool %s not available\n", s); + return -ENOENT; + } + type = s; + } else if (!compressor) { + if (!crypto_has_acomp(s, 0, 0)) { + pr_err("compressor %s not available\n", s); + return -ENOENT; + } + compressor = s; + } else { + WARN_ON(1); + return -EINVAL; + } + + spin_lock(&zswap_pools_lock); + + pool = zswap_pool_find_get(type, compressor); + if (pool) { + zswap_pool_debug("using existing", pool); + WARN_ON(pool == zswap_pool_current()); + list_del_rcu(&pool->list); + } + + spin_unlock(&zswap_pools_lock); + + if (!pool) + pool = zswap_pool_create(type, compressor); + + if (pool) + ret = param_set_charp(s, kp); + else + ret = -EINVAL; + + spin_lock(&zswap_pools_lock); + + if (!ret) { + put_pool = zswap_pool_current(); + list_add_rcu(&pool->list, &zswap_pools); + zswap_has_pool = true; + } else if (pool) { + /* add the possibly pre-existing pool to the end of the pools + * list; if it's new (and empty) then it'll be removed and + * destroyed by the put after we drop the lock + */ + list_add_tail_rcu(&pool->list, &zswap_pools); + put_pool = pool; + } + + spin_unlock(&zswap_pools_lock); + + if (!zswap_has_pool && !pool) { + /* if initial pool creation failed, and this pool creation also + * failed, maybe both compressor and zpool params were bad. + * Allow changing this param, so pool creation will succeed + * when the other param is changed. We already verified this + * param is ok in the zpool_has_pool() or crypto_has_acomp() + * checks above. + */ + ret = param_set_charp(s, kp); + } + + /* drop the ref from either the old current pool, + * or the new pool we failed to add + */ + if (put_pool) + zswap_pool_put(put_pool); + + return ret; +} + +static int zswap_compressor_param_set(const char *val, + const struct kernel_param *kp) +{ + return __zswap_param_set(val, kp, zswap_zpool_type, NULL); +} + +static int zswap_zpool_param_set(const char *val, + const struct kernel_param *kp) +{ + return __zswap_param_set(val, kp, NULL, zswap_compressor); +} + +static int zswap_enabled_param_set(const char *val, + const struct kernel_param *kp) +{ + int ret = -ENODEV; + + /* if this is load-time (pre-init) param setting, only set param. */ + if (system_state != SYSTEM_RUNNING) + return param_set_bool(val, kp); + + mutex_lock(&zswap_init_lock); + switch (zswap_init_state) { + case ZSWAP_UNINIT: + if (zswap_setup()) + break; + fallthrough; + case ZSWAP_INIT_SUCCEED: + if (!zswap_has_pool) + pr_err("can't enable, no pool configured\n"); + else + ret = param_set_bool(val, kp); + break; + case ZSWAP_INIT_FAILED: + pr_err("can't enable, initialization failed\n"); + } + mutex_unlock(&zswap_init_lock); + + return ret; +} + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -1162,162 +1318,6 @@ resched: zswap_pool_put(pool); } -/********************************* -* param callbacks -**********************************/ - -static bool zswap_pool_changed(const char *s, const struct kernel_param *kp) -{ - /* no change required */ - if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) - return false; - return true; -} - -/* val must be a null-terminated string */ -static int __zswap_param_set(const char *val, const struct kernel_param *kp, - char *type, char *compressor) -{ - struct zswap_pool *pool, *put_pool = NULL; - char *s = strstrip((char *)val); - int ret = 0; - bool new_pool = false; - - mutex_lock(&zswap_init_lock); - switch (zswap_init_state) { - case ZSWAP_UNINIT: - /* if this is load-time (pre-init) param setting, - * don't create a pool; that's done during init. - */ - ret = param_set_charp(s, kp); - break; - case ZSWAP_INIT_SUCCEED: - new_pool = zswap_pool_changed(s, kp); - break; - case ZSWAP_INIT_FAILED: - pr_err("can't set param, initialization failed\n"); - ret = -ENODEV; - } - mutex_unlock(&zswap_init_lock); - - /* no need to create a new pool, return directly */ - if (!new_pool) - return ret; - - if (!type) { - if (!zpool_has_pool(s)) { - pr_err("zpool %s not available\n", s); - return -ENOENT; - } - type = s; - } else if (!compressor) { - if (!crypto_has_acomp(s, 0, 0)) { - pr_err("compressor %s not available\n", s); - return -ENOENT; - } - compressor = s; - } else { - WARN_ON(1); - return -EINVAL; - } - - spin_lock(&zswap_pools_lock); - - pool = zswap_pool_find_get(type, compressor); - if (pool) { - zswap_pool_debug("using existing", pool); - WARN_ON(pool == zswap_pool_current()); - list_del_rcu(&pool->list); - } - - spin_unlock(&zswap_pools_lock); - - if (!pool) - pool = zswap_pool_create(type, compressor); - - if (pool) - ret = param_set_charp(s, kp); - else - ret = -EINVAL; - - spin_lock(&zswap_pools_lock); - - if (!ret) { - put_pool = zswap_pool_current(); - list_add_rcu(&pool->list, &zswap_pools); - zswap_has_pool = true; - } else if (pool) { - /* add the possibly pre-existing pool to the end of the pools - * list; if it's new (and empty) then it'll be removed and - * destroyed by the put after we drop the lock - */ - list_add_tail_rcu(&pool->list, &zswap_pools); - put_pool = pool; - } - - spin_unlock(&zswap_pools_lock); - - if (!zswap_has_pool && !pool) { - /* if initial pool creation failed, and this pool creation also - * failed, maybe both compressor and zpool params were bad. - * Allow changing this param, so pool creation will succeed - * when the other param is changed. We already verified this - * param is ok in the zpool_has_pool() or crypto_has_acomp() - * checks above. - */ - ret = param_set_charp(s, kp); - } - - /* drop the ref from either the old current pool, - * or the new pool we failed to add - */ - if (put_pool) - zswap_pool_put(put_pool); - - return ret; -} - -static int zswap_compressor_param_set(const char *val, - const struct kernel_param *kp) -{ - return __zswap_param_set(val, kp, zswap_zpool_type, NULL); -} - -static int zswap_zpool_param_set(const char *val, - const struct kernel_param *kp) -{ - return __zswap_param_set(val, kp, NULL, zswap_compressor); -} - -static int zswap_enabled_param_set(const char *val, - const struct kernel_param *kp) -{ - int ret = -ENODEV; - - /* if this is load-time (pre-init) param setting, only set param. */ - if (system_state != SYSTEM_RUNNING) - return param_set_bool(val, kp); - - mutex_lock(&zswap_init_lock); - switch (zswap_init_state) { - case ZSWAP_UNINIT: - if (zswap_setup()) - break; - fallthrough; - case ZSWAP_INIT_SUCCEED: - if (!zswap_has_pool) - pr_err("can't enable, no pool configured\n"); - else - ret = param_set_bool(val, kp); - break; - case ZSWAP_INIT_FAILED: - pr_err("can't enable, initialization failed\n"); - } - mutex_unlock(&zswap_init_lock); - - return ret; -} - static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) { struct crypto_acomp_ctx *acomp_ctx; From 506a86c5e2217cfb1884ea4806dc74c82d058733 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:50 -0500 Subject: [PATCH 075/435] mm: zswap: function ordering: public lru api The zswap entry section sits awkwardly in the middle of LRU-related functions. Group the external LRU API functions first. Link: https://lkml.kernel.org/r/20240130014208.565554-15-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 98a9cd0a827a..74b128c3a0ed 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -746,6 +746,10 @@ static int zswap_enabled_param_set(const char *val, return ret; } +/********************************* +* lru functions +**********************************/ + /* should be called under RCU */ #ifdef CONFIG_MEMCG static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) @@ -764,6 +768,21 @@ static inline int entry_to_nid(struct zswap_entry *entry) return page_to_nid(virt_to_page(entry)); } +void zswap_lruvec_state_init(struct lruvec *lruvec) +{ + atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); +} + +void zswap_folio_swapin(struct folio *folio) +{ + struct lruvec *lruvec; + + if (folio) { + lruvec = folio_lruvec(folio); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + } +} + void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) { struct zswap_pool *pool; @@ -798,23 +817,6 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) kmem_cache_free(zswap_entry_cache, entry); } -/********************************* -* zswap lruvec functions -**********************************/ -void zswap_lruvec_state_init(struct lruvec *lruvec) -{ - atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); -} - -void zswap_folio_swapin(struct folio *folio) -{ - struct lruvec *lruvec; - - VM_WARN_ON_ONCE(!folio_test_locked(folio)); - lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); -} - /********************************* * lru functions **********************************/ From 5182661a11baeda74a547660b11570f5b5124eaf Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:51 -0500 Subject: [PATCH 076/435] mm: zswap: function ordering: move entry sections out of LRU section This completes consolidation of the LRU section. Link: https://lkml.kernel.org/r/20240130014208.565554-16-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 101 ++++++++++++++++++++++++++--------------------------- 1 file changed, 49 insertions(+), 52 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 74b128c3a0ed..b8834c4fb7db 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -768,58 +768,6 @@ static inline int entry_to_nid(struct zswap_entry *entry) return page_to_nid(virt_to_page(entry)); } -void zswap_lruvec_state_init(struct lruvec *lruvec) -{ - atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); -} - -void zswap_folio_swapin(struct folio *folio) -{ - struct lruvec *lruvec; - - if (folio) { - lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); - } -} - -void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) -{ - struct zswap_pool *pool; - - /* lock out zswap pools list modification */ - spin_lock(&zswap_pools_lock); - list_for_each_entry(pool, &zswap_pools, list) { - if (pool->next_shrink == memcg) - pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); - } - spin_unlock(&zswap_pools_lock); -} - -/********************************* -* zswap entry functions -**********************************/ -static struct kmem_cache *zswap_entry_cache; - -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) -{ - struct zswap_entry *entry; - entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); - if (!entry) - return NULL; - entry->refcount = 1; - RB_CLEAR_NODE(&entry->rbnode); - return entry; -} - -static void zswap_entry_cache_free(struct zswap_entry *entry) -{ - kmem_cache_free(zswap_entry_cache, entry); -} - -/********************************* -* lru functions -**********************************/ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) { atomic_long_t *nr_zswap_protected; @@ -872,6 +820,55 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) rcu_read_unlock(); } +void zswap_lruvec_state_init(struct lruvec *lruvec) +{ + atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); +} + +void zswap_folio_swapin(struct folio *folio) +{ + struct lruvec *lruvec; + + if (folio) { + lruvec = folio_lruvec(folio); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + } +} + +void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) +{ + struct zswap_pool *pool; + + /* lock out zswap pools list modification */ + spin_lock(&zswap_pools_lock); + list_for_each_entry(pool, &zswap_pools, list) { + if (pool->next_shrink == memcg) + pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); + } + spin_unlock(&zswap_pools_lock); +} + +/********************************* +* zswap entry functions +**********************************/ +static struct kmem_cache *zswap_entry_cache; + +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) +{ + struct zswap_entry *entry; + entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); + if (!entry) + return NULL; + entry->refcount = 1; + RB_CLEAR_NODE(&entry->rbnode); + return entry; +} + +static void zswap_entry_cache_free(struct zswap_entry *entry) +{ + kmem_cache_free(zswap_entry_cache, entry); +} + /********************************* * rbtree functions **********************************/ From 36034bf6fcdb1281e3f8db27d68b6516a3aed4c7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:52 -0500 Subject: [PATCH 077/435] mm: zswap: function ordering: move entry section out of tree section The higher-level entry operations modify the tree, so move the entry API after the tree section. Link: https://lkml.kernel.org/r/20240130014208.565554-17-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index b8834c4fb7db..19d3482fd035 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -848,27 +848,6 @@ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) spin_unlock(&zswap_pools_lock); } -/********************************* -* zswap entry functions -**********************************/ -static struct kmem_cache *zswap_entry_cache; - -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) -{ - struct zswap_entry *entry; - entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); - if (!entry) - return NULL; - entry->refcount = 1; - RB_CLEAR_NODE(&entry->rbnode); - return entry; -} - -static void zswap_entry_cache_free(struct zswap_entry *entry) -{ - kmem_cache_free(zswap_entry_cache, entry); -} - /********************************* * rbtree functions **********************************/ @@ -930,6 +909,27 @@ static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) return false; } +/********************************* +* zswap entry functions +**********************************/ +static struct kmem_cache *zswap_entry_cache; + +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) +{ + struct zswap_entry *entry; + entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); + if (!entry) + return NULL; + entry->refcount = 1; + RB_CLEAR_NODE(&entry->rbnode); + return entry; +} + +static void zswap_entry_cache_free(struct zswap_entry *entry) +{ + kmem_cache_free(zswap_entry_cache, entry); +} + static struct zpool *zswap_find_zpool(struct zswap_entry *entry) { int i = 0; From f91e81d31c1ef7561559368f29e4bb6eddf13ea1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:53 -0500 Subject: [PATCH 078/435] mm: zswap: function ordering: compress & decompress functions Writeback needs to decompress. Move the (de)compression API above what will be the consolidated shrinking/writeback code. Link: https://lkml.kernel.org/r/20240130014208.565554-18-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 207 +++++++++++++++++++++++++++-------------------------- 1 file changed, 105 insertions(+), 102 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 19d3482fd035..6d1e5843c4dd 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -992,6 +992,111 @@ static void zswap_invalidate_entry(struct zswap_tree *tree, zswap_entry_put(entry); } +/********************************* +* compressed storage functions +**********************************/ +static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) +{ + struct crypto_acomp_ctx *acomp_ctx; + struct scatterlist input, output; + unsigned int dlen = PAGE_SIZE; + unsigned long handle; + struct zpool *zpool; + char *buf; + gfp_t gfp; + int ret; + u8 *dst; + + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + + mutex_lock(&acomp_ctx->mutex); + + dst = acomp_ctx->buffer; + sg_init_table(&input, 1); + sg_set_page(&input, &folio->page, PAGE_SIZE, 0); + + /* + * We need PAGE_SIZE * 2 here since there maybe over-compression case, + * and hardware-accelerators may won't check the dst buffer size, so + * giving the dst buffer with enough length to avoid buffer overflow. + */ + sg_init_one(&output, dst, PAGE_SIZE * 2); + acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); + + /* + * it maybe looks a little bit silly that we send an asynchronous request, + * then wait for its completion synchronously. This makes the process look + * synchronous in fact. + * Theoretically, acomp supports users send multiple acomp requests in one + * acomp instance, then get those requests done simultaneously. but in this + * case, zswap actually does store and load page by page, there is no + * existing method to send the second page before the first page is done + * in one thread doing zwap. + * but in different threads running on different cpu, we have different + * acomp instance, so multiple threads can do (de)compression in parallel. + */ + ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; + if (ret) { + zswap_reject_compress_fail++; + goto unlock; + } + + zpool = zswap_find_zpool(entry); + gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + if (zpool_malloc_support_movable(zpool)) + gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; + ret = zpool_malloc(zpool, dlen, gfp, &handle); + if (ret == -ENOSPC) { + zswap_reject_compress_poor++; + goto unlock; + } + if (ret) { + zswap_reject_alloc_fail++; + goto unlock; + } + + buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); + memcpy(buf, dst, dlen); + zpool_unmap_handle(zpool, handle); + + entry->handle = handle; + entry->length = dlen; + +unlock: + mutex_unlock(&acomp_ctx->mutex); + return ret == 0; +} + +static void zswap_decompress(struct zswap_entry *entry, struct page *page) +{ + struct zpool *zpool = zswap_find_zpool(entry); + struct scatterlist input, output; + struct crypto_acomp_ctx *acomp_ctx; + u8 *src; + + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + mutex_lock(&acomp_ctx->mutex); + + src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); + if (!zpool_can_sleep_mapped(zpool)) { + memcpy(acomp_ctx->buffer, src, entry->length); + src = acomp_ctx->buffer; + zpool_unmap_handle(zpool, entry->handle); + } + + sg_init_one(&input, src, entry->length); + sg_init_table(&output, 1); + sg_set_page(&output, page, PAGE_SIZE, 0); + acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); + BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); + BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); + mutex_unlock(&acomp_ctx->mutex); + + if (zpool_can_sleep_mapped(zpool)) + zpool_unmap_handle(zpool, entry->handle); +} + /********************************* * shrinker functions **********************************/ @@ -1317,108 +1422,6 @@ resched: zswap_pool_put(pool); } -static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) -{ - struct crypto_acomp_ctx *acomp_ctx; - struct scatterlist input, output; - unsigned int dlen = PAGE_SIZE; - unsigned long handle; - struct zpool *zpool; - char *buf; - gfp_t gfp; - int ret; - u8 *dst; - - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - - mutex_lock(&acomp_ctx->mutex); - - dst = acomp_ctx->buffer; - sg_init_table(&input, 1); - sg_set_page(&input, &folio->page, PAGE_SIZE, 0); - - /* - * We need PAGE_SIZE * 2 here since there maybe over-compression case, - * and hardware-accelerators may won't check the dst buffer size, so - * giving the dst buffer with enough length to avoid buffer overflow. - */ - sg_init_one(&output, dst, PAGE_SIZE * 2); - acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); - - /* - * it maybe looks a little bit silly that we send an asynchronous request, - * then wait for its completion synchronously. This makes the process look - * synchronous in fact. - * Theoretically, acomp supports users send multiple acomp requests in one - * acomp instance, then get those requests done simultaneously. but in this - * case, zswap actually does store and load page by page, there is no - * existing method to send the second page before the first page is done - * in one thread doing zwap. - * but in different threads running on different cpu, we have different - * acomp instance, so multiple threads can do (de)compression in parallel. - */ - ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); - dlen = acomp_ctx->req->dlen; - if (ret) { - zswap_reject_compress_fail++; - goto unlock; - } - - zpool = zswap_find_zpool(entry); - gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - if (zpool_malloc_support_movable(zpool)) - gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - ret = zpool_malloc(zpool, dlen, gfp, &handle); - if (ret == -ENOSPC) { - zswap_reject_compress_poor++; - goto unlock; - } - if (ret) { - zswap_reject_alloc_fail++; - goto unlock; - } - - buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); - memcpy(buf, dst, dlen); - zpool_unmap_handle(zpool, handle); - - entry->handle = handle; - entry->length = dlen; - -unlock: - mutex_unlock(&acomp_ctx->mutex); - return ret == 0; -} - -static void zswap_decompress(struct zswap_entry *entry, struct page *page) -{ - struct zpool *zpool = zswap_find_zpool(entry); - struct scatterlist input, output; - struct crypto_acomp_ctx *acomp_ctx; - u8 *src; - - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - mutex_lock(&acomp_ctx->mutex); - - src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); - if (!zpool_can_sleep_mapped(zpool)) { - memcpy(acomp_ctx->buffer, src, entry->length); - src = acomp_ctx->buffer; - zpool_unmap_handle(zpool, entry->handle); - } - - sg_init_one(&input, src, entry->length); - sg_init_table(&output, 1); - sg_set_page(&output, page, PAGE_SIZE, 0); - acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); - BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); - BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); - mutex_unlock(&acomp_ctx->mutex); - - if (zpool_can_sleep_mapped(zpool)) - zpool_unmap_handle(zpool, entry->handle); -} - /********************************* * writeback code **********************************/ From 64f200b8304c6b26fbbd7768365850315bd71bc0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:54 -0500 Subject: [PATCH 079/435] mm: zswap: function ordering: per-cpu compression infra The per-cpu compression init/exit callbacks are awkwardly in the middle of the shrinker code. Move them up to the compression section. Link: https://lkml.kernel.org/r/20240130014208.565554-19-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 135 ++++++++++++++++++++++++++--------------------------- 1 file changed, 66 insertions(+), 69 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 6d1e5843c4dd..680e5a4c1af4 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -995,6 +995,72 @@ static void zswap_invalidate_entry(struct zswap_tree *tree, /********************************* * compressed storage functions **********************************/ +static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) +{ + struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); + struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + struct crypto_acomp *acomp; + struct acomp_req *req; + int ret; + + mutex_init(&acomp_ctx->mutex); + + acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!acomp_ctx->buffer) + return -ENOMEM; + + acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); + if (IS_ERR(acomp)) { + pr_err("could not alloc crypto acomp %s : %ld\n", + pool->tfm_name, PTR_ERR(acomp)); + ret = PTR_ERR(acomp); + goto acomp_fail; + } + acomp_ctx->acomp = acomp; + + req = acomp_request_alloc(acomp_ctx->acomp); + if (!req) { + pr_err("could not alloc crypto acomp_request %s\n", + pool->tfm_name); + ret = -ENOMEM; + goto req_fail; + } + acomp_ctx->req = req; + + crypto_init_wait(&acomp_ctx->wait); + /* + * if the backend of acomp is async zip, crypto_req_done() will wakeup + * crypto_wait_req(); if the backend of acomp is scomp, the callback + * won't be called, crypto_wait_req() will return without blocking. + */ + acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &acomp_ctx->wait); + + return 0; + +req_fail: + crypto_free_acomp(acomp_ctx->acomp); +acomp_fail: + kfree(acomp_ctx->buffer); + return ret; +} + +static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) +{ + struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); + struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + + if (!IS_ERR_OR_NULL(acomp_ctx)) { + if (!IS_ERR_OR_NULL(acomp_ctx->req)) + acomp_request_free(acomp_ctx->req); + if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) + crypto_free_acomp(acomp_ctx->acomp); + kfree(acomp_ctx->buffer); + } + + return 0; +} + static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) { struct crypto_acomp_ctx *acomp_ctx; @@ -1201,75 +1267,6 @@ static void zswap_alloc_shrinker(struct zswap_pool *pool) pool->shrinker->seeks = DEFAULT_SEEKS; } -/********************************* -* per-cpu code -**********************************/ -static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) -{ - struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - struct crypto_acomp *acomp; - struct acomp_req *req; - int ret; - - mutex_init(&acomp_ctx->mutex); - - acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!acomp_ctx->buffer) - return -ENOMEM; - - acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); - if (IS_ERR(acomp)) { - pr_err("could not alloc crypto acomp %s : %ld\n", - pool->tfm_name, PTR_ERR(acomp)); - ret = PTR_ERR(acomp); - goto acomp_fail; - } - acomp_ctx->acomp = acomp; - - req = acomp_request_alloc(acomp_ctx->acomp); - if (!req) { - pr_err("could not alloc crypto acomp_request %s\n", - pool->tfm_name); - ret = -ENOMEM; - goto req_fail; - } - acomp_ctx->req = req; - - crypto_init_wait(&acomp_ctx->wait); - /* - * if the backend of acomp is async zip, crypto_req_done() will wakeup - * crypto_wait_req(); if the backend of acomp is scomp, the callback - * won't be called, crypto_wait_req() will return without blocking. - */ - acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &acomp_ctx->wait); - - return 0; - -req_fail: - crypto_free_acomp(acomp_ctx->acomp); -acomp_fail: - kfree(acomp_ctx->buffer); - return ret; -} - -static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) -{ - struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - - if (!IS_ERR_OR_NULL(acomp_ctx)) { - if (!IS_ERR_OR_NULL(acomp_ctx->req)) - acomp_request_free(acomp_ctx->req); - if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) - crypto_free_acomp(acomp_ctx->acomp); - kfree(acomp_ctx->buffer); - } - - return 0; -} - static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { From 9986d35d4ceb53962f1c3bc378e317b5ca841f73 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:55 -0500 Subject: [PATCH 080/435] mm: zswap: function ordering: writeback Shrinking needs writeback. Naturally, move the writeback code above the shrinking code. Delete the forward decl. Link: https://lkml.kernel.org/r/20240130014208.565554-20-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 183 ++++++++++++++++++++++++++--------------------------- 1 file changed, 90 insertions(+), 93 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 680e5a4c1af4..667ed3e19340 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -276,9 +276,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp) pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ zpool_get_type((p)->zpools[0])) -static int zswap_writeback_entry(struct zswap_entry *entry, - swp_entry_t swpentry); - static bool zswap_is_full(void) { return totalram_pages() * zswap_max_pool_percent / 100 < @@ -1163,6 +1160,96 @@ static void zswap_decompress(struct zswap_entry *entry, struct page *page) zpool_unmap_handle(zpool, entry->handle); } +/********************************* +* writeback code +**********************************/ +/* + * Attempts to free an entry by adding a folio to the swap cache, + * decompressing the entry data into the folio, and issuing a + * bio write to write the folio back to the swap device. + * + * This can be thought of as a "resumed writeback" of the folio + * to the swap device. We are basically resuming the same swap + * writeback path that was intercepted with the zswap_store() + * in the first place. After the folio has been decompressed into + * the swap cache, the compressed version stored by zswap can be + * freed. + */ +static int zswap_writeback_entry(struct zswap_entry *entry, + swp_entry_t swpentry) +{ + struct zswap_tree *tree; + struct folio *folio; + struct mempolicy *mpol; + bool folio_was_allocated; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + }; + + /* try to allocate swap cache folio */ + mpol = get_task_policy(current); + folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, + NO_INTERLEAVE_INDEX, &folio_was_allocated, true); + if (!folio) + return -ENOMEM; + + /* + * Found an existing folio, we raced with swapin or concurrent + * shrinker. We generally writeback cold folios from zswap, and + * swapin means the folio just became hot, so skip this folio. + * For unlikely concurrent shrinker case, it will be unlinked + * and freed when invalidated by the concurrent shrinker anyway. + */ + if (!folio_was_allocated) { + folio_put(folio); + return -EEXIST; + } + + /* + * folio is locked, and the swapcache is now secured against + * concurrent swapping to and from the slot. Verify that the + * swap entry hasn't been invalidated and recycled behind our + * backs (our zswap_entry reference doesn't prevent that), to + * avoid overwriting a new swap folio with old compressed data. + */ + tree = swap_zswap_tree(swpentry); + spin_lock(&tree->lock); + if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { + spin_unlock(&tree->lock); + delete_from_swap_cache(folio); + folio_unlock(folio); + folio_put(folio); + return -ENOMEM; + } + + /* Safe to deref entry after the entry is verified above. */ + zswap_entry_get(entry); + spin_unlock(&tree->lock); + + zswap_decompress(entry, &folio->page); + + count_vm_event(ZSWPWB); + if (entry->objcg) + count_objcg_event(entry->objcg, ZSWPWB); + + spin_lock(&tree->lock); + zswap_invalidate_entry(tree, entry); + zswap_entry_put(entry); + spin_unlock(&tree->lock); + + /* folio is up to date */ + folio_mark_uptodate(folio); + + /* move it to the tail of the inactive list after end_writeback */ + folio_set_reclaim(folio); + + /* start writeback */ + __swap_writepage(folio, &wbc); + folio_put(folio); + + return 0; +} + /********************************* * shrinker functions **********************************/ @@ -1419,96 +1506,6 @@ resched: zswap_pool_put(pool); } -/********************************* -* writeback code -**********************************/ -/* - * Attempts to free an entry by adding a folio to the swap cache, - * decompressing the entry data into the folio, and issuing a - * bio write to write the folio back to the swap device. - * - * This can be thought of as a "resumed writeback" of the folio - * to the swap device. We are basically resuming the same swap - * writeback path that was intercepted with the zswap_store() - * in the first place. After the folio has been decompressed into - * the swap cache, the compressed version stored by zswap can be - * freed. - */ -static int zswap_writeback_entry(struct zswap_entry *entry, - swp_entry_t swpentry) -{ - struct zswap_tree *tree; - struct folio *folio; - struct mempolicy *mpol; - bool folio_was_allocated; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - }; - - /* try to allocate swap cache folio */ - mpol = get_task_policy(current); - folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &folio_was_allocated, true); - if (!folio) - return -ENOMEM; - - /* - * Found an existing folio, we raced with swapin or concurrent - * shrinker. We generally writeback cold folios from zswap, and - * swapin means the folio just became hot, so skip this folio. - * For unlikely concurrent shrinker case, it will be unlinked - * and freed when invalidated by the concurrent shrinker anyway. - */ - if (!folio_was_allocated) { - folio_put(folio); - return -EEXIST; - } - - /* - * folio is locked, and the swapcache is now secured against - * concurrent swapping to and from the slot. Verify that the - * swap entry hasn't been invalidated and recycled behind our - * backs (our zswap_entry reference doesn't prevent that), to - * avoid overwriting a new swap folio with old compressed data. - */ - tree = swap_zswap_tree(swpentry); - spin_lock(&tree->lock); - if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) { - spin_unlock(&tree->lock); - delete_from_swap_cache(folio); - folio_unlock(folio); - folio_put(folio); - return -ENOMEM; - } - - /* Safe to deref entry after the entry is verified above. */ - zswap_entry_get(entry); - spin_unlock(&tree->lock); - - zswap_decompress(entry, &folio->page); - - count_vm_event(ZSWPWB); - if (entry->objcg) - count_objcg_event(entry->objcg, ZSWPWB); - - spin_lock(&tree->lock); - zswap_invalidate_entry(tree, entry); - zswap_entry_put(entry); - spin_unlock(&tree->lock); - - /* folio is up to date */ - folio_mark_uptodate(folio); - - /* move it to the tail of the inactive list after end_writeback */ - folio_set_reclaim(folio); - - /* start writeback */ - __swap_writepage(folio, &wbc); - folio_put(folio); - - return 0; -} - static int zswap_is_page_same_filled(void *ptr, unsigned long *value) { unsigned long *page; From eb23ee4f96937ca47fe6c88151d998551642a772 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 29 Jan 2024 20:36:56 -0500 Subject: [PATCH 081/435] mm: zswap: function ordering: shrink_memcg_cb shrink_memcg_cb() is called by the shrinker and is based on zswap_writeback_entry(). Move it in between. Save one fwd decl. Link: https://lkml.kernel.org/r/20240130014208.565554-21-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 125 ++++++++++++++++++++++++++--------------------------- 1 file changed, 61 insertions(+), 64 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 667ed3e19340..2bf4bf1d356c 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1254,7 +1254,67 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * shrinker functions **********************************/ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, - spinlock_t *lock, void *arg); + spinlock_t *lock, void *arg) +{ + struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); + bool *encountered_page_in_swapcache = (bool *)arg; + swp_entry_t swpentry; + enum lru_status ret = LRU_REMOVED_RETRY; + int writeback_result; + + /* + * Rotate the entry to the tail before unlocking the LRU, + * so that in case of an invalidation race concurrent + * reclaimers don't waste their time on it. + * + * If writeback succeeds, or failure is due to the entry + * being invalidated by the swap subsystem, the invalidation + * will unlink and free it. + * + * Temporary failures, where the same entry should be tried + * again immediately, almost never happen for this shrinker. + * We don't do any trylocking; -ENOMEM comes closest, + * but that's extremely rare and doesn't happen spuriously + * either. Don't bother distinguishing this case. + * + * But since they do exist in theory, the entry cannot just + * be unlinked, or we could leak it. Hence, rotate. + */ + list_move_tail(item, &l->list); + + /* + * Once the lru lock is dropped, the entry might get freed. The + * swpentry is copied to the stack, and entry isn't deref'd again + * until the entry is verified to still be alive in the tree. + */ + swpentry = entry->swpentry; + + /* + * It's safe to drop the lock here because we return either + * LRU_REMOVED_RETRY or LRU_RETRY. + */ + spin_unlock(lock); + + writeback_result = zswap_writeback_entry(entry, swpentry); + + if (writeback_result) { + zswap_reject_reclaim_fail++; + ret = LRU_RETRY; + + /* + * Encountering a page already in swap cache is a sign that we are shrinking + * into the warmer region. We should terminate shrinking (if we're in the dynamic + * shrinker context). + */ + if (writeback_result == -EEXIST && encountered_page_in_swapcache) + *encountered_page_in_swapcache = true; + } else { + zswap_written_back_pages++; + } + + spin_lock(lock); + return ret; +} static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) @@ -1354,69 +1414,6 @@ static void zswap_alloc_shrinker(struct zswap_pool *pool) pool->shrinker->seeks = DEFAULT_SEEKS; } -static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, - spinlock_t *lock, void *arg) -{ - struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); - bool *encountered_page_in_swapcache = (bool *)arg; - swp_entry_t swpentry; - enum lru_status ret = LRU_REMOVED_RETRY; - int writeback_result; - - /* - * Rotate the entry to the tail before unlocking the LRU, - * so that in case of an invalidation race concurrent - * reclaimers don't waste their time on it. - * - * If writeback succeeds, or failure is due to the entry - * being invalidated by the swap subsystem, the invalidation - * will unlink and free it. - * - * Temporary failures, where the same entry should be tried - * again immediately, almost never happen for this shrinker. - * We don't do any trylocking; -ENOMEM comes closest, - * but that's extremely rare and doesn't happen spuriously - * either. Don't bother distinguishing this case. - * - * But since they do exist in theory, the entry cannot just - * be unlinked, or we could leak it. Hence, rotate. - */ - list_move_tail(item, &l->list); - - /* - * Once the lru lock is dropped, the entry might get freed. The - * swpentry is copied to the stack, and entry isn't deref'd again - * until the entry is verified to still be alive in the tree. - */ - swpentry = entry->swpentry; - - /* - * It's safe to drop the lock here because we return either - * LRU_REMOVED_RETRY or LRU_RETRY. - */ - spin_unlock(lock); - - writeback_result = zswap_writeback_entry(entry, swpentry); - - if (writeback_result) { - zswap_reject_reclaim_fail++; - ret = LRU_RETRY; - - /* - * Encountering a page already in swap cache is a sign that we are shrinking - * into the warmer region. We should terminate shrinking (if we're in the dynamic - * shrinker context). - */ - if (writeback_result == -EEXIST && encountered_page_in_swapcache) - *encountered_page_in_swapcache = true; - } else { - zswap_written_back_pages++; - } - - spin_lock(lock); - return ret; -} - static int shrink_memcg(struct mem_cgroup *memcg) { struct zswap_pool *pool; From 5af28560fe4f091c215580b16672004fdf08f304 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:40 -0800 Subject: [PATCH 082/435] Docs/admin-guide/mm/damon/usage: use sysfs interface for tracepoints example Patch series "mm/damon: make DAMON debugfs interface deprecation unignorable". DAMON debugfs interface is deprecated in February 2023, by commit 5445fcbc4cda ("Docs/admin-guide/mm/damon/usage: add DAMON debugfs interface deprecation notice"). Make the fact unable to be easily ignored by removing an example usage from the document (patch 1), renaming the config (patch 2), adding a deprecation notice file to the debugfs directory (patches 3-5), and renaming the debugfs file that essnetial to be used for real use of DAMON (patches 6-9). This patch (of 9): DAMON tracepoints example on the DAMON usage document is using DAMON debugfs interface, which is deprecated. Use its alternative, DAMON sysfs interface. Link: https://lkml.kernel.org/r/20240130013549.89538-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240130013549.89538-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 9d23144bf985..f2feabb4bd35 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -579,11 +579,11 @@ monitoring results recording. While the monitoring is turned on, you could record the tracepoint events and show results using tracepoint supporting tools like ``perf``. For example:: - # echo on > monitor_on + # echo on > kdamonds/0/state # perf record -e damon:damon_aggregated & # sleep 5 # kill 9 $(pidof perf) - # echo off > monitor_on + # echo off > kdamonds/0/state # perf script kdamond.0 46568 [027] 79357.842179: damon:damon_aggregated: target_id=0 nr_regions=11 122509119488-135708762112: 0 864 [...] From f4cba4bf6777b15d934a354e8aa07059112bf2e8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:41 -0800 Subject: [PATCH 083/435] mm/damon: rename CONFIG_DAMON_DBGFS to DAMON_DBGFS_DEPRECATED DAMON debugfs interface is deprecated. The fact has documented by commit 5445fcbc4cda ("Docs/admin-guide/mm/damon/usage: add DAMON debugfs interface deprecation notice"). Commit 620932cd2852 ("mm/damon/dbgfs: print DAMON debugfs interface deprecation message") further started printing a warning message when users still use it. Many people don't read documentation or kernel log, though. Make the deprecation harder to be ignored using the approach of commit eb07c4f39c3e ("mm/slab: rename CONFIG_SLAB to CONFIG_SLAB_DEPRECATED"). 'make oldconfig' with 'CONFIG_DAMON_DBGFS=y' will get a new prompt with the explicit deprecation notice on the name. 'make olddefconfig' with 'CONFIG_DAMON_DBGFS=y' will result in not building DAMON debugfs interface. If there is a real user of DAMON debugfs interface, they will complain the change to the builder. Link: https://lkml.kernel.org/r/20240130013549.89538-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- mm/damon/Kconfig | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 29f43fbc2eff..fecb8172410c 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -71,7 +71,7 @@ config DAMON_SYSFS_KUNIT_TEST If unsure, say N. -config DAMON_DBGFS +config DAMON_DBGFS_DEPRECATED bool "DAMON debugfs interface (DEPRECATED!)" depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS help @@ -84,6 +84,11 @@ config DAMON_DBGFS (DAMON_SYSFS). If you depend on this and cannot move, please report your usecase to damon@lists.linux.dev and linux-mm@kvack.org. +config DAMON_DBGFS + bool + default y + depends on DAMON_DBGFS_DEPRECATED + config DAMON_DBGFS_KUNIT_TEST bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS depends on DAMON_DBGFS && KUNIT=y From f921003b40d107e805b29358fd1035eadae3806c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:42 -0800 Subject: [PATCH 084/435] mm/damon/dbgfs: implement deprecation notice file Implement a read-only file for DAMON debugfs interface deprecation notice, to let users who manually read/write the DAMON debugfs files from their shell command line easily notice the fact. [arnd@arndb.de: fix bogus string length] Link: https://lkml.kernel.org/r/20240202124339.892862-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20240130013549.89538-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Arnd Bergmann Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 7dac24e69e3b..f66865acb523 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -805,6 +805,17 @@ static void dbgfs_destroy_ctx(struct damon_ctx *ctx) damon_destroy_ctx(ctx); } +static ssize_t damon_dbgfs_deprecated_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + static const char kbuf[512] = "DAMON debugfs interface is deprecated, " + "so users should move to DAMON_SYSFS. If you cannot, " + "please report your usecase to damon@lists.linux.dev and " + "linux-mm@kvack.org.\n"; + + return simple_read_from_buffer(buf, count, ppos, kbuf, strlen(kbuf)); +} + /* * Make a context of @name and create a debugfs directory for it. * @@ -1056,6 +1067,10 @@ static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file) return nonseekable_open(inode, file); } +static const struct file_operations deprecated_fops = { + .read = damon_dbgfs_deprecated_read, +}; + static const struct file_operations mk_contexts_fops = { .open = damon_dbgfs_static_file_open, .write = dbgfs_mk_context_write, @@ -1076,9 +1091,9 @@ static int __init __damon_dbgfs_init(void) { struct dentry *dbgfs_root; const char * const file_names[] = {"mk_contexts", "rm_contexts", - "monitor_on"}; + "monitor_on", "DEPRECATED"}; const struct file_operations *fops[] = {&mk_contexts_fops, - &rm_contexts_fops, &monitor_on_fops}; + &rm_contexts_fops, &monitor_on_fops, &deprecated_fops}; int i; dbgfs_root = debugfs_create_dir("damon", NULL); From eceea30c9086916cb82cb9204a5a41be2523acbb Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:43 -0800 Subject: [PATCH 085/435] mm/damon/dbgfs: make debugfs interface deprecation message a macro DAMON debugfs interface deprecation message is written twice, once for the warning, and again for DEPRECATED file's read output. De-duplicate those by defining the message as a macro and reuse. [akpm@linux-foundation.org: s/comnst/const/] Link: https://lkml.kernel.org/r/20240130013549.89538-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index f66865acb523..45d769984daa 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -15,6 +15,11 @@ #include #include +#define DAMON_DBGFS_DEPRECATION_NOTICE \ + "DAMON debugfs interface is deprecated, so users should move " \ + "to DAMON_SYSFS. If you cannot, please report your usecase to " \ + "damon@lists.linux.dev and linux-mm@kvack.org.\n" + static struct damon_ctx **dbgfs_ctxs; static int dbgfs_nr_ctxs; static struct dentry **dbgfs_dirs; @@ -22,10 +27,7 @@ static DEFINE_MUTEX(damon_dbgfs_lock); static void damon_dbgfs_warn_deprecation(void) { - pr_warn_once("DAMON debugfs interface is deprecated, " - "so users should move to DAMON_SYSFS. If you cannot, " - "please report your usecase to damon@lists.linux.dev and " - "linux-mm@kvack.org.\n"); + pr_warn_once(DAMON_DBGFS_DEPRECATION_NOTICE); } /* @@ -808,10 +810,7 @@ static void dbgfs_destroy_ctx(struct damon_ctx *ctx) static ssize_t damon_dbgfs_deprecated_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - static const char kbuf[512] = "DAMON debugfs interface is deprecated, " - "so users should move to DAMON_SYSFS. If you cannot, " - "please report your usecase to damon@lists.linux.dev and " - "linux-mm@kvack.org.\n"; + static const char kbuf[512] = DAMON_DBGFS_DEPRECATION_NOTICE; return simple_read_from_buffer(buf, count, ppos, kbuf, strlen(kbuf)); } From cf3810cc317c069259915770f4ed43d61eaf85bf Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:44 -0800 Subject: [PATCH 086/435] Docs/admin-guide/mm/damon/usage: document 'DEPRECATED' file of DAMON debugfs interface Document the newly added DAMON debugfs interface deprecation notice file on the usage document. Link: https://lkml.kernel.org/r/20240130013549.89538-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index f2feabb4bd35..5d3df18dfb9f 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -628,9 +628,16 @@ debugfs Interface (DEPRECATED!) move, please report your usecase to damon@lists.linux.dev and linux-mm@kvack.org. -DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``, -``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and -``rm_contexts`` under its debugfs directory, ``/damon/``. +DAMON exports nine files, ``DEPRECATED``, ``attrs``, ``target_ids``, +``init_regions``, ``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` +and ``rm_contexts`` under its debugfs directory, ``/damon/``. + + +``DEPRECATED`` is a read-only file for the DAMON debugfs interface deprecation +notice. Reading it returns the deprecation notice, as below:: + + # cat DEPRECATED + DAMON debugfs interface is deprecated, so users should move to DAMON_SYSFS. If you cannot, please report your usecase to damon@lists.linux.dev and linux-mm@kvack.org. Attributes From 8d1d3807d501de1467904eb40b0aa345668c3827 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:45 -0800 Subject: [PATCH 087/435] selftets/damon: prepare for monitor_on file renaming Following change will rename 'monitor_on' DAMON debugfs file to 'monitor_on_DEPRECATED', to make the deprecation unignorable in runtime. Since it could make DAMON selftests fail and disturb future bisects, update DAMON selftests to support the change. Link: https://lkml.kernel.org/r/20240130013549.89538-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_chk_dependency.sh | 11 +++++++++-- tools/testing/selftests/damon/_debugfs_common.sh | 7 +++++++ .../testing/selftests/damon/debugfs_empty_targets.sh | 12 ++++++++++-- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/damon/_chk_dependency.sh b/tools/testing/selftests/damon/_chk_dependency.sh index 0328ac0b5a5e..350f8c2b071d 100644 --- a/tools/testing/selftests/damon/_chk_dependency.sh +++ b/tools/testing/selftests/damon/_chk_dependency.sh @@ -18,7 +18,14 @@ then exit $ksft_skip fi -for f in attrs target_ids monitor_on +if [ -f "$DBGFS/monitor_on_DEPRECATED" ] +then + monitor_on_file="monitor_on_DEPRECATED" +else + monitor_on_file="monitor_on" +fi + +for f in attrs target_ids "$monitor_on_file" do if [ ! -f "$DBGFS/$f" ] then @@ -28,7 +35,7 @@ do done permission_error="Operation not permitted" -for f in attrs target_ids monitor_on +for f in attrs target_ids "$monitor_on_file" do status=$( cat "$DBGFS/$f" 2>&1 ) if [ "${status#*$permission_error}" != "$status" ]; then diff --git a/tools/testing/selftests/damon/_debugfs_common.sh b/tools/testing/selftests/damon/_debugfs_common.sh index 48989d4813ae..aa995516870b 100644 --- a/tools/testing/selftests/damon/_debugfs_common.sh +++ b/tools/testing/selftests/damon/_debugfs_common.sh @@ -45,6 +45,13 @@ test_content() { source ./_chk_dependency.sh damon_onoff="$DBGFS/monitor_on" +if [ -f "$DBGFS/monitor_on_DEPRECATED" ] +then + damon_onoff="$DBGFS/monitor_on_DEPRECATED" +else + damon_onoff="$DBGFS/monitor_on" +fi + if [ $(cat "$damon_onoff") = "on" ] then echo "monitoring is on" diff --git a/tools/testing/selftests/damon/debugfs_empty_targets.sh b/tools/testing/selftests/damon/debugfs_empty_targets.sh index 87aff8083822..effbea33dc16 100755 --- a/tools/testing/selftests/damon/debugfs_empty_targets.sh +++ b/tools/testing/selftests/damon/debugfs_empty_targets.sh @@ -8,6 +8,14 @@ source _debugfs_common.sh orig_target_ids=$(cat "$DBGFS/target_ids") echo "" > "$DBGFS/target_ids" -orig_monitor_on=$(cat "$DBGFS/monitor_on") -test_write_fail "$DBGFS/monitor_on" "on" "orig_monitor_on" "empty target ids" + +if [ -f "$DBGFS/monitor_on_DEPRECATED" ] +then + monitor_on_file="$DBGFS/monitor_on_DEPRECATED" +else + monitor_on_file="$DBGFS/monitor_on" +fi + +orig_monitor_on=$(cat "$monitor_on_file") +test_write_fail "$monitor_on_file" "on" "orig_monitor_on" "empty target ids" echo "$orig_target_ids" > "$DBGFS/target_ids" From 772333cb2acffb6c92a11bbf4d2823c5b6154b27 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:46 -0800 Subject: [PATCH 088/435] mm/damon/dbgfs: rename monitor_on file to monitor_on_DEPRECATED Kernel builders could silently enable CONFIG_DAMON_DBGFS_DEPRECATED. Users who manually check the files under the DAMON debugfs directory could notice the deprecation owing to the 'DEPRECATED' DAMON debugfs file, but there could be users who doesn't manually check the files. Make the deprecation cannot be ignored in the case by renaming 'monitor_on' file, which is essential for real use of DAMON on runtime, to 'monitor_on_DEPRECATED'. Still users who control DAMON via only user-space tool could ignore the deprecation, but that's what the tool developers should take care of. DAMON user-space tool, damo, has also made a change[1] for the purpose. [1] commit 935dae76f2aee ("_damon_args: Rename --damon_interface to --damon_interface_DEPRECATED") of https://github.com/awslabs/damo Link: https://lkml.kernel.org/r/20240130013549.89538-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 45d769984daa..2461cfe2e968 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -1090,7 +1090,7 @@ static int __init __damon_dbgfs_init(void) { struct dentry *dbgfs_root; const char * const file_names[] = {"mk_contexts", "rm_contexts", - "monitor_on", "DEPRECATED"}; + "monitor_on_DEPRECATED", "DEPRECATED"}; const struct file_operations *fops[] = {&mk_contexts_fops, &rm_contexts_fops, &monitor_on_fops, &deprecated_fops}; int i; From ec28cf530cdf390ba5f8529b5767eab0c615b918 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:47 -0800 Subject: [PATCH 089/435] Docs/admin-guide/mm/damon/usage: update for monitor_on renaming Update DAMON debugfs interface sections on the usage document to reflect the fact that 'monitor_on' file has renamed to 'monitor_on_DEPRECATED'. Link: https://lkml.kernel.org/r/20240130013549.89538-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 29 ++++++++++---------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 5d3df18dfb9f..58c34e66b31b 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -629,8 +629,9 @@ debugfs Interface (DEPRECATED!) linux-mm@kvack.org. DAMON exports nine files, ``DEPRECATED``, ``attrs``, ``target_ids``, -``init_regions``, ``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` -and ``rm_contexts`` under its debugfs directory, ``/damon/``. +``init_regions``, ``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, +``mk_contexts`` and ``rm_contexts`` under its debugfs directory, +``/damon/``. ``DEPRECATED`` is a read-only file for the DAMON debugfs interface deprecation @@ -855,16 +856,16 @@ Turning On/Off Setting the files as described above doesn't incur effect unless you explicitly start the monitoring. You can start, stop, and check the current status of the -monitoring by writing to and reading from the ``monitor_on`` file. Writing -``on`` to the file starts the monitoring of the targets with the attributes. -Writing ``off`` to the file stops those. DAMON also stops if every target -process is terminated. Below example commands turn on, off, and check the -status of DAMON:: +monitoring by writing to and reading from the ``monitor_on_DEPRECATED`` file. +Writing ``on`` to the file starts the monitoring of the targets with the +attributes. Writing ``off`` to the file stops those. DAMON also stops if +every target process is terminated. Below example commands turn on, off, and +check the status of DAMON:: # cd /damon - # echo on > monitor_on - # echo off > monitor_on - # cat monitor_on + # echo on > monitor_on_DEPRECATED + # echo off > monitor_on_DEPRECATED + # cat monitor_on_DEPRECATED off Please note that you cannot write to the above-mentioned debugfs files while @@ -880,11 +881,11 @@ can get the pid of the thread by reading the ``kdamond_pid`` file. When the monitoring is turned off, reading the file returns ``none``. :: # cd /damon - # cat monitor_on + # cat monitor_on_DEPRECATED off # cat kdamond_pid none - # echo on > monitor_on + # echo on > monitor_on_DEPRECATED # cat kdamond_pid 18594 @@ -914,5 +915,5 @@ directory by putting the name of the context to the ``rm_contexts`` file. :: # ls foo # ls: cannot access 'foo': No such file or directory -Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on`` files are in the -root directory only. +Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on_DEPRECATED`` files +are in the root directory only. From 87beb00404b712da545fad55ed3c3060099eea2f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 29 Jan 2024 17:35:48 -0800 Subject: [PATCH 090/435] Docs/translations/damon/usage: update for monitor_on renaming Update DAMON debugfs interface sections on the translated usage documents to reflect the fact that 'monitor_on' file has renamed to 'monitor_on_DEPRECATED'. Link: https://lkml.kernel.org/r/20240130013549.89538-10-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yanteng Si Signed-off-by: Andrew Morton --- .../zh_CN/admin-guide/mm/damon/usage.rst | 20 +++++++++---------- .../zh_TW/admin-guide/mm/damon/usage.rst | 20 +++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst index 17b9949d9b43..da2745464ece 100644 --- a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst +++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst @@ -344,7 +344,7 @@ debugfs接口 :ref:`sysfs接口`。 DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``, -``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` 和 +``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts`` 和 ``rm_contexts`` under its debugfs directory, ``/damon/``. @@ -521,15 +521,15 @@ DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``, 开关 ---- -除非你明确地启动监测,否则如上所述的文件设置不会产生效果。你可以通过写入和读取 ``monitor_on`` +除非你明确地启动监测,否则如上所述的文件设置不会产生效果。你可以通过写入和读取 ``monitor_on_DEPRECATED`` 文件来启动、停止和检查监测的当前状态。写入 ``on`` 该文件可以启动对有属性的目标的监测。写入 ``off`` 该文件则停止这些目标。如果每个目标进程被终止,DAMON也会停止。下面的示例命令开启、关 闭和检查DAMON的状态:: # cd /damon - # echo on > monitor_on - # echo off > monitor_on - # cat monitor_on + # echo on > monitor_on_DEPRECATED + # echo off > monitor_on_DEPRECATED + # cat monitor_on_DEPRECATED off 请注意,当监测开启时,你不能写到上述的debugfs文件。如果你在DAMON运行时写到这些文件,将会返 @@ -543,11 +543,11 @@ DAMON通过一个叫做kdamond的内核线程来进行请求监测。你可以 得该线程的 ``pid`` 。当监测被 ``关闭`` 时,读取该文件不会返回任何信息:: # cd /damon - # cat monitor_on + # cat monitor_on_DEPRECATED off # cat kdamond_pid none - # echo on > monitor_on + # echo on > monitor_on_DEPRECATED # cat kdamond_pid 18594 @@ -574,7 +574,7 @@ DAMON通过一个叫做kdamond的内核线程来进行请求监测。你可以 # ls foo # ls: cannot access 'foo': No such file or directory -注意, ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on`` 文件只在根目录下。 +注意, ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on_DEPRECATED`` 文件只在根目录下。 监测结果的监测点 @@ -583,9 +583,9 @@ DAMON通过一个叫做kdamond的内核线程来进行请求监测。你可以 DAMON通过一个tracepoint ``damon:damon_aggregated`` 提供监测结果. 当监测开启时,你可 以记录追踪点事件,并使用追踪点支持工具如perf显示结果。比如说:: - # echo on > monitor_on + # echo on > monitor_on_DEPRECATED # perf record -e damon:damon_aggregated & # sleep 5 # kill 9 $(pidof perf) - # echo off > monitor_on + # echo off > monitor_on_DEPRECATED # perf script diff --git a/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst index 6dee719a32ea..7464279f9b7d 100644 --- a/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst +++ b/Documentation/translations/zh_TW/admin-guide/mm/damon/usage.rst @@ -344,7 +344,7 @@ debugfs接口 :ref:`sysfs接口`。 DAMON導出了八個文件, ``attrs``, ``target_ids``, ``init_regions``, -``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` 和 +``schemes``, ``monitor_on_DEPRECATED``, ``kdamond_pid``, ``mk_contexts`` 和 ``rm_contexts`` under its debugfs directory, ``/damon/``. @@ -521,15 +521,15 @@ DAMON導出了八個文件, ``attrs``, ``target_ids``, ``init_regions``, 開關 ---- -除非你明確地啓動監測,否則如上所述的文件設置不會產生效果。你可以通過寫入和讀取 ``monitor_on`` +除非你明確地啓動監測,否則如上所述的文件設置不會產生效果。你可以通過寫入和讀取 ``monitor_on_DEPRECATED`` 文件來啓動、停止和檢查監測的當前狀態。寫入 ``on`` 該文件可以啓動對有屬性的目標的監測。寫入 ``off`` 該文件則停止這些目標。如果每個目標進程被終止,DAMON也會停止。下面的示例命令開啓、關 閉和檢查DAMON的狀態:: # cd /damon - # echo on > monitor_on - # echo off > monitor_on - # cat monitor_on + # echo on > monitor_on_DEPRECATED + # echo off > monitor_on_DEPRECATED + # cat monitor_on_DEPRECATED off 請注意,當監測開啓時,你不能寫到上述的debugfs文件。如果你在DAMON運行時寫到這些文件,將會返 @@ -543,11 +543,11 @@ DAMON通過一個叫做kdamond的內核線程來進行請求監測。你可以 得該線程的 ``pid`` 。當監測被 ``關閉`` 時,讀取該文件不會返回任何信息:: # cd /damon - # cat monitor_on + # cat monitor_on_DEPRECATED off # cat kdamond_pid none - # echo on > monitor_on + # echo on > monitor_on_DEPRECATED # cat kdamond_pid 18594 @@ -574,7 +574,7 @@ DAMON通過一個叫做kdamond的內核線程來進行請求監測。你可以 # ls foo # ls: cannot access 'foo': No such file or directory -注意, ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on`` 文件只在根目錄下。 +注意, ``mk_contexts`` 、 ``rm_contexts`` 和 ``monitor_on_DEPRECATED`` 文件只在根目錄下。 監測結果的監測點 @@ -583,10 +583,10 @@ DAMON通過一個叫做kdamond的內核線程來進行請求監測。你可以 DAMON通過一個tracepoint ``damon:damon_aggregated`` 提供監測結果. 當監測開啓時,你可 以記錄追蹤點事件,並使用追蹤點支持工具如perf顯示結果。比如說:: - # echo on > monitor_on + # echo on > monitor_on_DEPRECATED # perf record -e damon:damon_aggregated & # sleep 5 # kill 9 $(pidof perf) - # echo off > monitor_on + # echo off > monitor_on_DEPRECATED # perf script From 9c793854a04b4434686c6708b70b2c0d6489b710 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Wed, 31 Jan 2024 11:19:13 +0800 Subject: [PATCH 091/435] mm/mmap: use SZ_{8K, 128K} helper macro Use SZ_{8K, 128K} helper macro instead of the number in init_user_reserve and reserve_mem_notifier. This is more readable. Link: https://lkml.kernel.org/r/20240131031913.2058597-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Signed-off-by: Andrew Morton --- mm/mmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 476de5daf598..1f9e70242858 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3845,7 +3845,7 @@ static int init_user_reserve(void) free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); - sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + sysctl_user_reserve_kbytes = min(free_kbytes / 32, SZ_128K); return 0; } subsys_initcall(init_user_reserve); @@ -3866,7 +3866,7 @@ static int init_admin_reserve(void) free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); - sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, SZ_8K); return 0; } subsys_initcall(init_admin_reserve); @@ -3898,12 +3898,12 @@ static int reserve_mem_notifier(struct notifier_block *nb, case MEM_ONLINE: /* Default max is 128MB. Leave alone if modified by operator. */ tmp = sysctl_user_reserve_kbytes; - if (0 < tmp && tmp < (1UL << 17)) + if (tmp > 0 && tmp < SZ_128K) init_user_reserve(); /* Default max is 8MB. Leave alone if modified by operator. */ tmp = sysctl_admin_reserve_kbytes; - if (0 < tmp && tmp < (1UL << 13)) + if (tmp > 0 && tmp < SZ_8K) init_admin_reserve(); break; From dce41f5ae2539d1c20ae8de4e039630aec3c3f3c Mon Sep 17 00:00:00 2001 From: Rakie Kim Date: Fri, 2 Feb 2024 12:02:35 -0500 Subject: [PATCH 092/435] mm/mempolicy: implement the sysfs-based weighted_interleave interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/mempolicy: weighted interleave mempolicy and sysfs extension", v5. Weighted interleave is a new interleave policy intended to make use of heterogeneous memory environments appearing with CXL. The existing interleave mechanism does an even round-robin distribution of memory across all nodes in a nodemask, while weighted interleave distributes memory across nodes according to a provided weight. (Weight = # of page allocations per round) Weighted interleave is intended to reduce average latency when bandwidth is pressured - therefore increasing total throughput. In other words: It allows greater use of the total available bandwidth in a heterogeneous hardware environment (different hardware provides different bandwidth capacity). As bandwidth is pressured, latency increases - first linearly and then exponentially. By keeping bandwidth usage distributed according to available bandwidth, we therefore can reduce the average latency of a cacheline fetch. A good explanation of the bandwidth vs latency response curve: https://mahmoudhatem.wordpress.com/2017/11/07/memory-bandwidth-vs-latency-response-curve/ From the article: ``` Constant region: The latency response is fairly constant for the first 40% of the sustained bandwidth. Linear region: In between 40% to 80% of the sustained bandwidth, the latency response increases almost linearly with the bandwidth demand of the system due to contention overhead by numerous memory requests. Exponential region: Between 80% to 100% of the sustained bandwidth, the memory latency is dominated by the contention latency which can be as much as twice the idle latency or more. Maximum sustained bandwidth : Is 65% to 75% of the theoretical maximum bandwidth. ``` As a general rule of thumb: * If bandwidth usage is low, latency does not increase. It is optimal to place data in the nearest (lowest latency) device. * If bandwidth usage is high, latency increases. It is optimal to place data such that bandwidth use is optimized per-device. This is the top line goal: Provide a user a mechanism to target using the "maximum sustained bandwidth" of each hardware component in a heterogenous memory system. For example, the stream benchmark demonstrates that 1:1 (default) interleave is actively harmful, while weighted interleave can be beneficial. Default interleave distributes data such that too much pressure is placed on devices with lower available bandwidth. Stream Benchmark (vs DRAM, 1 Socket + 1 CXL Device) Default interleave : -78% (slower than DRAM) Global weighting : -6% to +4% (workload dependant) Targeted weights : +2.5% to +4% (consistently better than DRAM) Global means the task-policy was set (set_mempolicy), while targeted means VMA policies were set (mbind2). We see weighted interleave is not always beneficial when applied globally, but is always beneficial when applied to bandwidth-driving memory regions. There are 4 patches in this set: 1) Implement system-global interleave weights as sysfs extension in mm/mempolicy.c. These weights are RCU protected, and a default weight set is provided (all weights are 1 by default). In future work, we intend to expose an interface for HMAT/CDAT code to set reasonable default values based on the memory configuration of the system discovered at boot/hotplug. 2) A mild refactor of some interleave-logic for re-use in the new weighted interleave logic. 3) MPOL_WEIGHTED_INTERLEAVE extension for set_mempolicy/mbind 4) Protect interleave logic (weighted and normal) with the mems_allowed seq cookie. If the nodemask changes while accessing it during a rebind, just retry the access. Included below are some performance and LTP test information, and a sample numactl branch which can be used for testing. = Performance summary = (tests may have different configurations, see extended info below) 1) MLC (W2) : +38% over DRAM. +264% over default interleave. MLC (W5) : +40% over DRAM. +226% over default interleave. 2) Stream : -6% to +4% over DRAM, +430% over default interleave. 3) XSBench : +19% over DRAM. +47% over default interleave. = LTP Testing Summary = existing mempolicy & mbind tests: pass mempolicy & mbind + weighted interleave (global weights): pass = version history v5: - style fixes - mems_allowed cookie protection to detect rebind issues, prevents spurious allocation failures and/or mis-allocations - sparse warning fixes related to __rcu on local variables ===================================================================== Performance tests - MLC From - Ravi Jonnalagadda Hardware: Single-socket, multiple CXL memory expanders. Workload: W2 Data Signature: 2:1 read:write DRAM only bandwidth (GBps): 298.8 DRAM + CXL (default interleave) (GBps): 113.04 DRAM + CXL (weighted interleave)(GBps): 412.5 Gain over DRAM only: 1.38x Gain over default interleave: 2.64x Workload: W5 Data Signature: 1:1 read:write DRAM only bandwidth (GBps): 273.2 DRAM + CXL (default interleave) (GBps): 117.23 DRAM + CXL (weighted interleave)(GBps): 382.7 Gain over DRAM only: 1.4x Gain over default interleave: 2.26x ===================================================================== Performance test - Stream From - Gregory Price Hardware: Single socket, single CXL expander numactl extension: https://github.com/gmprice/numactl/tree/weighted_interleave_master Summary: 64 threads, ~18GB workload, 3GB per array, executed 100 times Default interleave : -78% (slower than DRAM) Global weighting : -6% to +4% (workload dependant) mbind2 weights : +2.5% to +4% (consistently better than DRAM) dram only: numactl --cpunodebind=1 --membind=1 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Function Direction BestRateMBs AvgTime MinTime MaxTime Copy: 0->0 200923.2 0.032662 0.031853 0.033301 Scale: 0->0 202123.0 0.032526 0.031664 0.032970 Add: 0->0 208873.2 0.047322 0.045961 0.047884 Triad: 0->0 208523.8 0.047262 0.046038 0.048414 CXL-only: numactl --cpunodebind=1 -w --membind=2 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Copy: 0->0 22209.7 0.288661 0.288162 0.289342 Scale: 0->0 22288.2 0.287549 0.287147 0.288291 Add: 0->0 24419.1 0.393372 0.393135 0.393735 Triad: 0->0 24484.6 0.392337 0.392083 0.394331 Based on the above, the optimal weights are ~9:1 echo 9 > /sys/kernel/mm/mempolicy/weighted_interleave/node1 echo 1 > /sys/kernel/mm/mempolicy/weighted_interleave/node2 default interleave: numactl --cpunodebind=1 --interleave=1,2 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Copy: 0->0 44666.2 0.143671 0.143285 0.144174 Scale: 0->0 44781.6 0.143256 0.142916 0.143713 Add: 0->0 48600.7 0.197719 0.197528 0.197858 Triad: 0->0 48727.5 0.197204 0.197014 0.197439 global weighted interleave: numactl --cpunodebind=1 -w --interleave=1,2 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Copy: 0->0 190085.9 0.034289 0.033669 0.034645 Scale: 0->0 207677.4 0.031909 0.030817 0.033061 Add: 0->0 202036.8 0.048737 0.047516 0.053409 Triad: 0->0 217671.5 0.045819 0.044103 0.046755 targted regions w/ global weights (modified stream to mbind2 malloc'd regions)) numactl --cpunodebind=1 --membind=1 ./stream_c.exe -b --ntimes 100 --array-size 400M --malloc Copy: 0->0 205827.0 0.031445 0.031094 0.031984 Scale: 0->0 208171.8 0.031320 0.030744 0.032505 Add: 0->0 217352.0 0.045087 0.044168 0.046515 Triad: 0->0 216884.8 0.045062 0.044263 0.046982 ===================================================================== Performance tests - XSBench From - Hyeongtak Ji Hardware: Single socket, Single CXL memory Expander NUMA node 0: 56 logical cores, 128 GB memory NUMA node 2: 96 GB CXL memory Threads: 56 Lookups: 170,000,000 Summary: +19% over DRAM. +47% over default interleave. Performance tests - XSBench 1. dram only $ numactl -m 0 ./XSBench -s XL –p 5000000 Runtime: 36.235 seconds Lookups/s: 4,691,618 2. default interleave $ numactl –i 0,2 ./XSBench –s XL –p 5000000 Runtime: 55.243 seconds Lookups/s: 3,077,293 3. weighted interleave numactl –w –i 0,2 ./XSBench –s XL –p 5000000 Runtime: 29.262 seconds Lookups/s: 5,809,513 ===================================================================== LTP Tests: https://github.com/gmprice/ltp/tree/mempolicy2 = Existing tests set_mempolicy, get_mempolicy, mbind MPOL_WEIGHTED_INTERLEAVE added manually to test basic functionality but did not adjust tests for weighting. Basically the weights were set to 1, which is the default, and it should behave the same as MPOL_INTERLEAVE if logic is correct. == set_mempolicy01 : passed 18, failed 0 == set_mempolicy02 : passed 10, failed 0 == set_mempolicy03 : passed 64, failed 0 == set_mempolicy04 : passed 32, failed 0 == set_mempolicy05 - n/a on non-x86 == set_mempolicy06 : passed 10, failed 0 this is set_mempolicy02 + MPOL_WEIGHTED_INTERLEAVE == set_mempolicy07 : passed 32, failed 0 set_mempolicy04 + MPOL_WEIGHTED_INTERLEAVE == get_mempolicy01 : passed 12, failed 0 change: added MPOL_WEIGHTED_INTERLEAVE == get_mempolicy02 : passed 2, failed 0 == mbind01 : passed 15, failed 0 added MPOL_WEIGHTED_INTERLEAVE == mbind02 : passed 4, failed 0 added MPOL_WEIGHTED_INTERLEAVE == mbind03 : passed 16, failed 0 added MPOL_WEIGHTED_INTERLEAVE == mbind04 : passed 48, failed 0 added MPOL_WEIGHTED_INTERLEAVE ===================================================================== numactl (set_mempolicy) w/ global weighting test numactl fork: https://github.com/gmprice/numactl/tree/weighted_interleave_master command: numactl -w --interleave=0,1 ./eatmem result (weights 1:1): 0176a000 weighted interleave:0-1 heap anon=65793 dirty=65793 active=0 N0=32897 N1=32896 kernelpagesize_kB=4 7fceeb9ff000 weighted interleave:0-1 anon=65537 dirty=65537 active=0 N0=32768 N1=32769 kernelpagesize_kB=4 50% distribution is correct result (weights 5:1): 01b14000 weighted interleave:0-1 heap anon=65793 dirty=65793 active=0 N0=54828 N1=10965 kernelpagesize_kB=4 7f47a1dff000 weighted interleave:0-1 anon=65537 dirty=65537 active=0 N0=54614 N1=10923 kernelpagesize_kB=4 16.666% distribution is correct result (weights 1:5): 01f07000 weighted interleave:0-1 heap anon=65793 dirty=65793 active=0 N0=10966 N1=54827 kernelpagesize_kB=4 7f17b1dff000 weighted interleave:0-1 anon=65537 dirty=65537 active=0 N0=10923 N1=54614 kernelpagesize_kB=4 16.666% distribution is correct #include #include #include int main (void) { char* mem = malloc(1024*1024*256); memset(mem, 1, 1024*1024*256); for (int i = 0; i < ((1024*1024*256)/4096); i++) { mem = malloc(4096); mem[0] = 1; } printf("done\n"); getchar(); return 0; } This patch (of 4): This patch provides a way to set interleave weight information under sysfs at /sys/kernel/mm/mempolicy/weighted_interleave/nodeN The sysfs structure is designed as follows. $ tree /sys/kernel/mm/mempolicy/ /sys/kernel/mm/mempolicy/ [1] └── weighted_interleave [2] ├── node0 [3] └── node1 Each file above can be explained as follows. [1] mm/mempolicy: configuration interface for mempolicy subsystem [2] weighted_interleave/: config interface for weighted interleave policy [3] weighted_interleave/nodeN: weight for nodeN If a node value is set to `0`, the system-default value will be used. As of this patch, the system-default for all nodes is always 1. Link: https://lkml.kernel.org/r/20240202170238.90004-1-gregory.price@memverge.com Link: https://lkml.kernel.org/r/20240202170238.90004-2-gregory.price@memverge.com Suggested-by: "Huang, Ying" Signed-off-by: Rakie Kim Signed-off-by: Honggyu Kim Co-developed-by: Gregory Price Signed-off-by: Gregory Price Co-developed-by: Hyeongtak Ji Signed-off-by: Hyeongtak Ji Reviewed-by: "Huang, Ying" Cc: Dan Williams Cc: Gregory Price Cc: Hasan Al Maruf Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: Srinivasulu Thanneeru Signed-off-by: Andrew Morton --- .../ABI/testing/sysfs-kernel-mm-mempolicy | 4 + ...fs-kernel-mm-mempolicy-weighted-interleave | 25 ++ mm/mempolicy.c | 223 ++++++++++++++++++ 3 files changed, 252 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy new file mode 100644 index 000000000000..8ac327fd7fb6 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy @@ -0,0 +1,4 @@ +What: /sys/kernel/mm/mempolicy/ +Date: January 2024 +Contact: Linux memory management mailing list +Description: Interface for Mempolicy diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave new file mode 100644 index 000000000000..0b7972de04e9 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave @@ -0,0 +1,25 @@ +What: /sys/kernel/mm/mempolicy/weighted_interleave/ +Date: January 2024 +Contact: Linux memory management mailing list +Description: Configuration Interface for the Weighted Interleave policy + +What: /sys/kernel/mm/mempolicy/weighted_interleave/nodeN +Date: January 2024 +Contact: Linux memory management mailing list +Description: Weight configuration interface for nodeN + + The interleave weight for a memory node (N). These weights are + utilized by tasks which have set their mempolicy to + MPOL_WEIGHTED_INTERLEAVE. + + These weights only affect new allocations, and changes at runtime + will not cause migrations on already allocated pages. + + The minimum weight for a node is always 1. + + Minimum weight: 1 + Maximum weight: 255 + + Writing an empty string or `0` will reset the weight to the + system default. The system default may be set by the kernel + or drivers at boot or during hotplug events. diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5e519163c4dc..b4fccc921b62 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -131,6 +131,32 @@ static struct mempolicy default_policy = { static struct mempolicy preferred_node_policy[MAX_NUMNODES]; +/* + * iw_table is the sysfs-set interleave weight table, a value of 0 denotes + * system-default value should be used. A NULL iw_table also denotes that + * system-default values should be used. Until the system-default table + * is implemented, the system-default is always 1. + * + * iw_table is RCU protected + */ +static u8 __rcu *iw_table; +static DEFINE_MUTEX(iw_table_lock); + +static u8 get_il_weight(int node) +{ + u8 *table; + u8 weight; + + rcu_read_lock(); + table = rcu_dereference(iw_table); + /* if no iw_table, use system default */ + weight = table ? table[node] : 1; + /* if value in iw_table is 0, use system default */ + weight = weight ? weight : 1; + rcu_read_unlock(); + return weight; +} + /** * numa_nearest_node - Find nearest node by state * @node: Node id to start the search @@ -3063,3 +3089,200 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); } + +#ifdef CONFIG_SYSFS +struct iw_node_attr { + struct kobj_attribute kobj_attr; + int nid; +}; + +static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct iw_node_attr *node_attr; + u8 weight; + + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); + weight = get_il_weight(node_attr->nid); + return sysfs_emit(buf, "%d\n", weight); +} + +static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct iw_node_attr *node_attr; + u8 *new; + u8 *old; + u8 weight = 0; + + node_attr = container_of(attr, struct iw_node_attr, kobj_attr); + if (count == 0 || sysfs_streq(buf, "")) + weight = 0; + else if (kstrtou8(buf, 0, &weight)) + return -EINVAL; + + new = kzalloc(nr_node_ids, GFP_KERNEL); + if (!new) + return -ENOMEM; + + mutex_lock(&iw_table_lock); + old = rcu_dereference_protected(iw_table, + lockdep_is_held(&iw_table_lock)); + if (old) + memcpy(new, old, nr_node_ids); + new[node_attr->nid] = weight; + rcu_assign_pointer(iw_table, new); + mutex_unlock(&iw_table_lock); + synchronize_rcu(); + kfree(old); + return count; +} + +static struct iw_node_attr **node_attrs; + +static void sysfs_wi_node_release(struct iw_node_attr *node_attr, + struct kobject *parent) +{ + if (!node_attr) + return; + sysfs_remove_file(parent, &node_attr->kobj_attr.attr); + kfree(node_attr->kobj_attr.attr.name); + kfree(node_attr); +} + +static void sysfs_wi_release(struct kobject *wi_kobj) +{ + int i; + + for (i = 0; i < nr_node_ids; i++) + sysfs_wi_node_release(node_attrs[i], wi_kobj); + kobject_put(wi_kobj); +} + +static const struct kobj_type wi_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = sysfs_wi_release, +}; + +static int add_weight_node(int nid, struct kobject *wi_kobj) +{ + struct iw_node_attr *node_attr; + char *name; + + node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL); + if (!node_attr) + return -ENOMEM; + + name = kasprintf(GFP_KERNEL, "node%d", nid); + if (!name) { + kfree(node_attr); + return -ENOMEM; + } + + sysfs_attr_init(&node_attr->kobj_attr.attr); + node_attr->kobj_attr.attr.name = name; + node_attr->kobj_attr.attr.mode = 0644; + node_attr->kobj_attr.show = node_show; + node_attr->kobj_attr.store = node_store; + node_attr->nid = nid; + + if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) { + kfree(node_attr->kobj_attr.attr.name); + kfree(node_attr); + pr_err("failed to add attribute to weighted_interleave\n"); + return -ENOMEM; + } + + node_attrs[nid] = node_attr; + return 0; +} + +static int add_weighted_interleave_group(struct kobject *root_kobj) +{ + struct kobject *wi_kobj; + int nid, err; + + wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!wi_kobj) + return -ENOMEM; + + err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj, + "weighted_interleave"); + if (err) { + kfree(wi_kobj); + return err; + } + + for_each_node_state(nid, N_POSSIBLE) { + err = add_weight_node(nid, wi_kobj); + if (err) { + pr_err("failed to add sysfs [node%d]\n", nid); + break; + } + } + if (err) + kobject_put(wi_kobj); + return 0; +} + +static void mempolicy_kobj_release(struct kobject *kobj) +{ + u8 *old; + + mutex_lock(&iw_table_lock); + old = rcu_dereference_protected(iw_table, + lockdep_is_held(&iw_table_lock)); + rcu_assign_pointer(iw_table, NULL); + mutex_unlock(&iw_table_lock); + synchronize_rcu(); + kfree(old); + kfree(node_attrs); + kfree(kobj); +} + +static const struct kobj_type mempolicy_ktype = { + .release = mempolicy_kobj_release +}; + +static int __init mempolicy_sysfs_init(void) +{ + int err; + static struct kobject *mempolicy_kobj; + + mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL); + if (!mempolicy_kobj) { + err = -ENOMEM; + goto err_out; + } + + node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *), + GFP_KERNEL); + if (!node_attrs) { + err = -ENOMEM; + goto mempol_out; + } + + err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj, + "mempolicy"); + if (err) + goto node_out; + + err = add_weighted_interleave_group(mempolicy_kobj); + if (err) { + pr_err("mempolicy sysfs structure failed to initialize\n"); + kobject_put(mempolicy_kobj); + return err; + } + + return err; +node_out: + kfree(node_attrs); +mempol_out: + kfree(mempolicy_kobj); +err_out: + pr_err("failed to add mempolicy kobject to the system\n"); + return err; +} + +late_initcall(mempolicy_sysfs_init); +#endif /* CONFIG_SYSFS */ From 9685e6e30d116d72fb013b0bce261a676b7575c1 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 2 Feb 2024 12:02:36 -0500 Subject: [PATCH 093/435] mm/mempolicy: refactor a read-once mechanism into a function for re-use Move the use of barrier() to force policy->nodemask onto the stack into a function `read_once_policy_nodemask` so that it may be re-used. Link: https://lkml.kernel.org/r/20240202170238.90004-3-gregory.price@memverge.com Signed-off-by: Gregory Price Suggested-by: "Huang, Ying" Reviewed-by: "Huang, Ying" Cc: Dan Williams Cc: Hasan Al Maruf Cc: Honggyu Kim Cc: Hyeongtak Ji Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: Rakie Kim Cc: Ravi Jonnalagadda Cc: Srinivasulu Thanneeru Signed-off-by: Andrew Morton --- mm/mempolicy.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b4fccc921b62..1bdc7d0d1b0b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1905,6 +1905,20 @@ unsigned int mempolicy_slab_node(void) } } +static unsigned int read_once_policy_nodemask(struct mempolicy *pol, + nodemask_t *mask) +{ + /* + * barrier stabilizes the nodemask locally so that it can be iterated + * over safely without concern for changes. Allocators validate node + * selection does not violate mems_allowed, so this is safe. + */ + barrier(); + memcpy(mask, &pol->nodes, sizeof(nodemask_t)); + barrier(); + return nodes_weight(*mask); +} + /* * Do static interleaving for interleave index @ilx. Returns the ilx'th * node in pol->nodes (starting from ilx=0), wrapping around if ilx @@ -1912,20 +1926,12 @@ unsigned int mempolicy_slab_node(void) */ static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { - nodemask_t nodemask = pol->nodes; + nodemask_t nodemask; unsigned int target, nnodes; int i; int nid; - /* - * The barrier will stabilize the nodemask in a register or on - * the stack so that it will stop changing under the code. - * - * Between first_node() and next_node(), pol->nodes could be changed - * by other threads. So we put pol->nodes in a local stack. - */ - barrier(); - nnodes = nodes_weight(nodemask); + nnodes = read_once_policy_nodemask(pol, &nodemask); if (!nnodes) return numa_node_id(); target = ilx % nnodes; From fa3bea4e1f8202d787709b7e3654eb0a99aed758 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 2 Feb 2024 12:02:37 -0500 Subject: [PATCH 094/435] mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf Signed-off-by: Gregory Price Co-developed-by: Rakie Kim Signed-off-by: Rakie Kim Co-developed-by: Honggyu Kim Signed-off-by: Honggyu Kim Co-developed-by: Hyeongtak Ji Signed-off-by: Hyeongtak Ji Co-developed-by: Srinivasulu Thanneeru Signed-off-by: Srinivasulu Thanneeru Co-developed-by: Ravi Jonnalagadda Signed-off-by: Ravi Jonnalagadda Reviewed-by: "Huang, Ying" Cc: Dan Williams Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Signed-off-by: Andrew Morton --- .../admin-guide/mm/numa_memory_policy.rst | 9 + include/linux/sched.h | 1 + include/uapi/linux/mempolicy.h | 1 + mm/mempolicy.c | 218 +++++++++++++++++- 4 files changed, 225 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index eca38fa81e0f..a70f20ce1ffb 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -250,6 +250,15 @@ MPOL_PREFERRED_MANY can fall back to all existing numa nodes. This is effectively MPOL_PREFERRED allowed for a mask rather than a single node. +MPOL_WEIGHTED_INTERLEAVE + This mode operates the same as MPOL_INTERLEAVE, except that + interleaving behavior is executed based on weights set in + /sys/kernel/mm/mempolicy/weighted_interleave/ + + Weighted interleave allocates pages on nodes according to a + weight. For example if nodes [0,1] are weighted [5,2], 5 pages + will be allocated on node0 for every 2 pages allocated on node1. + NUMA memory policy supports the following optional mode flags: MPOL_F_STATIC_NODES diff --git a/include/linux/sched.h b/include/linux/sched.h index ffe8f618ab86..b9ce285d8c9c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1259,6 +1259,7 @@ struct task_struct { /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; + u8 il_weight; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index a8963f7ef4c2..1f9bb10d1a47 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -23,6 +23,7 @@ enum { MPOL_INTERLEAVE, MPOL_LOCAL, MPOL_PREFERRED_MANY, + MPOL_WEIGHTED_INTERLEAVE, MPOL_MAX, /* always last member of enum */ }; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1bdc7d0d1b0b..a8db92c23697 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -19,6 +19,13 @@ * for anonymous memory. For process policy an process counter * is used. * + * weighted interleave + * Allocate memory interleaved over a set of nodes based on + * a set of weights (per-node), with normal fallback if it + * fails. Otherwise operates the same as interleave. + * Example: nodeset(0,1) & weights (2,1) - 2 pages allocated + * on node 0 for every 1 page allocated on node 1. + * * bind Only allocate memory on a specific set of nodes, * no fallback. * FIXME: memory is allocated starting with the first node @@ -441,6 +448,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_preferred, }, + [MPOL_WEIGHTED_INTERLEAVE] = { + .create = mpol_new_nodemask, + .rebind = mpol_rebind_nodemask, + }, }; static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, @@ -858,8 +869,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, old = current->mempolicy; current->mempolicy = new; - if (new && new->mode == MPOL_INTERLEAVE) + if (new && (new->mode == MPOL_INTERLEAVE || + new->mode == MPOL_WEIGHTED_INTERLEAVE)) { current->il_prev = MAX_NUMNODES-1; + current->il_weight = 0; + } task_unlock(current); mpol_put(old); ret = 0; @@ -884,6 +898,7 @@ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: + case MPOL_WEIGHTED_INTERLEAVE: *nodes = pol->nodes; break; case MPOL_LOCAL: @@ -968,6 +983,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { *policy = next_node_in(current->il_prev, pol->nodes); + } else if (pol == current->mempolicy && + pol->mode == MPOL_WEIGHTED_INTERLEAVE) { + if (current->il_weight) + *policy = current->il_prev; + else + *policy = next_node_in(current->il_prev, + pol->nodes); } else { err = -EINVAL; goto out; @@ -1332,7 +1354,8 @@ static long do_mbind(unsigned long start, unsigned long len, * VMAs, the nodes will still be interleaved from the targeted * nodemask, but one by one may be selected differently. */ - if (new->mode == MPOL_INTERLEAVE) { + if (new->mode == MPOL_INTERLEAVE || + new->mode == MPOL_WEIGHTED_INTERLEAVE) { struct page *page; unsigned int order; unsigned long addr = -EFAULT; @@ -1780,7 +1803,8 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup * @order: 0, or appropriate huge_page_order for interleaving - * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE + * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or + * MPOL_WEIGHTED_INTERLEAVE * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. @@ -1797,7 +1821,8 @@ struct mempolicy *get_vma_policy(struct vm_area_struct *vma, pol = __get_vma_policy(vma, addr, ilx); if (!pol) pol = get_task_policy(current); - if (pol->mode == MPOL_INTERLEAVE) { + if (pol->mode == MPOL_INTERLEAVE || + pol->mode == MPOL_WEIGHTED_INTERLEAVE) { *ilx += vma->vm_pgoff >> order; *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); } @@ -1847,6 +1872,22 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) return zone >= dynamic_policy_zone; } +static unsigned int weighted_interleave_nodes(struct mempolicy *policy) +{ + unsigned int node = current->il_prev; + + if (!current->il_weight || !node_isset(node, policy->nodes)) { + node = next_node_in(node, policy->nodes); + /* can only happen if nodemask is being rebound */ + if (node == MAX_NUMNODES) + return node; + current->il_prev = node; + current->il_weight = get_il_weight(node); + } + current->il_weight--; + return node; +} + /* Do dynamic interleaving for a process */ static unsigned int interleave_nodes(struct mempolicy *policy) { @@ -1881,6 +1922,9 @@ unsigned int mempolicy_slab_node(void) case MPOL_INTERLEAVE: return interleave_nodes(policy); + case MPOL_WEIGHTED_INTERLEAVE: + return weighted_interleave_nodes(policy); + case MPOL_BIND: case MPOL_PREFERRED_MANY: { @@ -1919,6 +1963,45 @@ static unsigned int read_once_policy_nodemask(struct mempolicy *pol, return nodes_weight(*mask); } +static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx) +{ + nodemask_t nodemask; + unsigned int target, nr_nodes; + u8 *table; + unsigned int weight_total = 0; + u8 weight; + int nid; + + nr_nodes = read_once_policy_nodemask(pol, &nodemask); + if (!nr_nodes) + return numa_node_id(); + + rcu_read_lock(); + table = rcu_dereference(iw_table); + /* calculate the total weight */ + for_each_node_mask(nid, nodemask) { + /* detect system default usage */ + weight = table ? table[nid] : 1; + weight = weight ? weight : 1; + weight_total += weight; + } + + /* Calculate the node offset based on totals */ + target = ilx % weight_total; + nid = first_node(nodemask); + while (target) { + /* detect system default usage */ + weight = table ? table[nid] : 1; + weight = weight ? weight : 1; + if (target < weight) + break; + target -= weight; + nid = next_node_in(nid, nodemask); + } + rcu_read_unlock(); + return nid; +} + /* * Do static interleaving for interleave index @ilx. Returns the ilx'th * node in pol->nodes (starting from ilx=0), wrapping around if ilx @@ -1979,6 +2062,11 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, *nid = (ilx == NO_INTERLEAVE_INDEX) ? interleave_nodes(pol) : interleave_nid(pol, ilx); break; + case MPOL_WEIGHTED_INTERLEAVE: + *nid = (ilx == NO_INTERLEAVE_INDEX) ? + weighted_interleave_nodes(pol) : + weighted_interleave_nid(pol, ilx); + break; } return nodemask; @@ -2040,6 +2128,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: *mask = mempolicy->nodes; break; @@ -2140,6 +2229,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, * node in its nodemask, we allocate the standard way. */ if (pol->mode != MPOL_INTERLEAVE && + pol->mode != MPOL_WEIGHTED_INTERLEAVE && (!nodemask || node_isset(nid, *nodemask))) { /* * First, try to allocate THP only on local node, but @@ -2275,6 +2365,114 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, return total_allocated; } +static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, + struct mempolicy *pol, unsigned long nr_pages, + struct page **page_array) +{ + struct task_struct *me = current; + unsigned long total_allocated = 0; + unsigned long nr_allocated = 0; + unsigned long rounds; + unsigned long node_pages, delta; + u8 *table, *weights, weight; + unsigned int weight_total = 0; + unsigned long rem_pages = nr_pages; + nodemask_t nodes; + int nnodes, node; + int resume_node = MAX_NUMNODES - 1; + u8 resume_weight = 0; + int prev_node; + int i; + + if (!nr_pages) + return 0; + + nnodes = read_once_policy_nodemask(pol, &nodes); + if (!nnodes) + return 0; + + /* Continue allocating from most recent node and adjust the nr_pages */ + node = me->il_prev; + weight = me->il_weight; + if (weight && node_isset(node, nodes)) { + node_pages = min(rem_pages, weight); + nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, + NULL, page_array); + page_array += nr_allocated; + total_allocated += nr_allocated; + /* if that's all the pages, no need to interleave */ + if (rem_pages <= weight) { + me->il_weight -= rem_pages; + return total_allocated; + } + /* Otherwise we adjust remaining pages, continue from there */ + rem_pages -= weight; + } + /* clear active weight in case of an allocation failure */ + me->il_weight = 0; + prev_node = node; + + /* create a local copy of node weights to operate on outside rcu */ + weights = kzalloc(nr_node_ids, GFP_KERNEL); + if (!weights) + return total_allocated; + + rcu_read_lock(); + table = rcu_dereference(iw_table); + if (table) + memcpy(weights, table, nr_node_ids); + rcu_read_unlock(); + + /* calculate total, detect system default usage */ + for_each_node_mask(node, nodes) { + if (!weights[node]) + weights[node] = 1; + weight_total += weights[node]; + } + + /* + * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls. + * Track which node weighted interleave should resume from. + * + * if (rounds > 0) and (delta == 0), resume_node will always be + * the node following prev_node and its weight. + */ + rounds = rem_pages / weight_total; + delta = rem_pages % weight_total; + resume_node = next_node_in(prev_node, nodes); + resume_weight = weights[resume_node]; + for (i = 0; i < nnodes; i++) { + node = next_node_in(prev_node, nodes); + weight = weights[node]; + node_pages = weight * rounds; + /* If a delta exists, add this node's portion of the delta */ + if (delta > weight) { + node_pages += weight; + delta -= weight; + } else if (delta) { + /* when delta is depleted, resume from that node */ + node_pages += delta; + resume_node = node; + resume_weight = weight - delta; + delta = 0; + } + /* node_pages can be 0 if an allocation fails and rounds == 0 */ + if (!node_pages) + break; + nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, + NULL, page_array); + page_array += nr_allocated; + total_allocated += nr_allocated; + if (total_allocated == nr_pages) + break; + prev_node = node; + } + me->il_prev = resume_node; + me->il_weight = resume_weight; + kfree(weights); + return total_allocated; +} + static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) @@ -2315,6 +2513,10 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, return alloc_pages_bulk_array_interleave(gfp, pol, nr_pages, page_array); + if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) + return alloc_pages_bulk_array_weighted_interleave( + gfp, pol, nr_pages, page_array); + if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_bulk_array_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); @@ -2390,6 +2592,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: + case MPOL_WEIGHTED_INTERLEAVE: return !!nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; @@ -2526,6 +2729,10 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, polnid = interleave_nid(pol, ilx); break; + case MPOL_WEIGHTED_INTERLEAVE: + polnid = weighted_interleave_nid(pol, ilx); + break; + case MPOL_PREFERRED: if (node_isset(curnid, pol->nodes)) goto out; @@ -2900,6 +3107,7 @@ static const char * const policy_modes[] = [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", + [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave", [MPOL_LOCAL] = "local", [MPOL_PREFERRED_MANY] = "prefer (many)", }; @@ -2959,6 +3167,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) } break; case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: /* * Default to online nodes with memory if no nodelist */ @@ -3069,6 +3278,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: nodes = pol->nodes; break; default: From 274519ed414bd2b9a77c5db78ee51778d37ceacf Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 2 Feb 2024 12:02:38 -0500 Subject: [PATCH 095/435] mm/mempolicy: protect task interleave functions with tsk->mems_allowed_seq In the event of rebind, pol->nodemask can change at the same time as an allocation occurs. We can detect this with tsk->mems_allowed_seq and prevent a miscount or an allocation failure from occurring. The same thing happens in the allocators to detect failure, but this can prevent spurious failures in a much smaller critical section. [gourry.memverge@gmail.com: weighted interleave checks wrong parameter] Link: https://lkml.kernel.org/r/20240206192853.3589-1-gregory.price@memverge.com Link: https://lkml.kernel.org/r/20240202170238.90004-5-gregory.price@memverge.com Signed-off-by: Gregory Price Suggested-by: "Huang, Ying" Cc: Dan Williams Cc: Hasan Al Maruf Cc: Honggyu Kim Cc: Hyeongtak Ji Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Cc: Rakie Kim Cc: Ravi Jonnalagadda Cc: Srinivasulu Thanneeru Signed-off-by: Andrew Morton --- mm/mempolicy.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a8db92c23697..56f9a6ed939a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1874,11 +1874,17 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) static unsigned int weighted_interleave_nodes(struct mempolicy *policy) { - unsigned int node = current->il_prev; + unsigned int node; + unsigned int cpuset_mems_cookie; +retry: + /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */ + cpuset_mems_cookie = read_mems_allowed_begin(); + node = current->il_prev; if (!current->il_weight || !node_isset(node, policy->nodes)) { node = next_node_in(node, policy->nodes); - /* can only happen if nodemask is being rebound */ + if (read_mems_allowed_retry(cpuset_mems_cookie)) + goto retry; if (node == MAX_NUMNODES) return node; current->il_prev = node; @@ -1892,8 +1898,14 @@ static unsigned int weighted_interleave_nodes(struct mempolicy *policy) static unsigned int interleave_nodes(struct mempolicy *policy) { unsigned int nid; + unsigned int cpuset_mems_cookie; + + /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */ + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + nid = next_node_in(current->il_prev, policy->nodes); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); - nid = next_node_in(current->il_prev, policy->nodes); if (nid < MAX_NUMNODES) current->il_prev = nid; return nid; @@ -2370,6 +2382,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, struct page **page_array) { struct task_struct *me = current; + unsigned int cpuset_mems_cookie; unsigned long total_allocated = 0; unsigned long nr_allocated = 0; unsigned long rounds; @@ -2387,7 +2400,13 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, if (!nr_pages) return 0; - nnodes = read_once_policy_nodemask(pol, &nodes); + /* read the nodes onto the stack, retry if done during rebind */ + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + nnodes = read_once_policy_nodemask(pol, &nodes); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); + + /* if the nodemask has become invalid, we cannot do anything */ if (!nnodes) return 0; From a90f0a02f139a13d3c26dd20644b50fc731f17da Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Jan 2024 11:34:32 +0100 Subject: [PATCH 096/435] arm: ptdump: rename CONFIG_DEBUG_WX to CONFIG_ARM_DEBUG_WX Patch series "mm: ptdump: Refactor CONFIG_DEBUG_WX and check_wx_pages debugfs attribute", v2. This series refactors CONFIG_DEBUG_WX for the 5 architectures implementing CONFIG_GENERIC_PTDUMP First rename stuff in ARM which uses similar names while not implementing CONFIG_GENERIC_PTDUMP. Then define a generic version of debug_checkwx() that calls ptdump_check_wx() when CONFIG_DEBUG_WX is set. Call it immediately after calling mark_rodata_ro() instead of calling it at the end of every mark_rodata_ro(). Then implement a debugfs attribute that can be used to trigger a W^X test at anytime and regardless of CONFIG_DEBUG_WX This patch (of 5): CONFIG_DEBUG_WX is a core option defined in mm/Kconfig.debug To avoid any future conflict, rename ARM version into CONFIG_ARM_DEBUG_WX. Link: https://lore.kernel.org/lkml/20200422152656.GF676@willie-the-truck/T/#m802eaf33efd6f8d575939d157301b35ac0d4a64f Link: https://github.com/KSPP/linux/issues/35 Link: https://lkml.kernel.org/r/cover.1706610398.git.christophe.leroy@csgroup.eu Link: https://lkml.kernel.org/r/fa297aa90caeb61eee2b70c6c5897a2ab58a9562.1706610398.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Albert Ou Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Cc: Alexandre Ghiti Signed-off-by: Andrew Morton --- arch/arm/Kconfig.debug | 2 +- arch/arm/configs/aspeed_g4_defconfig | 2 +- arch/arm/configs/aspeed_g5_defconfig | 2 +- arch/arm/include/asm/ptdump.h | 6 +++--- arch/arm/mm/init.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index 5fbbac1b708b..f1fc278081d0 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -17,7 +17,7 @@ config ARM_PTDUMP_DEBUGFS kernel. If in doubt, say "N" -config DEBUG_WX +config ARM_DEBUG_WX bool "Warn on W+X mappings at boot" depends on MMU select ARM_PTDUMP_CORE diff --git a/arch/arm/configs/aspeed_g4_defconfig b/arch/arm/configs/aspeed_g4_defconfig index b3dc0465796f..28b724d59e7e 100644 --- a/arch/arm/configs/aspeed_g4_defconfig +++ b/arch/arm/configs/aspeed_g4_defconfig @@ -252,7 +252,7 @@ CONFIG_DEBUG_INFO_REDUCED=y CONFIG_GDB_SCRIPTS=y CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y -CONFIG_DEBUG_WX=y +CONFIG_ARM_DEBUG_WX=y CONFIG_SCHED_STACK_END_CHECK=y CONFIG_PANIC_ON_OOPS=y CONFIG_PANIC_TIMEOUT=-1 diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig index 3fdf4dbfdea5..61cee1e7ebea 100644 --- a/arch/arm/configs/aspeed_g5_defconfig +++ b/arch/arm/configs/aspeed_g5_defconfig @@ -302,7 +302,7 @@ CONFIG_DEBUG_INFO_REDUCED=y CONFIG_GDB_SCRIPTS=y CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_FS=y -CONFIG_DEBUG_WX=y +CONFIG_ARM_DEBUG_WX=y CONFIG_SCHED_STACK_END_CHECK=y CONFIG_PANIC_ON_OOPS=y CONFIG_PANIC_TIMEOUT=-1 diff --git a/arch/arm/include/asm/ptdump.h b/arch/arm/include/asm/ptdump.h index aad1d034136c..46a4575146ee 100644 --- a/arch/arm/include/asm/ptdump.h +++ b/arch/arm/include/asm/ptdump.h @@ -32,10 +32,10 @@ void ptdump_check_wx(void); #endif /* CONFIG_ARM_PTDUMP_CORE */ -#ifdef CONFIG_DEBUG_WX -#define debug_checkwx() ptdump_check_wx() +#ifdef CONFIG_ARM_DEBUG_WX +#define arm_debug_checkwx() ptdump_check_wx() #else -#define debug_checkwx() do { } while (0) +#define arm_debug_checkwx() do { } while (0) #endif #endif /* __ASM_PTDUMP_H */ diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index a42e4cd11db2..4c3d78691279 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -458,7 +458,7 @@ static int __mark_rodata_ro(void *unused) void mark_rodata_ro(void) { stop_machine(__mark_rodata_ro, NULL, NULL); - debug_checkwx(); + arm_debug_checkwx(); } #else From a5e8131a0329673f70faee2e9ffb02e8a5bb3c89 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Jan 2024 11:34:33 +0100 Subject: [PATCH 097/435] arm64, powerpc, riscv, s390, x86: ptdump: refactor CONFIG_DEBUG_WX All architectures using the core ptdump functionality also implement CONFIG_DEBUG_WX, and they all do it more or less the same way, with a function called debug_checkwx() that is called by mark_rodata_ro(), which is a substitute to ptdump_check_wx() when CONFIG_DEBUG_WX is set and a no-op otherwise. Refactor by centrally defining debug_checkwx() in linux/ptdump.h and call debug_checkwx() immediately after calling mark_rodata_ro() instead of calling it at the end of every mark_rodata_ro(). On x86_32, mark_rodata_ro() first checks __supported_pte_mask has _PAGE_NX before calling debug_checkwx(). Now the check is inside the callee ptdump_walk_pgd_level_checkwx(). On powerpc_64, mark_rodata_ro() bails out early before calling ptdump_check_wx() when the MMU doesn't have KERNEL_RO feature. The check is now also done in ptdump_check_wx() as it is called outside mark_rodata_ro(). Link: https://lkml.kernel.org/r/a59b102d7964261d31ead0316a9f18628e4e7a8e.1706610398.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Reviewed-by: Alexandre Ghiti Cc: Albert Ou Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/ptdump.h | 7 ------- arch/arm64/mm/mmu.c | 2 -- arch/powerpc/mm/mmu_decl.h | 6 ------ arch/powerpc/mm/pgtable_32.c | 4 ---- arch/powerpc/mm/pgtable_64.c | 3 --- arch/powerpc/mm/ptdump/ptdump.c | 3 +++ arch/riscv/include/asm/ptdump.h | 22 ---------------------- arch/riscv/mm/init.c | 3 --- arch/riscv/mm/ptdump.c | 1 - arch/s390/include/asm/ptdump.h | 14 -------------- arch/s390/mm/dump_pagetables.c | 1 - arch/s390/mm/init.c | 2 -- arch/x86/include/asm/pgtable.h | 3 +-- arch/x86/mm/dump_pagetables.c | 3 +++ arch/x86/mm/init_32.c | 2 -- arch/x86/mm/init_64.c | 2 -- include/linux/ptdump.h | 7 +++++++ init/main.c | 2 ++ 18 files changed, 16 insertions(+), 71 deletions(-) delete mode 100644 arch/riscv/include/asm/ptdump.h delete mode 100644 arch/s390/include/asm/ptdump.h diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h index 581caac525b0..5b1701c76d1c 100644 --- a/arch/arm64/include/asm/ptdump.h +++ b/arch/arm64/include/asm/ptdump.h @@ -29,13 +29,6 @@ void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name); static inline void ptdump_debugfs_register(struct ptdump_info *info, const char *name) { } #endif -void ptdump_check_wx(void); #endif /* CONFIG_PTDUMP_CORE */ -#ifdef CONFIG_DEBUG_WX -#define debug_checkwx() ptdump_check_wx() -#else -#define debug_checkwx() do { } while (0) -#endif - #endif /* __ASM_PTDUMP_H */ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 1ac7467d34c9..3a27d887f7dd 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -632,8 +632,6 @@ void mark_rodata_ro(void) section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata; update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); - - debug_checkwx(); } static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end, diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 72341b9fb552..90dcc2844056 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -171,12 +171,6 @@ static inline void mmu_mark_rodata_ro(void) { } void __init mmu_mapin_immr(void); #endif -#ifdef CONFIG_DEBUG_WX -void ptdump_check_wx(void); -#else -static inline void ptdump_check_wx(void) { } -#endif - static inline bool debug_pagealloc_enabled_or_kfence(void) { return IS_ENABLED(CONFIG_KFENCE) || debug_pagealloc_enabled(); diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 5c02fd08d61e..12498017da8e 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -153,7 +153,6 @@ void mark_rodata_ro(void) if (v_block_mapped((unsigned long)_stext + 1)) { mmu_mark_rodata_ro(); - ptdump_check_wx(); return; } @@ -166,9 +165,6 @@ void mark_rodata_ro(void) PFN_DOWN((unsigned long)_stext); set_memory_ro((unsigned long)_stext, numpages); - - // mark_initmem_nx() should have already run by now - ptdump_check_wx(); } #endif diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 5ac1fd30341b..1b366526f4f2 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -150,9 +150,6 @@ void mark_rodata_ro(void) radix__mark_rodata_ro(); else hash__mark_rodata_ro(); - - // mark_initmem_nx() should have already run by now - ptdump_check_wx(); } void mark_initmem_nx(void) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 2313053fe679..620d4917ebe8 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -343,6 +343,9 @@ void ptdump_check_wx(void) } }; + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !mmu_has_feature(MMU_FTR_KERNEL_RO)) + return; + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); if (st.wx_pages) diff --git a/arch/riscv/include/asm/ptdump.h b/arch/riscv/include/asm/ptdump.h deleted file mode 100644 index 3c9ea6dd5af7..000000000000 --- a/arch/riscv/include/asm/ptdump.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2019 SiFive - */ - -#ifndef _ASM_RISCV_PTDUMP_H -#define _ASM_RISCV_PTDUMP_H - -void ptdump_check_wx(void); - -#ifdef CONFIG_DEBUG_WX -static inline void debug_checkwx(void) -{ - ptdump_check_wx(); -} -#else -static inline void debug_checkwx(void) -{ -} -#endif - -#endif /* _ASM_RISCV_PTDUMP_H */ diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index fa34cf55037b..a4f218cfb845 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -723,8 +722,6 @@ void mark_rodata_ro(void) if (IS_ENABLED(CONFIG_64BIT)) set_kernel_memory(lm_alias(__start_rodata), lm_alias(_data), set_memory_ro); - - debug_checkwx(); } #else static __init pgprot_t pgprot_from_va(uintptr_t va) diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 657c27bc07a7..075265603313 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -9,7 +9,6 @@ #include #include -#include #include #include diff --git a/arch/s390/include/asm/ptdump.h b/arch/s390/include/asm/ptdump.h deleted file mode 100644 index f960b2896606..000000000000 --- a/arch/s390/include/asm/ptdump.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _ASM_S390_PTDUMP_H -#define _ASM_S390_PTDUMP_H - -void ptdump_check_wx(void); - -static inline void debug_checkwx(void) -{ - if (IS_ENABLED(CONFIG_DEBUG_WX)) - ptdump_check_wx(); -} - -#endif /* _ASM_S390_PTDUMP_H */ diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index d37a8f607b71..8dcb4e0c71bd 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 8d9a60ccb777..f6391442c0c2 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -109,7 +108,6 @@ void mark_rodata_ro(void) __set_memory_ro(__start_ro_after_init, __end_ro_after_init); pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); - debug_checkwx(); } int set_memory_encrypted(unsigned long vaddr, int numpages) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 9d077bca6a10..6c979028e521 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -32,6 +32,7 @@ void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, bool user); void ptdump_walk_pgd_level_checkwx(void); +#define ptdump_check_wx ptdump_walk_pgd_level_checkwx void ptdump_walk_user_pgd_level_checkwx(void); /* @@ -41,10 +42,8 @@ void ptdump_walk_user_pgd_level_checkwx(void); #define pgprot_decrypted(prot) __pgprot(cc_mkdec(pgprot_val(prot))) #ifdef CONFIG_DEBUG_WX -#define debug_checkwx() ptdump_walk_pgd_level_checkwx() #define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx() #else -#define debug_checkwx() do { } while (0) #define debug_checkwx_user() do { } while (0) #endif diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index e1b599ecbbc2..0008524eebe9 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -433,6 +433,9 @@ void ptdump_walk_user_pgd_level_checkwx(void) void ptdump_walk_pgd_level_checkwx(void) { + if (!(__supported_pte_mask & _PAGE_NX)) + return; + ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false); } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index b63403d7179d..5c736b707cae 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -800,6 +800,4 @@ void mark_rodata_ro(void) set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); #endif mark_nxdata_nx(); - if (__supported_pte_mask & _PAGE_NX) - debug_checkwx(); } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a0dffaca6d2b..ebdbcae48011 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1412,8 +1412,6 @@ void mark_rodata_ro(void) (void *)text_end, (void *)rodata_start); free_kernel_image_pages("unused kernel image (rodata/data gap)", (void *)rodata_end, (void *)_sdata); - - debug_checkwx(); } /* diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index 2a3a95586425..c10513739bf9 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -19,5 +19,12 @@ struct ptdump_state { }; void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd); +void ptdump_check_wx(void); + +static inline void debug_checkwx(void) +{ + if (IS_ENABLED(CONFIG_DEBUG_WX)) + ptdump_check_wx(); +} #endif /* _LINUX_PTDUMP_H */ diff --git a/init/main.c b/init/main.c index e24b0780fdff..749a9f8d2c9b 100644 --- a/init/main.c +++ b/init/main.c @@ -99,6 +99,7 @@ #include #include #include +#include #include #include @@ -1408,6 +1409,7 @@ static void mark_readonly(void) */ rcu_barrier(); mark_rodata_ro(); + debug_checkwx(); rodata_test(); } else pr_info("Kernel memory protection disabled.\n"); From 592e15f62f943a77e8732db0da2d8a61c1ad7bf3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Jan 2024 11:34:34 +0100 Subject: [PATCH 098/435] powerpc,s390: ptdump: define ptdump_check_wx() regardless of CONFIG_DEBUG_WX Following patch will use ptdump_check_wx() regardless of CONFIG_DEBUG_WX, so define it at all times on powerpc and s390 just like other architectures. Though keep the WARN_ON_ONCE() only when CONFIG_DEBUG_WX is set. Link: https://lkml.kernel.org/r/07bfb04c7fec58e84413e91d2533581be357a696.1706610398.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/mm/ptdump/ptdump.c | 7 +++---- arch/s390/mm/dump_pagetables.c | 7 ++----- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 620d4917ebe8..b835c80371cd 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -184,13 +184,14 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) { pte_t pte = __pte(st->current_flags); - if (!IS_ENABLED(CONFIG_DEBUG_WX) || !st->check_wx) + if (!st->check_wx) return; if (!pte_write(pte) || !pte_exec(pte)) return; - WARN_ONCE(1, "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n", + WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX), + "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n", (void *)st->start_address, (void *)st->start_address); st->wx_pages += (addr - st->start_address) / PAGE_SIZE; @@ -326,7 +327,6 @@ static void __init build_pgtable_complete_mask(void) pg_level[i].mask |= pg_level[i].flag[j].mask; } -#ifdef CONFIG_DEBUG_WX void ptdump_check_wx(void) { struct pg_state st = { @@ -354,7 +354,6 @@ void ptdump_check_wx(void) else pr_info("Checked W+X mappings: passed, no W+X pages found\n"); } -#endif static int __init ptdump_init(void) { diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 8dcb4e0c71bd..99da5a5602a8 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -121,7 +121,6 @@ static void print_prot(struct seq_file *m, unsigned int pr, int level) static void note_prot_wx(struct pg_state *st, unsigned long addr) { -#ifdef CONFIG_DEBUG_WX if (!st->check_wx) return; if (st->current_prot & _PAGE_INVALID) @@ -138,10 +137,10 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) */ if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear))) return; - WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n", + WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX), + "s390/mm: Found insecure W+X mapping at address %pS\n", (void *)st->start_address); st->wx_pages += (addr - st->start_address) / PAGE_SIZE; -#endif /* CONFIG_DEBUG_WX */ } static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) @@ -193,7 +192,6 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } } -#ifdef CONFIG_DEBUG_WX void ptdump_check_wx(void) { struct pg_state st = { @@ -226,7 +224,6 @@ void ptdump_check_wx(void) (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ? "unexpected " : ""); } -#endif /* CONFIG_DEBUG_WX */ #ifdef CONFIG_PTDUMP_DEBUGFS static int ptdump_show(struct seq_file *m, void *v) From 6cdc82db0c044d36137dd98f33e8aa0b8742987f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Jan 2024 11:34:35 +0100 Subject: [PATCH 099/435] mm: ptdump: have ptdump_check_wx() return bool Have ptdump_check_wx() return true when the check is successful or false otherwise. [akpm@linux-foundation.org: fix a couple of build issues (x86_64 allmodconfig)] Link: https://lkml.kernel.org/r/7943149fe955458cb7b57cd483bf41a3aad94684.1706610398.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/mm/ptdump.c | 11 ++++++++--- arch/powerpc/mm/ptdump/ptdump.c | 13 +++++++++---- arch/riscv/mm/ptdump.c | 11 ++++++++--- arch/s390/mm/dump_pagetables.c | 13 +++++++++---- arch/x86/include/asm/pgtable.h | 2 +- arch/x86/mm/dump_pagetables.c | 23 ++++++++++++++--------- include/linux/ptdump.h | 5 ++++- 7 files changed, 53 insertions(+), 25 deletions(-) diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index e305b6593c4e..696822f75582 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -345,7 +345,7 @@ static struct ptdump_info kernel_ptdump_info = { .base_addr = PAGE_OFFSET, }; -void ptdump_check_wx(void) +bool ptdump_check_wx(void) { struct pg_state st = { .seq = NULL, @@ -366,11 +366,16 @@ void ptdump_check_wx(void) ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages || st.uxn_pages) + if (st.wx_pages || st.uxn_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n", st.wx_pages, st.uxn_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no W+X pages found\n"); + + return true; + } } static int __init ptdump_init(void) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index b835c80371cd..9dc239967b77 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -327,7 +327,7 @@ static void __init build_pgtable_complete_mask(void) pg_level[i].mask |= pg_level[i].flag[j].mask; } -void ptdump_check_wx(void) +bool ptdump_check_wx(void) { struct pg_state st = { .seq = NULL, @@ -344,15 +344,20 @@ void ptdump_check_wx(void) }; if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !mmu_has_feature(MMU_FTR_KERNEL_RO)) - return; + return true; ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages) + if (st.wx_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no W+X pages found\n"); + + return true; + } } static int __init ptdump_init(void) diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 075265603313..1289cc6d3700 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -335,7 +335,7 @@ static void ptdump_walk(struct seq_file *s, struct ptd_mm_info *pinfo) ptdump_walk_pgd(&st.ptdump, pinfo->mm, NULL); } -void ptdump_check_wx(void) +bool ptdump_check_wx(void) { struct pg_state st = { .seq = NULL, @@ -356,11 +356,16 @@ void ptdump_check_wx(void) ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages) + if (st.wx_pages) { pr_warn("Checked W+X mappings: failed, %lu W+X pages found\n", st.wx_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no W+X pages found\n"); + + return true; + } } static int ptdump_show(struct seq_file *m, void *v) diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 99da5a5602a8..ffd07ed7b4af 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -192,7 +192,7 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } } -void ptdump_check_wx(void) +bool ptdump_check_wx(void) { struct pg_state st = { .ptdump = { @@ -215,14 +215,19 @@ void ptdump_check_wx(void) }; if (!MACHINE_HAS_NX) - return; + return true; ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages) + if (st.wx_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no %sW+X pages found\n", (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ? "unexpected " : ""); + + return true; + } } #ifdef CONFIG_PTDUMP_DEBUGFS diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6c979028e521..b50b2ef63672 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -31,7 +31,7 @@ struct seq_file; void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, bool user); -void ptdump_walk_pgd_level_checkwx(void); +bool ptdump_walk_pgd_level_checkwx(void); #define ptdump_check_wx ptdump_walk_pgd_level_checkwx void ptdump_walk_user_pgd_level_checkwx(void); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 0008524eebe9..35b2cfd47914 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -362,9 +362,9 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } } -static void ptdump_walk_pgd_level_core(struct seq_file *m, - struct mm_struct *mm, pgd_t *pgd, - bool checkwx, bool dmesg) +bool ptdump_walk_pgd_level_core(struct seq_file *m, + struct mm_struct *mm, pgd_t *pgd, + bool checkwx, bool dmesg) { const struct ptdump_range ptdump_ranges[] = { #ifdef CONFIG_X86_64 @@ -391,12 +391,17 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, ptdump_walk_pgd(&st.ptdump, mm, pgd); if (!checkwx) - return; - if (st.wx_pages) + return true; + if (st.wx_pages) { pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n", st.wx_pages); - else + + return false; + } else { pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n"); + + return true; + } } void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm) @@ -431,12 +436,12 @@ void ptdump_walk_user_pgd_level_checkwx(void) #endif } -void ptdump_walk_pgd_level_checkwx(void) +bool ptdump_walk_pgd_level_checkwx(void) { if (!(__supported_pte_mask & _PAGE_NX)) - return; + return true; - ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false); + return ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false); } static int __init pt_dump_init(void) diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index c10513739bf9..8dbd51ea8626 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -18,8 +18,11 @@ struct ptdump_state { const struct ptdump_range *range; }; +bool ptdump_walk_pgd_level_core(struct seq_file *m, + struct mm_struct *mm, pgd_t *pgd, + bool checkwx, bool dmesg); void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd); -void ptdump_check_wx(void); +bool ptdump_check_wx(void); static inline void debug_checkwx(void) { From 565474afe08ad28de0d655c5ed36aab3702cb93d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Jan 2024 11:34:36 +0100 Subject: [PATCH 100/435] mm: ptdump: add check_wx_pages debugfs attribute Add a readable attribute in debugfs to trigger a W^X pages check at any time. To trigger the test, just read /sys/kernel/debug/check_wx_pages It will report FAILED if the test failed, SUCCESS otherwise. Detailed result is provided into dmesg. Link: https://lkml.kernel.org/r/e947fb1a9f3f5466344823e532d343ff194ae03d.1706610398.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/ptdump.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/mm/ptdump.c b/mm/ptdump.c index 03c1bdae4a43..106e1d66e9f9 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include @@ -163,3 +164,24 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd) /* Flush out the last page */ st->note_page(st, 0, -1, 0); } + +static int check_wx_show(struct seq_file *m, void *v) +{ + if (ptdump_check_wx()) + seq_puts(m, "SUCCESS\n"); + else + seq_puts(m, "FAILED\n"); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(check_wx); + +static int ptdump_debugfs_init(void) +{ + debugfs_create_file("check_wx_pages", 0400, NULL, NULL, &check_wx_fops); + + return 0; +} + +device_initcall(ptdump_debugfs_init); From d818c98a52cce29da4f35b73e640bf293036fa03 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 1 Feb 2024 08:07:14 +0530 Subject: [PATCH 101/435] mm/cma: don't treat bad input arguments for cma_alloc() as its failure Invalid cma_alloc() input scenarios - including excess allocation request should neither be counted as CMA_ALLOC_FAIL nor 'cma->nr_pages_failed' be updated when applicable with CONFIG_CMA_SYSFS. This also drops 'out' jump label which has become redundant. Link: https://lkml.kernel.org/r/20240201023714.3871061-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Kalesh Singh Cc: Minchan Kim Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- mm/cma.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index e12cf41d8354..b6720930312d 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -441,13 +441,13 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, trace_cma_alloc_start(name, count, align); if (!cma || !cma->count || !cma->bitmap) - goto out; + return page; pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__, (void *)cma, cma->name, count, align); if (!count) - goto out; + return page; mask = cma_bitmap_aligned_mask(cma, align); offset = cma_bitmap_aligned_offset(cma, align); @@ -455,7 +455,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, bitmap_count = cma_bitmap_pages_to_bits(cma, count); if (bitmap_count > bitmap_maxno) - goto out; + return page; for (;;) { spin_lock_irq(&cma->lock); @@ -514,15 +514,13 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, } pr_debug("%s(): returned %p\n", __func__, page); -out: trace_cma_alloc_finish(name, pfn, page, count, align, ret); if (page) { count_vm_event(CMA_ALLOC_SUCCESS); cma_sysfs_account_success_pages(cma, count); } else { count_vm_event(CMA_ALLOC_FAIL); - if (cma) - cma_sysfs_account_fail_pages(cma, count); + cma_sysfs_account_fail_pages(cma, count); } return page; From a60cc288a1a2604bd86d3df129f269887018c3cb Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 31 Jan 2024 14:51:24 -0800 Subject: [PATCH 102/435] test_xarray: add tests for advanced multi-index use Patch series "test_xarray: advanced API multi-index tests", v2. This is a respin of the test_xarray multi-index tests [0] which use and demonstrate the advanced API which is used by the page cache. This should let folks more easily follow how we use multi-index to support for example a min order later in the page cache. It also lets us grow the selftests to mimic more of what we do in the page cache. This patch (of 2): The multi index selftests are great but they don't replicate how we deal with the page cache exactly, which makes it a bit hard to follow as the page cache uses the advanced API. Add tests which use the advanced API, mimicking what we do in the page cache, while at it, extend the example to do what is needed for min order support. [mcgrof@kernel.org: fix soft lockup for advanced-api tests] Link: https://lkml.kernel.org/r/20240216194329.840555-1-mcgrof@kernel.org [akpm@linux-foundation.org: s/i/loops/, make non-static] [akpm@linux-foundation.org: restore static storage for loop counter] Link: https://lkml.kernel.org/r/20240131225125.1370598-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20240131225125.1370598-2-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Tested-by: Daniel Gomez Cc: Darrick J. Wong Cc: Dave Chinner Cc: Hannes Reinecke Cc: Matthew Wilcox Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- lib/test_xarray.c | 176 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) diff --git a/lib/test_xarray.c b/lib/test_xarray.c index e77d4856442c..1050e9113d2a 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -674,6 +674,181 @@ static noinline void check_multi_store(struct xarray *xa) #endif } +#ifdef CONFIG_XARRAY_MULTI +/* mimics page cache __filemap_add_folio() */ +static noinline void check_xa_multi_store_adv_add(struct xarray *xa, + unsigned long index, + unsigned int order, + void *p) +{ + XA_STATE(xas, xa, index); + unsigned int nrpages = 1UL << order; + + /* users are responsible for index alignemnt to the order when adding */ + XA_BUG_ON(xa, index & (nrpages - 1)); + + xas_set_order(&xas, index, order); + + do { + xas_lock_irq(&xas); + + xas_store(&xas, p); + XA_BUG_ON(xa, xas_error(&xas)); + XA_BUG_ON(xa, xa_load(xa, index) != p); + + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + XA_BUG_ON(xa, xas_error(&xas)); +} + +/* mimics page_cache_delete() */ +static noinline void check_xa_multi_store_adv_del_entry(struct xarray *xa, + unsigned long index, + unsigned int order) +{ + XA_STATE(xas, xa, index); + + xas_set_order(&xas, index, order); + xas_store(&xas, NULL); + xas_init_marks(&xas); +} + +static noinline void check_xa_multi_store_adv_delete(struct xarray *xa, + unsigned long index, + unsigned int order) +{ + xa_lock_irq(xa); + check_xa_multi_store_adv_del_entry(xa, index, order); + xa_unlock_irq(xa); +} + +/* mimics page cache filemap_get_entry() */ +static noinline void *test_get_entry(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + void *p; + static unsigned int loops = 0; + + rcu_read_lock(); +repeat: + xas_reset(&xas); + p = xas_load(&xas); + if (xas_retry(&xas, p)) + goto repeat; + rcu_read_unlock(); + + /* + * This is not part of the page cache, this selftest is pretty + * aggressive and does not want to trust the xarray API but rather + * test it, and for order 20 (4 GiB block size) we can loop over + * over a million entries which can cause a soft lockup. Page cache + * APIs won't be stupid, proper page cache APIs loop over the proper + * order so when using a larger order we skip shared entries. + */ + if (++loops % XA_CHECK_SCHED == 0) + schedule(); + + return p; +} + +static unsigned long some_val = 0xdeadbeef; +static unsigned long some_val_2 = 0xdeaddead; + +/* mimics the page cache usage */ +static noinline void check_xa_multi_store_adv(struct xarray *xa, + unsigned long pos, + unsigned int order) +{ + unsigned int nrpages = 1UL << order; + unsigned long index, base, next_index, next_next_index; + unsigned int i; + + index = pos >> PAGE_SHIFT; + base = round_down(index, nrpages); + next_index = round_down(base + nrpages, nrpages); + next_next_index = round_down(next_index + nrpages, nrpages); + + check_xa_multi_store_adv_add(xa, base, order, &some_val); + + for (i = 0; i < nrpages; i++) + XA_BUG_ON(xa, test_get_entry(xa, base + i) != &some_val); + + XA_BUG_ON(xa, test_get_entry(xa, next_index) != NULL); + + /* Use order 0 for the next item */ + check_xa_multi_store_adv_add(xa, next_index, 0, &some_val_2); + XA_BUG_ON(xa, test_get_entry(xa, next_index) != &some_val_2); + + /* Remove the next item */ + check_xa_multi_store_adv_delete(xa, next_index, 0); + + /* Now use order for a new pointer */ + check_xa_multi_store_adv_add(xa, next_index, order, &some_val_2); + + for (i = 0; i < nrpages; i++) + XA_BUG_ON(xa, test_get_entry(xa, next_index + i) != &some_val_2); + + check_xa_multi_store_adv_delete(xa, next_index, order); + check_xa_multi_store_adv_delete(xa, base, order); + XA_BUG_ON(xa, !xa_empty(xa)); + + /* starting fresh again */ + + /* let's test some holes now */ + + /* hole at base and next_next */ + check_xa_multi_store_adv_add(xa, next_index, order, &some_val_2); + + for (i = 0; i < nrpages; i++) + XA_BUG_ON(xa, test_get_entry(xa, base + i) != NULL); + + for (i = 0; i < nrpages; i++) + XA_BUG_ON(xa, test_get_entry(xa, next_index + i) != &some_val_2); + + for (i = 0; i < nrpages; i++) + XA_BUG_ON(xa, test_get_entry(xa, next_next_index + i) != NULL); + + check_xa_multi_store_adv_delete(xa, next_index, order); + XA_BUG_ON(xa, !xa_empty(xa)); + + /* hole at base and next */ + + check_xa_multi_store_adv_add(xa, next_next_index, order, &some_val_2); + + for (i = 0; i < nrpages; i++) + XA_BUG_ON(xa, test_get_entry(xa, base + i) != NULL); + + for (i = 0; i < nrpages; i++) + XA_BUG_ON(xa, test_get_entry(xa, next_index + i) != NULL); + + for (i = 0; i < nrpages; i++) + XA_BUG_ON(xa, test_get_entry(xa, next_next_index + i) != &some_val_2); + + check_xa_multi_store_adv_delete(xa, next_next_index, order); + XA_BUG_ON(xa, !xa_empty(xa)); +} +#endif + +static noinline void check_multi_store_advanced(struct xarray *xa) +{ +#ifdef CONFIG_XARRAY_MULTI + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1; + unsigned long end = ULONG_MAX/2; + unsigned long pos, i; + + /* + * About 117 million tests below. + */ + for (pos = 7; pos < end; pos = (pos * pos) + 564) { + for (i = 0; i < max_order; i++) { + check_xa_multi_store_adv(xa, pos, i); + check_xa_multi_store_adv(xa, pos + 157, i); + } + } +#endif +} + static noinline void check_xa_alloc_1(struct xarray *xa, unsigned int base) { int i; @@ -1804,6 +1979,7 @@ static int xarray_checks(void) check_reserve(&array); check_reserve(&xa0); check_multi_store(&array); + check_multi_store_advanced(&array); check_get_order(&array); check_xa_alloc(); check_find(&array); From e777ae44e33e48ad01bfdc978076b03c1f091b4f Mon Sep 17 00:00:00 2001 From: Daniel Gomez Date: Wed, 31 Jan 2024 14:51:25 -0800 Subject: [PATCH 103/435] XArray: add cmpxchg order test XArray multi-index entries do not keep track of the order stored once the entry is being marked as used with cmpxchg (conditionally replaced with NULL). Add a test to check the order is actually lost. The test also verifies the order and entries for all the tied indexes before and after the NULL replacement with xa_cmpxchg. Add another entry at 1 << order that keeps the node around and the order information for the NULL-entry after xa_cmpxchg. Link: https://lkml.kernel.org/r/20240131225125.1370598-3-mcgrof@kernel.org Signed-off-by: Daniel Gomez Signed-off-by: Luis Chamberlain Cc: Darrick J. Wong Cc: Dave Chinner Cc: Hannes Reinecke Cc: Matthew Wilcox Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- lib/test_xarray.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 1050e9113d2a..ebe2af2e072d 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -423,6 +423,59 @@ static noinline void check_cmpxchg(struct xarray *xa) XA_BUG_ON(xa, !xa_empty(xa)); } +static noinline void check_cmpxchg_order(struct xarray *xa) +{ +#ifdef CONFIG_XARRAY_MULTI + void *FIVE = xa_mk_value(5); + unsigned int i, order = 3; + + XA_BUG_ON(xa, xa_store_order(xa, 0, order, FIVE, GFP_KERNEL)); + + /* Check entry FIVE has the order saved */ + XA_BUG_ON(xa, xa_get_order(xa, xa_to_value(FIVE)) != order); + + /* Check all the tied indexes have the same entry and order */ + for (i = 0; i < (1 << order); i++) { + XA_BUG_ON(xa, xa_load(xa, i) != FIVE); + XA_BUG_ON(xa, xa_get_order(xa, i) != order); + } + + /* Ensure that nothing is stored at index '1 << order' */ + XA_BUG_ON(xa, xa_load(xa, 1 << order) != NULL); + + /* + * Additionally, keep the node information and the order at + * '1 << order' + */ + XA_BUG_ON(xa, xa_store_order(xa, 1 << order, order, FIVE, GFP_KERNEL)); + for (i = (1 << order); i < (1 << order) + (1 << order) - 1; i++) { + XA_BUG_ON(xa, xa_load(xa, i) != FIVE); + XA_BUG_ON(xa, xa_get_order(xa, i) != order); + } + + /* Conditionally replace FIVE entry at index '0' with NULL */ + XA_BUG_ON(xa, xa_cmpxchg(xa, 0, FIVE, NULL, GFP_KERNEL) != FIVE); + + /* Verify the order is lost at FIVE (and old) entries */ + XA_BUG_ON(xa, xa_get_order(xa, xa_to_value(FIVE)) != 0); + + /* Verify the order and entries are lost in all the tied indexes */ + for (i = 0; i < (1 << order); i++) { + XA_BUG_ON(xa, xa_load(xa, i) != NULL); + XA_BUG_ON(xa, xa_get_order(xa, i) != 0); + } + + /* Verify node and order are kept at '1 << order' */ + for (i = (1 << order); i < (1 << order) + (1 << order) - 1; i++) { + XA_BUG_ON(xa, xa_load(xa, i) != FIVE); + XA_BUG_ON(xa, xa_get_order(xa, i) != order); + } + + xa_store_order(xa, 0, BITS_PER_LONG - 1, NULL, GFP_KERNEL); + XA_BUG_ON(xa, !xa_empty(xa)); +#endif +} + static noinline void check_reserve(struct xarray *xa) { void *entry; @@ -1976,6 +2029,7 @@ static int xarray_checks(void) check_xas_erase(&array); check_insert(&array); check_cmpxchg(&array); + check_cmpxchg_order(&array); check_reserve(&array); check_reserve(&xa0); check_multi_store(&array); From eb1521dad8f391d3f3b88f589db27a288b55b8ed Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 31 Jan 2024 09:56:18 -0800 Subject: [PATCH 104/435] userfaultfd: handle zeropage moves by UFFDIO_MOVE Current implementation of UFFDIO_MOVE fails to move zeropages and returns EBUSY when it encounters one. We can handle them by mapping a zeropage at the destination and clearing the mapping at the source. This is done both for ordinary and for huge zeropages. Link: https://lkml.kernel.org/r/20240131175618.2417291-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202401300107.U8iMAkTl-lkp@intel.com/ Cc: Alexander Viro Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: Christian Brauner Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jann Horn Cc: Kalesh Singh Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (IBM) Cc: Nicolas Geoffray Cc: Peter Xu Cc: Ryan Roberts Cc: Shuah Khan Cc: ZhangPeng Signed-off-by: Andrew Morton --- mm/huge_memory.c | 111 +++++++++++++++++++++++++++-------------------- mm/userfaultfd.c | 44 ++++++++++++++++--- 2 files changed, 101 insertions(+), 54 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f005f0424735..016e20bd813e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2200,13 +2200,18 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm } src_page = pmd_page(src_pmdval); - if (unlikely(!PageAnonExclusive(src_page))) { - spin_unlock(src_ptl); - return -EBUSY; - } - src_folio = page_folio(src_page); - folio_get(src_folio); + if (!is_huge_zero_pmd(src_pmdval)) { + if (unlikely(!PageAnonExclusive(src_page))) { + spin_unlock(src_ptl); + return -EBUSY; + } + + src_folio = page_folio(src_page); + folio_get(src_folio); + } else + src_folio = NULL; + spin_unlock(src_ptl); flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE); @@ -2214,19 +2219,22 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm src_addr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); - folio_lock(src_folio); + if (src_folio) { + folio_lock(src_folio); - /* - * split_huge_page walks the anon_vma chain without the page - * lock. Serialize against it with the anon_vma lock, the page - * lock is not enough. - */ - src_anon_vma = folio_get_anon_vma(src_folio); - if (!src_anon_vma) { - err = -EAGAIN; - goto unlock_folio; - } - anon_vma_lock_write(src_anon_vma); + /* + * split_huge_page walks the anon_vma chain without the page + * lock. Serialize against it with the anon_vma lock, the page + * lock is not enough. + */ + src_anon_vma = folio_get_anon_vma(src_folio); + if (!src_anon_vma) { + err = -EAGAIN; + goto unlock_folio; + } + anon_vma_lock_write(src_anon_vma); + } else + src_anon_vma = NULL; dst_ptl = pmd_lockptr(mm, dst_pmd); double_pt_lock(src_ptl, dst_ptl); @@ -2235,45 +2243,54 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm err = -EAGAIN; goto unlock_ptls; } - if (folio_maybe_dma_pinned(src_folio) || - !PageAnonExclusive(&src_folio->page)) { - err = -EBUSY; - goto unlock_ptls; + if (src_folio) { + if (folio_maybe_dma_pinned(src_folio) || + !PageAnonExclusive(&src_folio->page)) { + err = -EBUSY; + goto unlock_ptls; + } + + if (WARN_ON_ONCE(!folio_test_head(src_folio)) || + WARN_ON_ONCE(!folio_test_anon(src_folio))) { + err = -EBUSY; + goto unlock_ptls; + } + + folio_move_anon_rmap(src_folio, dst_vma); + WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); + + src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); + /* Folio got pinned from under us. Put it back and fail the move. */ + if (folio_maybe_dma_pinned(src_folio)) { + set_pmd_at(mm, src_addr, src_pmd, src_pmdval); + err = -EBUSY; + goto unlock_ptls; + } + + _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); + /* Follow mremap() behavior and treat the entry dirty after the move */ + _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); + } else { + src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); + _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot); } - - if (WARN_ON_ONCE(!folio_test_head(src_folio)) || - WARN_ON_ONCE(!folio_test_anon(src_folio))) { - err = -EBUSY; - goto unlock_ptls; - } - - folio_move_anon_rmap(src_folio, dst_vma); - WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); - - src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); - /* Folio got pinned from under us. Put it back and fail the move. */ - if (folio_maybe_dma_pinned(src_folio)) { - set_pmd_at(mm, src_addr, src_pmd, src_pmdval); - err = -EBUSY; - goto unlock_ptls; - } - - _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); - /* Follow mremap() behavior and treat the entry dirty after the move */ - _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd); src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd); pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable); unlock_ptls: double_pt_unlock(src_ptl, dst_ptl); - anon_vma_unlock_write(src_anon_vma); - put_anon_vma(src_anon_vma); + if (src_anon_vma) { + anon_vma_unlock_write(src_anon_vma); + put_anon_vma(src_anon_vma); + } unlock_folio: /* unblock rmap walks */ - folio_unlock(src_folio); + if (src_folio) + folio_unlock(src_folio); mmu_notifier_invalidate_range_end(&range); - folio_put(src_folio); + if (src_folio) + folio_put(src_folio); return err; } #endif /* CONFIG_USERFAULTFD */ diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ae80c3714829..9cc93cc1330b 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -959,6 +959,33 @@ static int move_swap_pte(struct mm_struct *mm, return 0; } +static int move_zeropage_pte(struct mm_struct *mm, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, + unsigned long dst_addr, unsigned long src_addr, + pte_t *dst_pte, pte_t *src_pte, + pte_t orig_dst_pte, pte_t orig_src_pte, + spinlock_t *dst_ptl, spinlock_t *src_ptl) +{ + pte_t zero_pte; + + double_pt_lock(dst_ptl, src_ptl); + if (!pte_same(ptep_get(src_pte), orig_src_pte) || + !pte_same(ptep_get(dst_pte), orig_dst_pte)) { + double_pt_unlock(dst_ptl, src_ptl); + return -EAGAIN; + } + + zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), + dst_vma->vm_page_prot)); + ptep_clear_flush(src_vma, src_addr, src_pte); + set_pte_at(mm, dst_addr, dst_pte, zero_pte); + double_pt_unlock(dst_ptl, src_ptl); + + return 0; +} + + /* * The mmap_lock for reading is held by the caller. Just move the page * from src_pmd to dst_pmd if possible, and return true if succeeded @@ -1041,6 +1068,14 @@ retry: } if (pte_present(orig_src_pte)) { + if (is_zero_pfn(pte_pfn(orig_src_pte))) { + err = move_zeropage_pte(mm, dst_vma, src_vma, + dst_addr, src_addr, dst_pte, src_pte, + orig_dst_pte, orig_src_pte, + dst_ptl, src_ptl); + goto out; + } + /* * Pin and lock both source folio and anon_vma. Since we are in * RCU read section, we can't block, so on contention have to @@ -1404,19 +1439,14 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, err = -ENOENT; break; } - /* Avoid moving zeropages for now */ - if (is_huge_zero_pmd(*src_pmd)) { - spin_unlock(ptl); - err = -EBUSY; - break; - } /* Check if we can move the pmd without splitting it. */ if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) || !pmd_none(dst_pmdval)) { struct folio *folio = pfn_folio(pmd_pfn(*src_pmd)); - if (!folio || !PageAnonExclusive(&folio->page)) { + if (!folio || (!is_huge_zero_page(&folio->page) && + !PageAnonExclusive(&folio->page))) { spin_unlock(ptl); err = -EBUSY; break; From 4838cf70e5395ba85e19f6ec4a621ec41de7defd Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:08 +0500 Subject: [PATCH 105/435] selftests/mm: map_fixed_noreplace: conform test to TAP format output Patch series "conform tests to TAP format output", v2. This patch (of 12): Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. While at it, convert commenting style from // to /**/. Link: https://lkml.kernel.org/r/20240202113119.2047740-1-usama.anjum@collabora.com Link: https://lkml.kernel.org/r/20240202113119.2047740-2-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/mm/map_fixed_noreplace.c | 96 ++++++------------- 1 file changed, 31 insertions(+), 65 deletions(-) diff --git a/tools/testing/selftests/mm/map_fixed_noreplace.c b/tools/testing/selftests/mm/map_fixed_noreplace.c index 598159f3df1f..b74813fdc951 100644 --- a/tools/testing/selftests/mm/map_fixed_noreplace.c +++ b/tools/testing/selftests/mm/map_fixed_noreplace.c @@ -12,6 +12,7 @@ #include #include #include +#include "../kselftest.h" static void dump_maps(void) { @@ -28,15 +29,12 @@ static unsigned long find_base_addr(unsigned long size) flags = MAP_PRIVATE | MAP_ANONYMOUS; addr = mmap(NULL, size, PROT_NONE, flags, -1, 0); - if (addr == MAP_FAILED) { - printf("Error: couldn't map the space we need for the test\n"); - return 0; - } + if (addr == MAP_FAILED) + ksft_exit_fail_msg("Error: couldn't map the space we need for the test\n"); + + if (munmap(addr, size) != 0) + ksft_exit_fail_msg("Error: munmap failed\n"); - if (munmap(addr, size) != 0) { - printf("Error: couldn't map the space we need for the test\n"); - return 0; - } return (unsigned long)addr; } @@ -46,51 +44,39 @@ int main(void) unsigned long flags, addr, size, page_size; char *p; + ksft_print_header(); + ksft_set_plan(9); + page_size = sysconf(_SC_PAGE_SIZE); - //let's find a base addr that is free before we start the tests + /* let's find a base addr that is free before we start the tests */ size = 5 * page_size; base_addr = find_base_addr(size); - if (!base_addr) { - printf("Error: couldn't map the space we need for the test\n"); - return 1; - } flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE; - // Check we can map all the areas we need below - errno = 0; + /* Check we can map all the areas we need below */ addr = base_addr; size = 5 * page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - if (p == MAP_FAILED) { dump_maps(); - printf("Error: couldn't map the space we need for the test\n"); - return 1; + ksft_exit_fail_msg("Error: couldn't map the space we need for the test\n"); } - - errno = 0; if (munmap((void *)addr, 5 * page_size) != 0) { dump_maps(); - printf("Error: munmap failed!?\n"); - return 1; + ksft_exit_fail_msg("Error: munmap failed!?\n"); } - printf("unmap() successful\n"); + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - errno = 0; addr = base_addr + page_size; size = 3 * page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - if (p == MAP_FAILED) { dump_maps(); - printf("Error: first mmap() failed unexpectedly\n"); - return 1; + ksft_exit_fail_msg("Error: first mmap() failed unexpectedly\n"); } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); /* * Exact same mapping again: @@ -100,17 +86,14 @@ int main(void) * +3 | mapped | new * +4 | free | new */ - errno = 0; addr = base_addr; size = 5 * page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - if (p != MAP_FAILED) { dump_maps(); - printf("Error:1: mmap() succeeded when it shouldn't have\n"); - return 1; + ksft_exit_fail_msg("Error:1: mmap() succeeded when it shouldn't have\n"); } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); /* * Second mapping contained within first: @@ -121,17 +104,14 @@ int main(void) * +3 | mapped | * +4 | free | */ - errno = 0; addr = base_addr + (2 * page_size); size = page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - if (p != MAP_FAILED) { dump_maps(); - printf("Error:2: mmap() succeeded when it shouldn't have\n"); - return 1; + ksft_exit_fail_msg("Error:2: mmap() succeeded when it shouldn't have\n"); } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); /* * Overlap end of existing mapping: @@ -141,17 +121,14 @@ int main(void) * +3 | mapped | new * +4 | free | new */ - errno = 0; addr = base_addr + (3 * page_size); size = 2 * page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - if (p != MAP_FAILED) { dump_maps(); - printf("Error:3: mmap() succeeded when it shouldn't have\n"); - return 1; + ksft_exit_fail_msg("Error:3: mmap() succeeded when it shouldn't have\n"); } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); /* * Overlap start of existing mapping: @@ -161,17 +138,14 @@ int main(void) * +3 | mapped | * +4 | free | */ - errno = 0; addr = base_addr; size = 2 * page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - if (p != MAP_FAILED) { dump_maps(); - printf("Error:4: mmap() succeeded when it shouldn't have\n"); - return 1; + ksft_exit_fail_msg("Error:4: mmap() succeeded when it shouldn't have\n"); } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); /* * Adjacent to start of existing mapping: @@ -181,17 +155,14 @@ int main(void) * +3 | mapped | * +4 | free | */ - errno = 0; addr = base_addr; size = page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - if (p == MAP_FAILED) { dump_maps(); - printf("Error:5: mmap() failed when it shouldn't have\n"); - return 1; + ksft_exit_fail_msg("Error:5: mmap() failed when it shouldn't have\n"); } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); /* * Adjacent to end of existing mapping: @@ -201,27 +172,22 @@ int main(void) * +3 | mapped | * +4 | free | new */ - errno = 0; addr = base_addr + (4 * page_size); size = page_size; p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - if (p == MAP_FAILED) { dump_maps(); - printf("Error:6: mmap() failed when it shouldn't have\n"); - return 1; + ksft_exit_fail_msg("Error:6: mmap() failed when it shouldn't have\n"); } + ksft_test_result_pass("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); addr = base_addr; size = 5 * page_size; if (munmap((void *)addr, size) != 0) { dump_maps(); - printf("Error: munmap failed!?\n"); - return 1; + ksft_exit_fail_msg("Error: munmap failed!?\n"); } - printf("unmap() successful\n"); + ksft_test_result_pass("Base Address unmap() successful\n"); - printf("OK\n"); - return 0; + ksft_finished(); } From d1e7bf2c70d6c211640aa40d490f801753647b47 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:09 +0500 Subject: [PATCH 106/435] selftests/mm: map_hugetlb: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Link: https://lkml.kernel.org/r/20240202113119.2047740-3-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/map_hugetlb.c | 42 +++++++++++------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/mm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c index 86e8f2048a40..a1f005a90a4f 100644 --- a/tools/testing/selftests/mm/map_hugetlb.c +++ b/tools/testing/selftests/mm/map_hugetlb.c @@ -16,6 +16,7 @@ #include #include #include "vm_util.h" +#include "../kselftest.h" #define LENGTH (256UL*1024*1024) #define PROTECTION (PROT_READ | PROT_WRITE) @@ -31,7 +32,7 @@ static void check_bytes(char *addr) { - printf("First hex is %x\n", *((unsigned int *)addr)); + ksft_print_msg("First hex is %x\n", *((unsigned int *)addr)); } static void write_bytes(char *addr, size_t length) @@ -42,23 +43,21 @@ static void write_bytes(char *addr, size_t length) *(addr + i) = (char)i; } -static int read_bytes(char *addr, size_t length) +static void read_bytes(char *addr, size_t length) { unsigned long i; check_bytes(addr); for (i = 0; i < length; i++) - if (*(addr + i) != (char)i) { - printf("Mismatch at %lu\n", i); - return 1; - } - return 0; + if (*(addr + i) != (char)i) + ksft_exit_fail_msg("Mismatch at %lu\n", i); + + ksft_test_result_pass("Read correct data\n"); } int main(int argc, char **argv) { void *addr; - int ret; size_t hugepage_size; size_t length = LENGTH; int flags = FLAGS; @@ -69,6 +68,9 @@ int main(int argc, char **argv) if (hugepage_size > length) length = hugepage_size; + ksft_print_header(); + ksft_set_plan(1); + if (argc > 1) length = atol(argv[1]) << 20; if (argc > 2) { @@ -78,27 +80,23 @@ int main(int argc, char **argv) } if (shift) - printf("%u kB hugepages\n", 1 << (shift - 10)); + ksft_print_msg("%u kB hugepages\n", 1 << (shift - 10)); else - printf("Default size hugepages\n"); - printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20); + ksft_print_msg("Default size hugepages\n"); + ksft_print_msg("Mapping %lu Mbytes\n", (unsigned long)length >> 20); addr = mmap(ADDR, length, PROTECTION, flags, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (addr == MAP_FAILED) + ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); - printf("Returned address is %p\n", addr); + ksft_print_msg("Returned address is %p\n", addr); check_bytes(addr); write_bytes(addr, length); - ret = read_bytes(addr, length); + read_bytes(addr, length); /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ - if (munmap(addr, length)) { - perror("munmap"); - exit(1); - } + if (munmap(addr, length)) + ksft_exit_fail_msg("munmap: %s\n", strerror(errno)); - return ret; + ksft_finished(); } From 7ef98513c75f9cf68494d231a26513c5e05ef619 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:10 +0500 Subject: [PATCH 107/435] selftests/mm: map_populate: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Minor cleanups have also been included. Link: https://lkml.kernel.org/r/20240202113119.2047740-4-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/map_populate.c | 37 ++++++++++++++--------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/mm/map_populate.c b/tools/testing/selftests/mm/map_populate.c index 7945d0754875..5c8a53869b1b 100644 --- a/tools/testing/selftests/mm/map_populate.c +++ b/tools/testing/selftests/mm/map_populate.c @@ -16,19 +16,21 @@ #include #include #include +#include "../kselftest.h" #define MMAP_SZ 4096 -#define BUG_ON(condition, description) \ - do { \ - if (condition) { \ - fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \ - __LINE__, (description), strerror(errno)); \ - exit(1); \ - } \ +#define BUG_ON(condition, description) \ + do { \ + if (condition) \ + ksft_exit_fail_msg("[FAIL]\t%s:%d\t%s:%s\n", \ + __func__, __LINE__, (description), \ + strerror(errno)); \ } while (0) -static int parent_f(int sock, unsigned long *smap, int child) +#define TESTS_IN_CHILD 2 + +static void parent_f(int sock, unsigned long *smap, int child) { int status, ret; @@ -43,9 +45,10 @@ static int parent_f(int sock, unsigned long *smap, int child) BUG_ON(ret <= 0, "write(sock)"); waitpid(child, &status, 0); - BUG_ON(!WIFEXITED(status), "child in unexpected state"); - return WEXITSTATUS(status); + /* The ksft macros don't keep counters between processes */ + ksft_cnt.ksft_pass = WEXITSTATUS(status); + ksft_cnt.ksft_fail = TESTS_IN_CHILD - WEXITSTATUS(status); } static int child_f(int sock, unsigned long *smap, int fd) @@ -64,10 +67,11 @@ static int child_f(int sock, unsigned long *smap, int fd) ret = read(sock, &buf, sizeof(int)); BUG_ON(ret <= 0, "read(sock)"); - BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page"); - BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted"); + ksft_test_result(*smap != 0x22222BAD, "MAP_POPULATE COW private page\n"); + ksft_test_result(*smap == 0xdeadbabe, "The mapping state\n"); - return 0; + /* The ksft macros don't keep counters between processes */ + return ksft_cnt.ksft_pass; } int main(int argc, char **argv) @@ -76,6 +80,9 @@ int main(int argc, char **argv) FILE *ftmp; unsigned long *smap; + ksft_print_header(); + ksft_set_plan(TESTS_IN_CHILD); + ftmp = tmpfile(); BUG_ON(!ftmp, "tmpfile()"); @@ -101,7 +108,9 @@ int main(int argc, char **argv) ret = close(sock[0]); BUG_ON(ret, "close()"); - return parent_f(sock[1], smap, child); + parent_f(sock[1], smap, child); + + ksft_finished(); } ret = close(sock[1]); From 244ae27161b183dfb042c2b91ba29f73e7b56ea1 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:11 +0500 Subject: [PATCH 108/435] selftests/mm: mlock-random-test: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Link: https://lkml.kernel.org/r/20240202113119.2047740-5-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../testing/selftests/mm/mlock-random-test.c | 136 +++++++----------- 1 file changed, 54 insertions(+), 82 deletions(-) diff --git a/tools/testing/selftests/mm/mlock-random-test.c b/tools/testing/selftests/mm/mlock-random-test.c index 1fba77df7f62..1cd80b0f76c3 100644 --- a/tools/testing/selftests/mm/mlock-random-test.c +++ b/tools/testing/selftests/mm/mlock-random-test.c @@ -13,6 +13,7 @@ #include #include #include +#include "../kselftest.h" #include "mlock2.h" #define CHUNK_UNIT (128 * 1024) @@ -31,14 +32,14 @@ int set_cap_limits(rlim_t max) new.rlim_cur = max; new.rlim_max = max; if (setrlimit(RLIMIT_MEMLOCK, &new)) { - perror("setrlimit() returns error\n"); + ksft_perror("setrlimit() returns error\n"); return -1; } /* drop capabilities including CAP_IPC_LOCK */ if (cap_set_proc(cap)) { - perror("cap_set_proc() returns error\n"); - return -2; + ksft_perror("cap_set_proc() returns error\n"); + return -1; } return 0; @@ -52,27 +53,24 @@ int get_proc_locked_vm_size(void) unsigned long lock_size = 0; f = fopen("/proc/self/status", "r"); - if (!f) { - perror("fopen"); - return -1; - } + if (!f) + ksft_exit_fail_msg("fopen: %s\n", strerror(errno)); while (fgets(line, 1024, f)) { if (strstr(line, "VmLck")) { ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size); if (ret <= 0) { - printf("sscanf() on VmLck error: %s: %d\n", - line, ret); fclose(f); - return -1; + ksft_exit_fail_msg("sscanf() on VmLck error: %s: %d\n", + line, ret); } fclose(f); return (int)(lock_size << 10); } } - perror("cannot parse VmLck in /proc/self/status\n"); fclose(f); + ksft_exit_fail_msg("cannot parse VmLck in /proc/self/status: %s\n", strerror(errno)); return -1; } @@ -91,10 +89,8 @@ int get_proc_page_size(unsigned long addr) size_t size; smaps = seek_to_smaps_entry(addr); - if (!smaps) { - printf("Unable to parse /proc/self/smaps\n"); - return 0; - } + if (!smaps) + ksft_exit_fail_msg("Unable to parse /proc/self/smaps\n"); while (getline(&line, &size, smaps) > 0) { if (!strstr(line, "MMUPageSize")) { @@ -105,12 +101,9 @@ int get_proc_page_size(unsigned long addr) } /* found the MMUPageSize of this section */ - if (sscanf(line, "MMUPageSize: %8lu kB", - &mmupage_size) < 1) { - printf("Unable to parse smaps entry for Size:%s\n", - line); - break; - } + if (sscanf(line, "MMUPageSize: %8lu kB", &mmupage_size) < 1) + ksft_exit_fail_msg("Unable to parse smaps entry for Size:%s\n", + line); } free(line); @@ -136,7 +129,7 @@ int get_proc_page_size(unsigned long addr) * return value: 0 - success * else: failure */ -int test_mlock_within_limit(char *p, int alloc_size) +static void test_mlock_within_limit(char *p, int alloc_size) { int i; int ret = 0; @@ -145,11 +138,9 @@ int test_mlock_within_limit(char *p, int alloc_size) int page_size = 0; getrlimit(RLIMIT_MEMLOCK, &cur); - if (cur.rlim_cur < alloc_size) { - printf("alloc_size[%d] < %u rlimit,lead to mlock failure\n", - alloc_size, (unsigned int)cur.rlim_cur); - return -1; - } + if (cur.rlim_cur < alloc_size) + ksft_exit_fail_msg("alloc_size[%d] < %u rlimit,lead to mlock failure\n", + alloc_size, (unsigned int)cur.rlim_cur); srand(time(NULL)); for (i = 0; i < TEST_LOOP; i++) { @@ -169,13 +160,11 @@ int test_mlock_within_limit(char *p, int alloc_size) ret = mlock2_(p + start_offset, lock_size, MLOCK_ONFAULT); - if (ret) { - printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n", - is_mlock ? "mlock" : "mlock2", - p, alloc_size, - p + start_offset, lock_size); - return ret; - } + if (ret) + ksft_exit_fail_msg("%s() failure at |%p(%d)| mlock:|%p(%d)|\n", + is_mlock ? "mlock" : "mlock2", + p, alloc_size, + p + start_offset, lock_size); } /* @@ -183,18 +172,12 @@ int test_mlock_within_limit(char *p, int alloc_size) */ locked_vm_size = get_proc_locked_vm_size(); page_size = get_proc_page_size((unsigned long)p); - if (page_size == 0) { - printf("cannot get proc MMUPageSize\n"); - return -1; - } - if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) { - printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n", - locked_vm_size, alloc_size); - return -1; - } + if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) + ksft_exit_fail_msg("%s left VmLck:%d on %d chunk\n", + __func__, locked_vm_size, alloc_size); - return 0; + ksft_test_result_pass("%s\n", __func__); } @@ -213,7 +196,7 @@ int test_mlock_within_limit(char *p, int alloc_size) * return value: 0 - success * else: failure */ -int test_mlock_outof_limit(char *p, int alloc_size) +static void test_mlock_outof_limit(char *p, int alloc_size) { int i; int ret = 0; @@ -221,11 +204,9 @@ int test_mlock_outof_limit(char *p, int alloc_size) struct rlimit cur; getrlimit(RLIMIT_MEMLOCK, &cur); - if (cur.rlim_cur >= alloc_size) { - printf("alloc_size[%d] >%u rlimit, violates test condition\n", - alloc_size, (unsigned int)cur.rlim_cur); - return -1; - } + if (cur.rlim_cur >= alloc_size) + ksft_exit_fail_msg("alloc_size[%d] >%u rlimit, violates test condition\n", + alloc_size, (unsigned int)cur.rlim_cur); old_locked_vm_size = get_proc_locked_vm_size(); srand(time(NULL)); @@ -240,56 +221,47 @@ int test_mlock_outof_limit(char *p, int alloc_size) else ret = mlock2_(p + start_offset, lock_size, MLOCK_ONFAULT); - if (ret == 0) { - printf("%s() succeeds? on %p(%d) mlock%p(%d)\n", - is_mlock ? "mlock" : "mlock2", - p, alloc_size, - p + start_offset, lock_size); - return -1; - } + if (ret == 0) + ksft_exit_fail_msg("%s() succeeds? on %p(%d) mlock%p(%d)\n", + is_mlock ? "mlock" : "mlock2", + p, alloc_size, p + start_offset, lock_size); } locked_vm_size = get_proc_locked_vm_size(); - if (locked_vm_size != old_locked_vm_size) { - printf("tests leads to new mlocked page: old[%d], new[%d]\n", - old_locked_vm_size, - locked_vm_size); - return -1; - } + if (locked_vm_size != old_locked_vm_size) + ksft_exit_fail_msg("tests leads to new mlocked page: old[%d], new[%d]\n", + old_locked_vm_size, + locked_vm_size); - return 0; + ksft_test_result_pass("%s\n", __func__); } int main(int argc, char **argv) { char *p = NULL; - int ret = 0; + + ksft_print_header(); if (set_cap_limits(MLOCK_RLIMIT_SIZE)) - return -1; + ksft_finished(); + + ksft_set_plan(2); p = malloc(MLOCK_WITHIN_LIMIT_SIZE); - if (p == NULL) { - perror("malloc() failure\n"); - return -1; - } - ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE); - if (ret) - return ret; + if (p == NULL) + ksft_exit_fail_msg("malloc() failure: %s\n", strerror(errno)); + + test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE); munlock(p, MLOCK_WITHIN_LIMIT_SIZE); free(p); - p = malloc(MLOCK_OUTOF_LIMIT_SIZE); - if (p == NULL) { - perror("malloc() failure\n"); - return -1; - } - ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE); - if (ret) - return ret; + if (p == NULL) + ksft_exit_fail_msg("malloc() failure: %s\n", strerror(errno)); + + test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE); munlock(p, MLOCK_OUTOF_LIMIT_SIZE); free(p); - return 0; + ksft_finished(); } From 65c89684896d25568b9188e1fcdc10f1de487f2d Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:12 +0500 Subject: [PATCH 109/435] selftests/mm: mlock2-tests: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. I've done some cleanups as well. Link: https://lkml.kernel.org/r/20240202113119.2047740-6-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mlock2-tests.c | 282 +++++++++------------- tools/testing/selftests/mm/mlock2.h | 11 +- 2 files changed, 118 insertions(+), 175 deletions(-) diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c index 80cddc0de206..26f744188ad0 100644 --- a/tools/testing/selftests/mm/mlock2-tests.c +++ b/tools/testing/selftests/mm/mlock2-tests.c @@ -7,9 +7,8 @@ #include #include #include -#include "mlock2.h" - #include "../kselftest.h" +#include "mlock2.h" struct vm_boundaries { unsigned long start; @@ -40,14 +39,14 @@ static int get_vm_area(unsigned long addr, struct vm_boundaries *area) while(fgets(line, 1024, file)) { end_addr = strchr(line, '-'); if (!end_addr) { - printf("cannot parse /proc/self/maps\n"); + ksft_print_msg("cannot parse /proc/self/maps\n"); goto out; } *end_addr = '\0'; end_addr++; stop = strchr(end_addr, ' '); if (!stop) { - printf("cannot parse /proc/self/maps\n"); + ksft_print_msg("cannot parse /proc/self/maps\n"); goto out; } @@ -78,7 +77,7 @@ static bool is_vmflag_set(unsigned long addr, const char *vmflag) smaps = seek_to_smaps_entry(addr); if (!smaps) { - printf("Unable to parse /proc/self/smaps\n"); + ksft_print_msg("Unable to parse /proc/self/smaps\n"); goto out; } @@ -115,7 +114,7 @@ static unsigned long get_value_for_name(unsigned long addr, const char *name) smaps = seek_to_smaps_entry(addr); if (!smaps) { - printf("Unable to parse /proc/self/smaps\n"); + ksft_print_msg("Unable to parse /proc/self/smaps\n"); goto out; } @@ -129,7 +128,7 @@ static unsigned long get_value_for_name(unsigned long addr, const char *name) value_ptr = line + strlen(name); if (sscanf(value_ptr, "%lu kB", &value) < 1) { - printf("Unable to parse smaps entry for Size\n"); + ksft_print_msg("Unable to parse smaps entry for Size\n"); goto out; } break; @@ -180,57 +179,45 @@ static int lock_check(unsigned long addr) static int unlock_lock_check(char *map) { if (is_vmflag_set((unsigned long)map, LOCKED)) { - printf("VMA flag %s is present on page 1 after unlock\n", LOCKED); + ksft_print_msg("VMA flag %s is present on page 1 after unlock\n", LOCKED); return 1; } return 0; } -static int test_mlock_lock() +static void test_mlock_lock(void) { char *map; - int ret = 1; unsigned long page_size = getpagesize(); map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("test_mlock_locked mmap"); - goto out; - } + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s", strerror(errno)); if (mlock2_(map, 2 * page_size, 0)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("mlock2(0)"); - goto unmap; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("mlock2(0): %s\n", strerror(errno)); } - if (!lock_check((unsigned long)map)) - goto unmap; + ksft_test_result(lock_check((unsigned long)map), "%s: Locked\n", __func__); /* Now unlock and recheck attributes */ if (munlock(map, 2 * page_size)) { - perror("munlock()"); - goto unmap; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("munlock(): %s\n", strerror(errno)); } - ret = unlock_lock_check(map); - -unmap: + ksft_test_result(!unlock_lock_check(map), "%s: Locked\n", __func__); munmap(map, 2 * page_size); -out: - return ret; } static int onfault_check(char *map) { *map = 'a'; if (!is_vma_lock_on_fault((unsigned long)map)) { - printf("VMA is not marked for lock on fault\n"); + ksft_print_msg("VMA is not marked for lock on fault\n"); return 1; } @@ -243,172 +230,131 @@ static int unlock_onfault_check(char *map) if (is_vma_lock_on_fault((unsigned long)map) || is_vma_lock_on_fault((unsigned long)map + page_size)) { - printf("VMA is still lock on fault after unlock\n"); + ksft_print_msg("VMA is still lock on fault after unlock\n"); return 1; } return 0; } -static int test_mlock_onfault() +static void test_mlock_onfault(void) { char *map; - int ret = 1; unsigned long page_size = getpagesize(); map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("test_mlock_locked mmap"); - goto out; - } + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s", strerror(errno)); if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("mlock2(MLOCK_ONFAULT)"); - goto unmap; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("mlock2(MLOCK_ONFAULT): %s\n", strerror(errno)); } - if (onfault_check(map)) - goto unmap; + ksft_test_result(!onfault_check(map), "%s: VMA marked for lock on fault\n", __func__); /* Now unlock and recheck attributes */ if (munlock(map, 2 * page_size)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("munlock()"); - goto unmap; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("munlock(): %s\n", strerror(errno)); } - ret = unlock_onfault_check(map); -unmap: + ksft_test_result(!unlock_onfault_check(map), "VMA open lock after fault\n"); munmap(map, 2 * page_size); -out: - return ret; } -static int test_lock_onfault_of_present() +static void test_lock_onfault_of_present(void) { char *map; - int ret = 1; unsigned long page_size = getpagesize(); map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("test_mlock_locked mmap"); - goto out; - } + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s", strerror(errno)); *map = 'a'; if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("mlock2(MLOCK_ONFAULT)"); - goto unmap; + munmap(map, 2 * page_size); + ksft_test_result_fail("mlock2(MLOCK_ONFAULT) error: %s", strerror(errno)); } - if (!is_vma_lock_on_fault((unsigned long)map) || - !is_vma_lock_on_fault((unsigned long)map + page_size)) { - printf("VMA with present pages is not marked lock on fault\n"); - goto unmap; - } - ret = 0; -unmap: + ksft_test_result(is_vma_lock_on_fault((unsigned long)map) || + is_vma_lock_on_fault((unsigned long)map + page_size), + "VMA with present pages is not marked lock on fault\n"); munmap(map, 2 * page_size); -out: - return ret; } -static int test_munlockall() +static void test_munlockall0(void) { char *map; - int ret = 1; unsigned long page_size = getpagesize(); map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - - if (map == MAP_FAILED) { - perror("test_munlockall mmap"); - goto out; - } + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s\n", strerror(errno)); if (mlockall(MCL_CURRENT)) { - perror("mlockall(MCL_CURRENT)"); - goto out; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("mlockall(MCL_CURRENT): %s\n", strerror(errno)); } - if (!lock_check((unsigned long)map)) - goto unmap; + ksft_test_result(lock_check((unsigned long)map), "%s: Locked memory area\n", __func__); if (munlockall()) { - perror("munlockall()"); - goto unmap; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("munlockall(): %s\n", strerror(errno)); } - if (unlock_lock_check(map)) - goto unmap; - + ksft_test_result(!unlock_lock_check(map), "%s: No locked memory\n", __func__); munmap(map, 2 * page_size); +} + +static void test_munlockall1(void) +{ + char *map; + unsigned long page_size = getpagesize(); map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - - if (map == MAP_FAILED) { - perror("test_munlockall second mmap"); - goto out; - } + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s", strerror(errno)); if (mlockall(MCL_CURRENT | MCL_ONFAULT)) { - perror("mlockall(MCL_CURRENT | MCL_ONFAULT)"); - goto unmap; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("mlockall(MCL_CURRENT | MCL_ONFAULT): %s\n", strerror(errno)); } - if (onfault_check(map)) - goto unmap; + ksft_test_result(!onfault_check(map), "%s: VMA marked for lock on fault\n", __func__); if (munlockall()) { - perror("munlockall()"); - goto unmap; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("munlockall(): %s\n", strerror(errno)); } - if (unlock_onfault_check(map)) - goto unmap; + ksft_test_result(!unlock_onfault_check(map), "%s: Unlocked\n", __func__); if (mlockall(MCL_CURRENT | MCL_FUTURE)) { - perror("mlockall(MCL_CURRENT | MCL_FUTURE)"); - goto out; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("mlockall(MCL_CURRENT | MCL_FUTURE): %s\n", strerror(errno)); } - if (!lock_check((unsigned long)map)) - goto unmap; + ksft_test_result(lock_check((unsigned long)map), "%s: Locked\n", __func__); if (munlockall()) { - perror("munlockall()"); - goto unmap; + munmap(map, 2 * page_size); + ksft_exit_fail_msg("munlockall() %s\n", strerror(errno)); } - ret = unlock_lock_check(map); - -unmap: + ksft_test_result(!unlock_lock_check(map), "%s: No locked memory\n", __func__); munmap(map, 2 * page_size); -out: - munlockall(); - return ret; } -static int test_vma_management(bool call_mlock) +static void test_vma_management(bool call_mlock) { - int ret = 1; void *map; unsigned long page_size = getpagesize(); struct vm_boundaries page1; @@ -417,25 +363,19 @@ static int test_vma_management(bool call_mlock) map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("mmap()"); - return ret; - } + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s", strerror(errno)); if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("mlock(ONFAULT)\n"); - goto out; + munmap(map, 3 * page_size); + ksft_test_result_fail("mlock error: %s", strerror(errno)); } if (get_vm_area((unsigned long)map, &page1) || get_vm_area((unsigned long)map + page_size, &page2) || get_vm_area((unsigned long)map + page_size * 2, &page3)) { - printf("couldn't find mapping in /proc/self/maps\n"); - goto out; + munmap(map, 3 * page_size); + ksft_test_result_fail("couldn't find mapping in /proc/self/maps"); } /* @@ -444,76 +384,86 @@ static int test_vma_management(bool call_mlock) * not a failure) */ if (page1.start != page2.start || page2.start != page3.start) { - printf("VMAs are not merged to start, aborting test\n"); - ret = 0; - goto out; + munmap(map, 3 * page_size); + ksft_test_result_fail("VMAs are not merged to start, aborting test"); } if (munlock(map + page_size, page_size)) { - perror("munlock()"); - goto out; + munmap(map, 3 * page_size); + ksft_test_result_fail("munlock(): %s", strerror(errno)); } if (get_vm_area((unsigned long)map, &page1) || get_vm_area((unsigned long)map + page_size, &page2) || get_vm_area((unsigned long)map + page_size * 2, &page3)) { - printf("couldn't find mapping in /proc/self/maps\n"); - goto out; + munmap(map, 3 * page_size); + ksft_test_result_fail("couldn't find mapping in /proc/self/maps"); } /* All three VMAs should be different */ if (page1.start == page2.start || page2.start == page3.start) { - printf("failed to split VMA for munlock\n"); - goto out; + munmap(map, 3 * page_size); + ksft_test_result_fail("failed to split VMA for munlock"); } /* Now unlock the first and third page and check the VMAs again */ if (munlock(map, page_size * 3)) { - perror("munlock()"); - goto out; + munmap(map, 3 * page_size); + ksft_test_result_fail("munlock(): %s", strerror(errno)); } if (get_vm_area((unsigned long)map, &page1) || get_vm_area((unsigned long)map + page_size, &page2) || get_vm_area((unsigned long)map + page_size * 2, &page3)) { - printf("couldn't find mapping in /proc/self/maps\n"); - goto out; + munmap(map, 3 * page_size); + ksft_test_result_fail("couldn't find mapping in /proc/self/maps"); } /* Now all three VMAs should be the same */ if (page1.start != page2.start || page2.start != page3.start) { - printf("failed to merge VMAs after munlock\n"); - goto out; + munmap(map, 3 * page_size); + ksft_test_result_fail("failed to merge VMAs after munlock"); } - ret = 0; -out: + ksft_test_result_pass("%s call_mlock %d\n", __func__, call_mlock); munmap(map, 3 * page_size); - return ret; } -static int test_mlockall(int (test_function)(bool call_mlock)) +static void test_mlockall(void) { - int ret = 1; + if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) + ksft_exit_fail_msg("mlockall failed: %s\n", strerror(errno)); - if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) { - perror("mlockall"); - return ret; - } - - ret = test_function(false); + test_vma_management(false); munlockall(); - return ret; } int main(int argc, char **argv) { - int ret = 0; - ret += test_mlock_lock(); - ret += test_mlock_onfault(); - ret += test_munlockall(); - ret += test_lock_onfault_of_present(); - ret += test_vma_management(true); - ret += test_mlockall(test_vma_management); - return ret; + int ret, size = 3 * getpagesize(); + void *map; + + ksft_print_header(); + + map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap error: %s", strerror(errno)); + + ret = mlock2_(map, size, MLOCK_ONFAULT); + if (ret && errno == ENOSYS) + ksft_finished(); + + munmap(map, size); + + ksft_set_plan(13); + + test_mlock_lock(); + test_mlock_onfault(); + test_munlockall0(); + test_munlockall1(); + test_lock_onfault_of_present(); + test_vma_management(true); + test_mlockall(); + + ksft_finished(); } diff --git a/tools/testing/selftests/mm/mlock2.h b/tools/testing/selftests/mm/mlock2.h index 8e02991b313c..4417eaa5cfb7 100644 --- a/tools/testing/selftests/mm/mlock2.h +++ b/tools/testing/selftests/mm/mlock2.h @@ -6,12 +6,7 @@ static int mlock2_(void *start, size_t len, int flags) { -#ifdef __NR_mlock2 return syscall(__NR_mlock2, start, len, flags); -#else - errno = ENOSYS; - return -1; -#endif } static FILE *seek_to_smaps_entry(unsigned long addr) @@ -27,10 +22,8 @@ static FILE *seek_to_smaps_entry(unsigned long addr) char path[BUFSIZ]; file = fopen("/proc/self/smaps", "r"); - if (!file) { - perror("fopen smaps"); - _exit(1); - } + if (!file) + ksft_exit_fail_msg("fopen smaps: %s\n", strerror(errno)); while (getline(&line, &size, file) > 0) { if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n", From 746f356f1170f340100cc30b9ffa100968e81a24 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:13 +0500 Subject: [PATCH 110/435] selftests/mm: mrelease_test: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Link: https://lkml.kernel.org/r/20240202113119.2047740-7-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mrelease_test.c | 80 +++++++++------------- 1 file changed, 33 insertions(+), 47 deletions(-) diff --git a/tools/testing/selftests/mm/mrelease_test.c b/tools/testing/selftests/mm/mrelease_test.c index d822004a374e..100370a7111d 100644 --- a/tools/testing/selftests/mm/mrelease_test.c +++ b/tools/testing/selftests/mm/mrelease_test.c @@ -26,19 +26,15 @@ static int alloc_noexit(unsigned long nr_pages, int pipefd) buf = (char *)mmap(NULL, nr_pages * psize(), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, 0, 0); - if (buf == MAP_FAILED) { - perror("mmap failed, halting the test"); - return KSFT_FAIL; - } + if (buf == MAP_FAILED) + ksft_exit_fail_msg("mmap failed, halting the test: %s\n", strerror(errno)); for (i = 0; i < nr_pages; i++) *((unsigned long *)(buf + (i * psize()))) = i; /* Signal the parent that the child is ready */ - if (write(pipefd, "", 1) < 0) { - perror("write"); - return KSFT_FAIL; - } + if (write(pipefd, "", 1) < 0) + ksft_exit_fail_msg("write: %s\n", strerror(errno)); /* Wait to be killed (when reparenting happens) */ while (getppid() == ppid && timeout > 0) { @@ -54,23 +50,17 @@ static int alloc_noexit(unsigned long nr_pages, int pipefd) /* The process_mrelease calls in this test are expected to fail */ static void run_negative_tests(int pidfd) { - int res; /* Test invalid flags. Expect to fail with EINVAL error code. */ if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) || errno != EINVAL) { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease with wrong flags"); - exit(res); + ksft_exit_fail_msg("process_mrelease with wrong flags: %s\n", strerror(errno)); } /* * Test reaping while process is alive with no pending SIGKILL. * Expect to fail with EINVAL error code. */ - if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease on a live process"); - exit(res); - } + if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) + ksft_exit_fail_msg("process_mrelease on a live process: %s\n", strerror(errno)); } static int child_main(int pipefd[], size_t size) @@ -93,11 +83,18 @@ int main(void) char byte; int res; + ksft_print_header(); + ksft_set_plan(1); + /* Test a wrong pidfd */ if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease with wrong pidfd"); - exit(res); + if (errno == ENOSYS) { + ksft_test_result_skip("process_mrelease not implemented\n"); + ksft_finished(); + } else { + ksft_exit_fail_msg("process_mrelease with wrong pidfd: %s", + strerror(errno)); + } } /* Start the test with 1MB child memory allocation */ @@ -107,16 +104,14 @@ retry: * Pipe for the child to signal when it's done allocating * memory */ - if (pipe(pipefd)) { - perror("pipe"); - exit(KSFT_FAIL); - } + if (pipe(pipefd)) + ksft_exit_fail_msg("pipe: %s\n", strerror(errno)); + pid = fork(); if (pid < 0) { - perror("fork"); close(pipefd[0]); close(pipefd[1]); - exit(KSFT_FAIL); + ksft_exit_fail_msg("fork: %s\n", strerror(errno)); } if (pid == 0) { @@ -134,28 +129,23 @@ retry: res = read(pipefd[0], &byte, 1); close(pipefd[0]); if (res < 0) { - perror("read"); if (!kill(pid, SIGKILL)) waitpid(pid, NULL, 0); - exit(KSFT_FAIL); + ksft_exit_fail_msg("read: %s\n", strerror(errno)); } pidfd = syscall(__NR_pidfd_open, pid, 0); if (pidfd < 0) { - perror("pidfd_open"); if (!kill(pid, SIGKILL)) waitpid(pid, NULL, 0); - exit(KSFT_FAIL); + ksft_exit_fail_msg("pidfd_open: %s\n", strerror(errno)); } /* Run negative tests which require a live child */ run_negative_tests(pidfd); - if (kill(pid, SIGKILL)) { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("kill"); - exit(res); - } + if (kill(pid, SIGKILL)) + ksft_exit_fail_msg("kill: %s\n", strerror(errno)); success = (syscall(__NR_process_mrelease, pidfd, 0) == 0); if (!success) { @@ -169,18 +159,15 @@ retry: if (errno == ESRCH) { retry = (size <= MAX_SIZE_MB); } else { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease"); waitpid(pid, NULL, 0); - exit(res); + ksft_exit_fail_msg("process_mrelease: %s\n", strerror(errno)); } } /* Cleanup to prevent zombies */ - if (waitpid(pid, NULL, 0) < 0) { - perror("waitpid"); - exit(KSFT_FAIL); - } + if (waitpid(pid, NULL, 0) < 0) + ksft_exit_fail_msg("waitpid: %s\n", strerror(errno)); + close(pidfd); if (!success) { @@ -188,11 +175,10 @@ retry: size *= 2; goto retry; } - printf("All process_mrelease attempts failed!\n"); - exit(KSFT_FAIL); + ksft_exit_fail_msg("All process_mrelease attempts failed!\n"); } - printf("Success reaping a child with %zuMB of memory allocations\n", - size); - return KSFT_PASS; + ksft_test_result_pass("Success reaping a child with %zuMB of memory allocations\n", + size); + ksft_finished(); } From a0d470578587213814e724fb37b3cd7c95910264 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:14 +0500 Subject: [PATCH 111/435] selftests/mm: mremap_dontunmap: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Link: https://lkml.kernel.org/r/20240202113119.2047740-8-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mremap_dontunmap.c | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/mm/mremap_dontunmap.c b/tools/testing/selftests/mm/mremap_dontunmap.c index a06e73ec8568..1d75084b9ca5 100644 --- a/tools/testing/selftests/mm/mremap_dontunmap.c +++ b/tools/testing/selftests/mm/mremap_dontunmap.c @@ -27,14 +27,14 @@ static void dump_maps(void) system(cmd); } -#define BUG_ON(condition, description) \ - do { \ - if (condition) { \ - fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \ - __LINE__, (description), strerror(errno)); \ - dump_maps(); \ - exit(1); \ - } \ +#define BUG_ON(condition, description) \ + do { \ + if (condition) { \ + dump_maps(); \ + ksft_exit_fail_msg("[FAIL]\t%s:%d\t%s:%s\n", \ + __func__, __LINE__, (description), \ + strerror(errno)); \ + } \ } while (0) // Try a simple operation for to "test" for kernel support this prevents @@ -122,6 +122,7 @@ static void mremap_dontunmap_simple() "unable to unmap destination mapping"); BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, "unable to unmap source mapping"); + ksft_test_result_pass("%s\n", __func__); } // This test validates that MREMAP_DONTUNMAP on a shared mapping works as expected. @@ -173,6 +174,7 @@ static void mremap_dontunmap_simple_shmem() "unable to unmap destination mapping"); BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, "unable to unmap source mapping"); + ksft_test_result_pass("%s\n", __func__); } // This test validates MREMAP_DONTUNMAP will move page tables to a specific @@ -219,6 +221,7 @@ static void mremap_dontunmap_simple_fixed() "unable to unmap destination mapping"); BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, "unable to unmap source mapping"); + ksft_test_result_pass("%s\n", __func__); } // This test validates that we can MREMAP_DONTUNMAP for a portion of an @@ -269,6 +272,7 @@ static void mremap_dontunmap_partial_mapping() "unable to unmap destination mapping"); BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, "unable to unmap source mapping"); + ksft_test_result_pass("%s\n", __func__); } // This test validates that we can remap over only a portion of a mapping. @@ -328,19 +332,24 @@ static void mremap_dontunmap_partial_mapping_overwrite(void) "unable to unmap destination mapping"); BUG_ON(munmap(source_mapping, 5 * page_size) == -1, "unable to unmap source mapping"); + ksft_test_result_pass("%s\n", __func__); } int main(void) { + ksft_print_header(); + page_size = sysconf(_SC_PAGE_SIZE); // test for kernel support for MREMAP_DONTUNMAP skipping the test if // not. if (kernel_support_for_mremap_dontunmap() != 0) { - printf("No kernel support for MREMAP_DONTUNMAP\n"); - return KSFT_SKIP; + ksft_print_msg("No kernel support for MREMAP_DONTUNMAP\n"); + ksft_finished(); } + ksft_set_plan(5); + // Keep a page sized buffer around for when we need it. page_buffer = mmap(NULL, page_size, PROT_READ | PROT_WRITE, @@ -356,6 +365,5 @@ int main(void) BUG_ON(munmap(page_buffer, page_size) == -1, "unable to unmap page buffer"); - printf("OK\n"); - return 0; + ksft_finished(); } From 735887041a456152d31099c9a8cc24a2f6fecec3 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:15 +0500 Subject: [PATCH 112/435] selftests/mm: split_huge_page_test: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Link: https://lkml.kernel.org/r/20240202113119.2047740-9-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/mm/split_huge_page_test.c | 163 ++++++++---------- 1 file changed, 70 insertions(+), 93 deletions(-) diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 0e74635c8c3d..7b698a848bab 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -17,6 +17,7 @@ #include #include #include "vm_util.h" +#include "../kselftest.h" uint64_t pagesize; unsigned int pageshift; @@ -50,21 +51,19 @@ int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file) return 0; } -static int write_file(const char *path, const char *buf, size_t buflen) +static void write_file(const char *path, const char *buf, size_t buflen) { int fd; ssize_t numwritten; fd = open(path, O_WRONLY); if (fd == -1) - return 0; + ksft_exit_fail_msg("%s open failed: %s\n", path, strerror(errno)); numwritten = write(fd, buf, buflen - 1); close(fd); if (numwritten < 1) - return 0; - - return (unsigned int) numwritten; + ksft_exit_fail_msg("Write failed\n"); } static void write_debugfs(const char *fmt, ...) @@ -77,15 +76,10 @@ static void write_debugfs(const char *fmt, ...) ret = vsnprintf(input, INPUT_MAX, fmt, argp); va_end(argp); - if (ret >= INPUT_MAX) { - printf("%s: Debugfs input is too long\n", __func__); - exit(EXIT_FAILURE); - } + if (ret >= INPUT_MAX) + ksft_exit_fail_msg("%s: Debugfs input is too long\n", __func__); - if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) { - perror(SPLIT_DEBUGFS); - exit(EXIT_FAILURE); - } + write_file(SPLIT_DEBUGFS, input, ret + 1); } void split_pmd_thp(void) @@ -95,39 +89,30 @@ void split_pmd_thp(void) size_t i; one_page = memalign(pmd_pagesize, len); - - if (!one_page) { - printf("Fail to allocate memory\n"); - exit(EXIT_FAILURE); - } + if (!one_page) + ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno)); madvise(one_page, len, MADV_HUGEPAGE); for (i = 0; i < len; i++) one_page[i] = (char)i; - if (!check_huge_anon(one_page, 4, pmd_pagesize)) { - printf("No THP is allocated\n"); - exit(EXIT_FAILURE); - } + if (!check_huge_anon(one_page, 4, pmd_pagesize)) + ksft_exit_fail_msg("No THP is allocated\n"); /* split all THPs */ write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, (uint64_t)one_page + len); for (i = 0; i < len; i++) - if (one_page[i] != (char)i) { - printf("%ld byte corrupted\n", i); - exit(EXIT_FAILURE); - } + if (one_page[i] != (char)i) + ksft_exit_fail_msg("%ld byte corrupted\n", i); - if (!check_huge_anon(one_page, 0, pmd_pagesize)) { - printf("Still AnonHugePages not split\n"); - exit(EXIT_FAILURE); - } + if (!check_huge_anon(one_page, 0, pmd_pagesize)) + ksft_exit_fail_msg("Still AnonHugePages not split\n"); - printf("Split huge pages successful\n"); + ksft_test_result_pass("Split huge pages successful\n"); free(one_page); } @@ -143,36 +128,29 @@ void split_pte_mapped_thp(void) int pagemap_fd; int kpageflags_fd; - if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) { - perror("get pagemap proc error"); - exit(EXIT_FAILURE); - } - pagemap_fd = open(pagemap_proc, O_RDONLY); + if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) + ksft_exit_fail_msg("get pagemap proc error: %s\n", strerror(errno)); - if (pagemap_fd == -1) { - perror("read pagemap:"); - exit(EXIT_FAILURE); - } + pagemap_fd = open(pagemap_proc, O_RDONLY); + if (pagemap_fd == -1) + ksft_exit_fail_msg("read pagemap: %s\n", strerror(errno)); kpageflags_fd = open(kpageflags_proc, O_RDONLY); - - if (kpageflags_fd == -1) { - perror("read kpageflags:"); - exit(EXIT_FAILURE); - } + if (kpageflags_fd == -1) + ksft_exit_fail_msg("read kpageflags: %s\n", strerror(errno)); one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (one_page == MAP_FAILED) + ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno)); madvise(one_page, len, MADV_HUGEPAGE); for (i = 0; i < len; i++) one_page[i] = (char)i; - if (!check_huge_anon(one_page, 4, pmd_pagesize)) { - printf("No THP is allocated\n"); - exit(EXIT_FAILURE); - } + if (!check_huge_anon(one_page, 4, pmd_pagesize)) + ksft_exit_fail_msg("No THP is allocated\n"); /* remap the first pagesize of first THP */ pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE); @@ -183,10 +161,8 @@ void split_pte_mapped_thp(void) pagesize, pagesize, MREMAP_MAYMOVE|MREMAP_FIXED, pte_mapped + pagesize * i); - if (pte_mapped2 == (char *)-1) { - perror("mremap failed"); - exit(EXIT_FAILURE); - } + if (pte_mapped2 == MAP_FAILED) + ksft_exit_fail_msg("mremap failed: %s\n", strerror(errno)); } /* smap does not show THPs after mremap, use kpageflags instead */ @@ -196,10 +172,8 @@ void split_pte_mapped_thp(void) is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) thp_size++; - if (thp_size != 4) { - printf("Some THPs are missing during mremap\n"); - exit(EXIT_FAILURE); - } + if (thp_size != 4) + ksft_exit_fail_msg("Some THPs are missing during mremap\n"); /* split all remapped THPs */ write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped, @@ -208,21 +182,18 @@ void split_pte_mapped_thp(void) /* smap does not show THPs after mremap, use kpageflags instead */ thp_size = 0; for (i = 0; i < pagesize * 4; i++) { - if (pte_mapped[i] != (char)i) { - printf("%ld byte corrupted\n", i); - exit(EXIT_FAILURE); - } + if (pte_mapped[i] != (char)i) + ksft_exit_fail_msg("%ld byte corrupted\n", i); + if (i % pagesize == 0 && is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) thp_size++; } - if (thp_size) { - printf("Still %ld THPs not split\n", thp_size); - exit(EXIT_FAILURE); - } + if (thp_size) + ksft_exit_fail_msg("Still %ld THPs not split\n", thp_size); - printf("Split PTE-mapped huge pages successful\n"); + ksft_test_result_pass("Split PTE-mapped huge pages successful\n"); munmap(one_page, len); close(pagemap_fd); close(kpageflags_fd); @@ -238,24 +209,21 @@ void split_file_backed_thp(void) char testfile[INPUT_MAX]; uint64_t pgoff_start = 0, pgoff_end = 1024; - printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n"); + ksft_print_msg("Please enable pr_debug in split_huge_pages_in_file() for more info.\n"); status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m"); - if (status) { - printf("Unable to create a tmpfs for testing\n"); - exit(EXIT_FAILURE); - } + if (status) + ksft_exit_fail_msg("Unable to create a tmpfs for testing\n"); status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc); if (status >= INPUT_MAX) { - printf("Fail to create file-backed THP split testing file\n"); - goto cleanup; + ksft_exit_fail_msg("Fail to create file-backed THP split testing file\n"); } fd = open(testfile, O_CREAT|O_WRONLY); if (fd == -1) { - perror("Cannot open testing file\n"); + ksft_perror("Cannot open testing file"); goto cleanup; } @@ -264,7 +232,7 @@ void split_file_backed_thp(void) close(fd); if (num_written < 1) { - printf("Fail to write data to testing file\n"); + ksft_perror("Fail to write data to testing file"); goto cleanup; } @@ -272,42 +240,51 @@ void split_file_backed_thp(void) write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end); status = unlink(testfile); - if (status) - perror("Cannot remove testing file\n"); + if (status) { + ksft_perror("Cannot remove testing file"); + goto cleanup; + } -cleanup: status = umount(tmpfs_loc); if (status) { - printf("Unable to umount %s\n", tmpfs_loc); - exit(EXIT_FAILURE); - } - status = rmdir(tmpfs_loc); - if (status) { - perror("cannot remove tmp dir"); - exit(EXIT_FAILURE); + rmdir(tmpfs_loc); + ksft_exit_fail_msg("Unable to umount %s\n", tmpfs_loc); } - printf("file-backed THP split test done, please check dmesg for more information\n"); + status = rmdir(tmpfs_loc); + if (status) + ksft_exit_fail_msg("cannot remove tmp dir: %s\n", strerror(errno)); + + ksft_print_msg("Please check dmesg for more information\n"); + ksft_test_result_pass("File-backed THP split test done\n"); + return; + +cleanup: + umount(tmpfs_loc); + rmdir(tmpfs_loc); + ksft_exit_fail_msg("Error occurred\n"); } int main(int argc, char **argv) { + ksft_print_header(); + if (geteuid() != 0) { - printf("Please run the benchmark as root\n"); - exit(EXIT_FAILURE); + ksft_print_msg("Please run the benchmark as root\n"); + ksft_finished(); } + ksft_set_plan(3); + pagesize = getpagesize(); pageshift = ffs(pagesize) - 1; pmd_pagesize = read_pmd_pagesize(); - if (!pmd_pagesize) { - printf("Reading PMD pagesize failed\n"); - exit(EXIT_FAILURE); - } + if (!pmd_pagesize) + ksft_exit_fail_msg("Reading PMD pagesize failed\n"); split_pmd_thp(); split_pte_mapped_thp(); split_file_backed_thp(); - return 0; + ksft_finished(); } From b38bd9b2c448507444634be06bcb036df4b23bb8 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:17 +0500 Subject: [PATCH 113/435] selftests/mm: thuge-gen: conform to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Also remove unneeded logging which isn't enabled. Skip a hugepage size if it has less free pages to avoid unnecessary failures. For examples, some systems may not have 1GB hugepage free. So skip 1GB for testing in this test instead of failing the entire test. Link: https://lkml.kernel.org/r/20240202113119.2047740-11-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/thuge-gen.c | 147 +++++++++++++------------ 1 file changed, 75 insertions(+), 72 deletions(-) diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c index 622987f12c89..ea7fd8fe2876 100644 --- a/tools/testing/selftests/mm/thuge-gen.c +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -4,7 +4,7 @@ Before running this huge pages for each huge page size must have been reserved. For large pages beyond MAX_PAGE_ORDER (like 1GB on x86) boot options must - be used. + be used. 1GB wouldn't be tested if it isn't available. Also shmmax must be increased. And you need to run as root to work around some weird permissions in shm. And nothing using huge pages should run in parallel. @@ -26,8 +26,7 @@ #include #include #include "vm_util.h" - -#define err(x) perror(x), exit(1) +#include "../kselftest.h" #define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) #define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) @@ -44,11 +43,8 @@ #define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) #define NUM_PAGESIZES 5 - #define NUM_PAGES 4 -#define Dprintf(fmt...) // printf(fmt) - unsigned long page_sizes[NUM_PAGESIZES]; int num_page_sizes; @@ -60,28 +56,15 @@ int ilog2(unsigned long v) return l; } -void find_pagesizes(void) -{ - glob_t g; - int i; - glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g); - assert(g.gl_pathc <= NUM_PAGESIZES); - for (i = 0; i < g.gl_pathc; i++) { - sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB", - &page_sizes[i]); - page_sizes[i] <<= 10; - printf("Found %luMB\n", page_sizes[i] >> 20); - } - num_page_sizes = g.gl_pathc; - globfree(&g); -} - void show(unsigned long ps) { char buf[100]; + if (ps == getpagesize()) return; - printf("%luMB: ", ps >> 20); + + ksft_print_msg("%luMB: ", ps >> 20); + fflush(stdout); snprintf(buf, sizeof buf, "cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", @@ -105,7 +88,7 @@ unsigned long read_sysfs(int warn, char *fmt, ...) f = fopen(buf, "r"); if (!f) { if (warn) - printf("missing %s\n", buf); + ksft_print_msg("missing %s\n", buf); return 0; } if (getline(&line, &linelen, f) > 0) { @@ -119,123 +102,143 @@ unsigned long read_sysfs(int warn, char *fmt, ...) unsigned long read_free(unsigned long ps) { return read_sysfs(ps != getpagesize(), - "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", - ps >> 10); + "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", + ps >> 10); } void test_mmap(unsigned long size, unsigned flags) { char *map; unsigned long before, after; - int err; before = read_free(size); map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); - if (map == (char *)-1) err("mmap"); memset(map, 0xff, size*NUM_PAGES); after = read_free(size); - Dprintf("before %lu after %lu diff %ld size %lu\n", - before, after, before - after, size); - assert(size == getpagesize() || (before - after) == NUM_PAGES); + show(size); - err = munmap(map, size * NUM_PAGES); - assert(!err); + ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES, + "%s mmap\n", __func__); + + if (munmap(map, size * NUM_PAGES)) + ksft_exit_fail_msg("%s: unmap %s\n", __func__, strerror(errno)); } void test_shmget(unsigned long size, unsigned flags) { int id; unsigned long before, after; - int err; + struct shm_info i; + char *map; before = read_free(size); id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags); - if (id < 0) err("shmget"); + if (id < 0) { + if (errno == EPERM) { + ksft_test_result_skip("shmget requires root privileges: %s\n", + strerror(errno)); + return; + } + ksft_exit_fail_msg("shmget: %s\n", strerror(errno)); + } - struct shm_info i; - if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl"); - Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss); + if (shmctl(id, SHM_INFO, (void *)&i) < 0) + ksft_exit_fail_msg("shmctl: %s\n", strerror(errno)); - - Dprintf("id %d\n", id); - char *map = shmat(id, NULL, 0600); - if (map == (char*)-1) err("shmat"); + map = shmat(id, NULL, 0600); + if (map == MAP_FAILED) + ksft_exit_fail_msg("shmat: %s\n", strerror(errno)); shmctl(id, IPC_RMID, NULL); memset(map, 0xff, size*NUM_PAGES); after = read_free(size); - Dprintf("before %lu after %lu diff %ld size %lu\n", - before, after, before - after, size); - assert(size == getpagesize() || (before - after) == NUM_PAGES); show(size); - err = shmdt(map); - assert(!err); + ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES, + "%s: mmap\n", __func__); + if (shmdt(map)) + ksft_exit_fail_msg("%s: shmdt: %s\n", __func__, strerror(errno)); } -void sanity_checks(void) +void find_pagesizes(void) { - int i; unsigned long largest = getpagesize(); + int i; + glob_t g; - for (i = 0; i < num_page_sizes; i++) { - if (page_sizes[i] > largest) + glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g); + assert(g.gl_pathc <= NUM_PAGESIZES); + for (i = 0; (i < g.gl_pathc) && (num_page_sizes < NUM_PAGESIZES); i++) { + sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB", + &page_sizes[num_page_sizes]); + page_sizes[num_page_sizes] <<= 10; + ksft_print_msg("Found %luMB\n", page_sizes[i] >> 20); + + if (page_sizes[num_page_sizes] > largest) largest = page_sizes[i]; - if (read_free(page_sizes[i]) < NUM_PAGES) { - printf("Not enough huge pages for page size %lu MB, need %u\n", - page_sizes[i] >> 20, - NUM_PAGES); - exit(0); - } + if (read_free(page_sizes[num_page_sizes]) >= NUM_PAGES) + num_page_sizes++; + else + ksft_print_msg("SKIP for size %lu MB as not enough huge pages, need %u\n", + page_sizes[num_page_sizes] >> 20, NUM_PAGES); } + globfree(&g); - if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) { - printf("Please do echo %lu > /proc/sys/kernel/shmmax", largest * NUM_PAGES); - exit(0); - } + if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) + ksft_exit_fail_msg("Please do echo %lu > /proc/sys/kernel/shmmax", + largest * NUM_PAGES); #if defined(__x86_64__) if (largest != 1U<<30) { - printf("No GB pages available on x86-64\n" - "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES); - exit(0); + ksft_exit_fail_msg("No GB pages available on x86-64\n" + "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES); } #endif } int main(void) { - int i; unsigned default_hps = default_huge_page_size(); + int i; + + ksft_print_header(); find_pagesizes(); - sanity_checks(); + if (!num_page_sizes) + ksft_finished(); + + ksft_set_plan(2 * num_page_sizes + 3); for (i = 0; i < num_page_sizes; i++) { unsigned long ps = page_sizes[i]; int arg = ilog2(ps) << MAP_HUGE_SHIFT; - printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg); + + ksft_print_msg("Testing %luMB mmap with shift %x\n", ps >> 20, arg); test_mmap(ps, MAP_HUGETLB | arg); } - printf("Testing default huge mmap\n"); + + ksft_print_msg("Testing default huge mmap\n"); test_mmap(default_hps, MAP_HUGETLB); - puts("Testing non-huge shmget"); + ksft_print_msg("Testing non-huge shmget\n"); test_shmget(getpagesize(), 0); for (i = 0; i < num_page_sizes; i++) { unsigned long ps = page_sizes[i]; int arg = ilog2(ps) << SHM_HUGE_SHIFT; - printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg); + ksft_print_msg("Testing %luMB shmget with shift %x\n", ps >> 20, arg); test_shmget(ps, SHM_HUGETLB | arg); } - puts("default huge shmget"); + + ksft_print_msg("default huge shmget\n"); test_shmget(default_hps, SHM_HUGETLB); - return 0; + ksft_finished(); } From c811b0ce1263bb3f6ade8f1564765f69f7e7577b Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:18 +0500 Subject: [PATCH 114/435] selftests/mm: transhuge-stress: conform to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Link: https://lkml.kernel.org/r/20240202113119.2047740-12-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/transhuge-stress.c | 36 +++++++++++-------- tools/testing/selftests/mm/vm_util.c | 6 ++-- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/mm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c index c61fb9350b8c..68201192e37c 100644 --- a/tools/testing/selftests/mm/transhuge-stress.c +++ b/tools/testing/selftests/mm/transhuge-stress.c @@ -16,6 +16,7 @@ #include #include #include "vm_util.h" +#include "../kselftest.h" int backing_fd = -1; int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE; @@ -34,6 +35,8 @@ int main(int argc, char **argv) int pagemap_fd; int duration = 0; + ksft_print_header(); + ram = sysconf(_SC_PHYS_PAGES); if (ram > SIZE_MAX / psize() / 4) ram = SIZE_MAX / 4; @@ -43,7 +46,8 @@ int main(int argc, char **argv) while (++i < argc) { if (!strcmp(argv[i], "-h")) - errx(1, "usage: %s [-f ] [-d ] [size in MiB]", argv[0]); + ksft_exit_fail_msg("usage: %s [-f ] [-d ] [size in MiB]\n", + argv[0]); else if (!strcmp(argv[i], "-f")) name = argv[++i]; else if (!strcmp(argv[i], "-d")) @@ -52,10 +56,12 @@ int main(int argc, char **argv) len = atoll(argv[i]) << 20; } + ksft_set_plan(1); + if (name) { backing_fd = open(name, O_RDWR); if (backing_fd == -1) - errx(2, "open %s", name); + ksft_exit_fail_msg("open %s\n", name); mmap_flags = MAP_SHARED; } @@ -65,21 +71,21 @@ int main(int argc, char **argv) pagemap_fd = open("/proc/self/pagemap", O_RDONLY); if (pagemap_fd < 0) - err(2, "open pagemap"); + ksft_exit_fail_msg("open pagemap\n"); len -= len % HPAGE_SIZE; ptr = mmap(NULL, len + HPAGE_SIZE, PROT_RW, mmap_flags, backing_fd, 0); if (ptr == MAP_FAILED) - err(2, "initial mmap"); + ksft_exit_fail_msg("initial mmap"); ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE; if (madvise(ptr, len, MADV_HUGEPAGE)) - err(2, "MADV_HUGEPAGE"); + ksft_exit_fail_msg("MADV_HUGEPAGE"); map_len = ram >> (HPAGE_SHIFT - 1); map = malloc(map_len); if (!map) - errx(2, "map malloc"); + ksft_exit_fail_msg("map malloc\n"); clock_gettime(CLOCK_MONOTONIC, &start); @@ -103,7 +109,7 @@ int main(int argc, char **argv) if (idx >= map_len) { map = realloc(map, idx + 1); if (!map) - errx(2, "map realloc"); + ksft_exit_fail_msg("map realloc\n"); memset(map + map_len, 0, idx + 1 - map_len); map_len = idx + 1; } @@ -114,17 +120,19 @@ int main(int argc, char **argv) /* split transhuge page, keep last page */ if (madvise(p, HPAGE_SIZE - psize(), MADV_DONTNEED)) - err(2, "MADV_DONTNEED"); + ksft_exit_fail_msg("MADV_DONTNEED"); } clock_gettime(CLOCK_MONOTONIC, &b); s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.; - warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t" - "%4d succeed, %4d failed, %4d different pages", - s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20), - nr_succeed, nr_failed, nr_pages); + ksft_print_msg("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t" + "%4d succeed, %4d failed, %4d different pages\n", + s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20), + nr_succeed, nr_failed, nr_pages); - if (duration > 0 && b.tv_sec - start.tv_sec >= duration) - return 0; + if (duration > 0 && b.tv_sec - start.tv_sec >= duration) { + ksft_test_result_pass("Completed\n"); + ksft_finished(); + } } } diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 05736c615734..5a62530da3b5 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -232,17 +232,17 @@ int64_t allocate_transhuge(void *ptr, int pagemap_fd) if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr) - errx(2, "mmap transhuge"); + ksft_exit_fail_msg("mmap transhuge\n"); if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) - err(2, "MADV_HUGEPAGE"); + ksft_exit_fail_msg("MADV_HUGEPAGE\n"); /* allocate transparent huge page */ *(volatile void **)ptr = ptr; if (pread(pagemap_fd, ent, sizeof(ent), (uintptr_t)ptr >> (pshift() - 3)) != sizeof(ent)) - err(2, "read pagemap"); + ksft_exit_fail_msg("read pagemap\n"); if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) && PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) && From d1d86ce28d0f1974e1038d6927b657206529a01b Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 2 Feb 2024 16:31:19 +0500 Subject: [PATCH 115/435] selftests/mm: virtual_address_range: conform to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Link: https://lkml.kernel.org/r/20240202113119.2047740-13-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/mm/virtual_address_range.c | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c index bae0ceaf95b1..7bcf8d48256a 100644 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -12,6 +12,7 @@ #include #include #include +#include "../kselftest.h" /* * Maximum address range mapped with a single mmap() @@ -68,23 +69,15 @@ static char *hind_addr(void) return (char *) (1UL << bits); } -static int validate_addr(char *ptr, int high_addr) +static void validate_addr(char *ptr, int high_addr) { unsigned long addr = (unsigned long) ptr; - if (high_addr) { - if (addr < HIGH_ADDR_MARK) { - printf("Bad address %lx\n", addr); - return 1; - } - return 0; - } + if (high_addr && addr < HIGH_ADDR_MARK) + ksft_exit_fail_msg("Bad address %lx\n", addr); - if (addr > HIGH_ADDR_MARK) { - printf("Bad address %lx\n", addr); - return 1; - } - return 0; + if (addr > HIGH_ADDR_MARK) + ksft_exit_fail_msg("Bad address %lx\n", addr); } static int validate_lower_address_hint(void) @@ -107,23 +100,29 @@ int main(int argc, char *argv[]) char *hint; unsigned long i, lchunks, hchunks; + ksft_print_header(); + ksft_set_plan(1); + for (i = 0; i < NR_CHUNKS_LOW; i++) { ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (ptr[i] == MAP_FAILED) { - if (validate_lower_address_hint()) - return 1; + if (validate_lower_address_hint()) { + ksft_test_result_skip("Memory constraint not fulfilled\n"); + ksft_finished(); + } break; } - if (validate_addr(ptr[i], 0)) - return 1; + validate_addr(ptr[i], 0); } lchunks = i; hptr = (char **) calloc(NR_CHUNKS_HIGH, sizeof(char *)); - if (hptr == NULL) - return 1; + if (hptr == NULL) { + ksft_test_result_skip("Memory constraint not fulfilled\n"); + ksft_finished(); + } for (i = 0; i < NR_CHUNKS_HIGH; i++) { hint = hind_addr(); @@ -133,8 +132,7 @@ int main(int argc, char *argv[]) if (hptr[i] == MAP_FAILED) break; - if (validate_addr(hptr[i], 1)) - return 1; + validate_addr(hptr[i], 1); } hchunks = i; @@ -145,5 +143,7 @@ int main(int argc, char *argv[]) munmap(hptr[i], MAP_CHUNK_SIZE); free(hptr); - return 0; + + ksft_test_result_pass("Test\n"); + ksft_finished(); } From ab755bf4249b992fc2140d615ab0a686d50765b4 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 20 Feb 2024 14:16:31 +0800 Subject: [PATCH 116/435] mm: compaction: update the cc->nr_migratepages when allocating or freeing the freepages Currently we will use 'cc->nr_freepages >= cc->nr_migratepages' comparison to ensure that enough freepages are isolated in isolate_freepages(), however it just decreases the cc->nr_freepages without updating cc->nr_migratepages in compaction_alloc(), which will waste more CPU cycles and cause too many freepages to be isolated. So we should also update the cc->nr_migratepages when allocating or freeing the freepages to avoid isolating excess freepages. And I can see fewer free pages are scanned and isolated when running thpcompact on my Arm64 server: k6.7 k6.7_patched Ops Compaction pages isolated 120692036.00 118160797.00 Ops Compaction migrate scanned 131210329.00 154093268.00 Ops Compaction free scanned 1090587971.00 1080632536.00 Ops Compact scan efficiency 12.03 14.26 Moreover, I did not see an obvious latency improvements, this is likely because isolating freepages is not the bottleneck in the thpcompact test case. k6.7 k6.7_patched Amean fault-both-1 1089.76 ( 0.00%) 1080.16 * 0.88%* Amean fault-both-3 1616.48 ( 0.00%) 1636.65 * -1.25%* Amean fault-both-5 2266.66 ( 0.00%) 2219.20 * 2.09%* Amean fault-both-7 2909.84 ( 0.00%) 2801.90 * 3.71%* Amean fault-both-12 4861.26 ( 0.00%) 4733.25 * 2.63%* Amean fault-both-18 7351.11 ( 0.00%) 6950.51 * 5.45%* Amean fault-both-24 9059.30 ( 0.00%) 9159.99 * -1.11%* Amean fault-both-30 10685.68 ( 0.00%) 11399.02 * -6.68%* Link: https://lkml.kernel.org/r/6440493f18da82298152b6305d6b41c2962a3ce6.1708409245.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: Mel Gorman Reviewed-by: Vlastimil Babka Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Zi Yan Signed-off-by: Andrew Morton --- include/trace/events/compaction.h | 6 +++--- mm/compaction.c | 12 ++++++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 2b2a975efd20..d05759d18538 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -78,10 +78,10 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_fast_isolate_freepage #ifdef CONFIG_COMPACTION TRACE_EVENT(mm_compaction_migratepages, - TP_PROTO(struct compact_control *cc, + TP_PROTO(unsigned int nr_migratepages, unsigned int nr_succeeded), - TP_ARGS(cc, nr_succeeded), + TP_ARGS(nr_migratepages, nr_succeeded), TP_STRUCT__entry( __field(unsigned long, nr_migrated) @@ -90,7 +90,7 @@ TRACE_EVENT(mm_compaction_migratepages, TP_fast_assign( __entry->nr_migrated = nr_succeeded; - __entry->nr_failed = cc->nr_migratepages - nr_succeeded; + __entry->nr_failed = nr_migratepages - nr_succeeded; ), TP_printk("nr_migrated=%lu nr_failed=%lu", diff --git a/mm/compaction.c b/mm/compaction.c index 4add68d40e8d..0ea5519e2833 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1796,6 +1796,7 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data) dst = list_entry(cc->freepages.next, struct folio, lru); list_del(&dst->lru); cc->nr_freepages--; + cc->nr_migratepages--; return dst; } @@ -1811,6 +1812,7 @@ static void compaction_free(struct folio *dst, unsigned long data) list_add(&dst->lru, &cc->freepages); cc->nr_freepages++; + cc->nr_migratepages++; } /* possible outcome of isolate_migratepages */ @@ -2433,7 +2435,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) unsigned long last_migrated_pfn; const bool sync = cc->mode != MIGRATE_ASYNC; bool update_cached; - unsigned int nr_succeeded = 0; + unsigned int nr_succeeded = 0, nr_migratepages; /* * These counters track activities during zone compaction. Initialize @@ -2551,11 +2553,17 @@ rescan: pageblock_start_pfn(cc->migrate_pfn - 1)); } + /* + * Record the number of pages to migrate since the + * compaction_alloc/free() will update cc->nr_migratepages + * properly. + */ + nr_migratepages = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, MR_COMPACTION, &nr_succeeded); - trace_mm_compaction_migratepages(cc, nr_succeeded); + trace_mm_compaction_migratepages(nr_migratepages, nr_succeeded); /* All pages were either migrated or will be released */ cc->nr_migratepages = 0; From e321d7c934774b7c7e778fe4433d1beb38a1744a Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Wed, 31 Jan 2024 18:38:02 +0800 Subject: [PATCH 117/435] mm/vmscan: change the type of file from int to bool Change the type of file from int to bool because is_file_lru return bool Link: https://lkml.kernel.org/r/20240131103802.122920-1-gehao@kylinos.cn Signed-off-by: Hao Ge Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1f139830b26f..8e52f8795d20 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1998,7 +1998,7 @@ static void shrink_active_list(unsigned long nr_to_scan, LIST_HEAD(l_inactive); unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; - int file = is_file_lru(lru); + bool file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); lru_add_drain(); @@ -2412,7 +2412,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, denominator = ap + fp; out: for_each_evictable_lru(lru) { - int file = is_file_lru(lru); + bool file = is_file_lru(lru); unsigned long lruvec_size; unsigned long low, min; unsigned long scan; From 6e8f588708971e0626f5be808e8c4b6cdb86eb0b Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 29 Jan 2024 13:46:35 +0100 Subject: [PATCH 118/435] arm64/mm: make set_ptes() robust when OAs cross 48-bit boundary Patch series "mm/memory: optimize fork() with PTE-mapped THP", v3. Now that the rmap overhaul[1] is upstream that provides a clean interface for rmap batching, let's implement PTE batching during fork when processing PTE-mapped THPs. This series is partially based on Ryan's previous work[2] to implement cont-pte support on arm64, but its a complete rewrite based on [1] to optimize all architectures independent of any such PTE bits, and to use the new rmap batching functions that simplify the code and prepare for further rmap accounting changes. We collect consecutive PTEs that map consecutive pages of the same large folio, making sure that the other PTE bits are compatible, and (a) adjust the refcount only once per batch, (b) call rmap handling functions only once per batch and (c) perform batch PTE setting/updates. While this series should be beneficial for adding cont-pte support on ARM64[2], it's one of the requirements for maintaining a total mapcount[3] for large folios with minimal added overhead and further changes[4] that build up on top of the total mapcount. Independent of all that, this series results in a speedup during fork with PTE-mapped THP, which is the default with THPs that are smaller than a PMD (for example, 16KiB to 1024KiB mTHPs for anonymous memory[5]). On an Intel Xeon Silver 4210R CPU, fork'ing with 1GiB of PTE-mapped folios of the same size (stddev < 1%) results in the following runtimes for fork() (shorter is better): Folio Size | v6.8-rc1 | New | Change ------------------------------------------ 4KiB | 0.014328 | 0.014035 | - 2% 16KiB | 0.014263 | 0.01196 | -16% 32KiB | 0.014334 | 0.01094 | -24% 64KiB | 0.014046 | 0.010444 | -26% 128KiB | 0.014011 | 0.010063 | -28% 256KiB | 0.013993 | 0.009938 | -29% 512KiB | 0.013983 | 0.00985 | -30% 1024KiB | 0.013986 | 0.00982 | -30% 2048KiB | 0.014305 | 0.010076 | -30% Note that these numbers are even better than the ones from v1 (verified over multiple reboots), even though there were only minimal code changes. Well, I removed a pte_mkclean() call for anon folios, maybe that also plays a role. But my experience is that fork() is extremely sensitive to code size, inlining, ... so I suspect we'll see on other architectures rather a change of -20% instead of -30%, and it will be easy to "lose" some of that speedup in the future by subtle code changes. Next up is PTE batching when unmapping. Only tested on x86-64. Compile-tested on most other architectures. [1] https://lkml.kernel.org/r/20231220224504.646757-1-david@redhat.com [2] https://lkml.kernel.org/r/20231218105100.172635-1-ryan.roberts@arm.com [3] https://lkml.kernel.org/r/20230809083256.699513-1-david@redhat.com [4] https://lkml.kernel.org/r/20231124132626.235350-1-david@redhat.com [5] https://lkml.kernel.org/r/20231207161211.2374093-1-ryan.roberts@arm.com This patch (of 15): Since the high bits [51:48] of an OA are not stored contiguously in the PTE, there is a theoretical bug in set_ptes(), which just adds PAGE_SIZE to the pte to get the pte with the next pfn. This works until the pfn crosses the 48-bit boundary, at which point we overflow into the upper attributes. Of course one could argue (and Matthew Wilcox has :) that we will never see a folio cross this boundary because we only allow naturally aligned power-of-2 allocation, so this would require a half-petabyte folio. So its only a theoretical bug. But its better that the code is robust regardless. I've implemented pte_next_pfn() as part of the fix, which is an opt-in core-mm interface. So that is now available to the core-mm, which will be needed shortly to support forthcoming fork()-batching optimizations. Link: https://lkml.kernel.org/r/20240129124649.189745-1-david@redhat.com Link: https://lkml.kernel.org/r/20240125173534.1659317-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240129124649.189745-2-david@redhat.com Fixes: 4a169d61c2ed ("arm64: implement the new page table range API") Closes: https://lore.kernel.org/linux-mm/fdaeb9a5-d890-499a-92c8-d171df43ad01@arm.com/ Signed-off-by: Ryan Roberts Signed-off-by: David Hildenbrand Reviewed-by: Catalin Marinas Reviewed-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Alexandre Ghiti Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 79ce70fbb751..52d0b0a763f1 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -341,6 +341,22 @@ static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages) mte_sync_tags(pte, nr_pages); } +/* + * Select all bits except the pfn + */ +static inline pgprot_t pte_pgprot(pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + + return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); +} + +#define pte_next_pfn pte_next_pfn +static inline pte_t pte_next_pfn(pte_t pte) +{ + return pfn_pte(pte_pfn(pte) + 1, pte_pgprot(pte)); +} + static inline void set_ptes(struct mm_struct *mm, unsigned long __always_unused addr, pte_t *ptep, pte_t pte, unsigned int nr) @@ -354,7 +370,7 @@ static inline void set_ptes(struct mm_struct *mm, if (--nr == 0) break; ptep++; - pte_val(pte) += PAGE_SIZE; + pte = pte_next_pfn(pte); } } #define set_ptes set_ptes @@ -433,16 +449,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return clear_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE)); } -/* - * Select all bits except the pfn - */ -static inline pgprot_t pte_pgprot(pte_t pte) -{ - unsigned long pfn = pte_pfn(pte); - - return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); -} - #ifdef CONFIG_NUMA_BALANCING /* * See the comment in include/linux/pgtable.h From 12b884f2e09ab42d3879a3e2c703e7157691013c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:36 +0100 Subject: [PATCH 119/435] arm/pgtable: define PFN_PTE_SHIFT We want to make use of pte_next_pfn() outside of set_ptes(). Let's simply define PFN_PTE_SHIFT, required by pte_next_pfn(). Link: https://lkml.kernel.org/r/20240129124649.189745-3-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index d657b84b6bf7..be91e376df79 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -209,6 +209,8 @@ static inline void __sync_icache_dcache(pte_t pteval) extern void __sync_icache_dcache(pte_t pteval); #endif +#define PFN_PTE_SHIFT PAGE_SHIFT + void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval, unsigned int nr); #define set_ptes set_ptes From 3a6a6c3fbda8f50fc9f0e5fede8a0f70abdea033 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:37 +0100 Subject: [PATCH 120/435] nios2/pgtable: define PFN_PTE_SHIFT We want to make use of pte_next_pfn() outside of set_ptes(). Let's simply define PFN_PTE_SHIFT, required by pte_next_pfn(). Link: https://lkml.kernel.org/r/20240129124649.189745-4-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/nios2/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 5144506dfa69..d052dfcbe8d3 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -178,6 +178,8 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) *ptep = pteval; } +#define PFN_PTE_SHIFT 0 + static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { From f7dc4d689e6fafe3d8424f600b924f2d59d1a3cf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:38 +0100 Subject: [PATCH 121/435] powerpc/pgtable: define PFN_PTE_SHIFT We want to make use of pte_next_pfn() outside of set_ptes(). Let's simply define PFN_PTE_SHIFT, required by pte_next_pfn(). Link: https://lkml.kernel.org/r/20240129124649.189745-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Christophe Leroy Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 9224f23065ff..7a1ba8889aea 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -41,6 +41,8 @@ struct mm_struct; #ifndef __ASSEMBLY__ +#define PFN_PTE_SHIFT PTE_RPN_SHIFT + void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr); #define set_ptes set_ptes From 57c254b2fb31f0160829f4bf1cb993a9e9c302a8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:39 +0100 Subject: [PATCH 122/435] riscv/pgtable: define PFN_PTE_SHIFT We want to make use of pte_next_pfn() outside of set_ptes(). Let's simply define PFN_PTE_SHIFT, required by pte_next_pfn(). Link: https://lkml.kernel.org/r/20240129124649.189745-6-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Alexandre Ghiti Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/riscv/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 0c94260b5d0c..add5cd30ab34 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -523,6 +523,8 @@ static inline void __set_pte_at(pte_t *ptep, pte_t pteval) set_pte(ptep, pteval); } +#define PFN_PTE_SHIFT _PAGE_PFN_SHIFT + static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval, unsigned int nr) { From 4555ac8b3c16f67f74c04ff71ce8c4a8fcee973a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:40 +0100 Subject: [PATCH 123/435] s390/pgtable: define PFN_PTE_SHIFT We want to make use of pte_next_pfn() outside of set_ptes(). Let's simply define PFN_PTE_SHIFT, required by pte_next_pfn(). Link: https://lkml.kernel.org/r/20240129124649.189745-7-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/s390/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 1299b56e43f6..4b91e65c85d9 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1316,6 +1316,8 @@ pgprot_t pgprot_writecombine(pgprot_t prot); #define pgprot_writethrough pgprot_writethrough pgprot_t pgprot_writethrough(pgprot_t prot); +#define PFN_PTE_SHIFT PAGE_SHIFT + /* * Set multiple PTEs to consecutive pages with a single call. All PTEs * are within the same folio, PMD and VMA. From ce7a9de353da053e55a68e2441196114547e38d0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:41 +0100 Subject: [PATCH 124/435] sparc/pgtable: define PFN_PTE_SHIFT We want to make use of pte_next_pfn() outside of set_ptes(). Let's simply define PFN_PTE_SHIFT, required by pte_next_pfn(). Link: https://lkml.kernel.org/r/20240129124649.189745-8-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/sparc/include/asm/pgtable_64.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index a8c871b7d786..652af9d63fa2 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -929,6 +929,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, maybe_tlb_batch_add(mm, addr, ptep, orig, fullmm, PAGE_SHIFT); } +#define PFN_PTE_SHIFT PAGE_SHIFT + static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { From 6cdfa1d5d5d8285108495c33588c48cdda81b647 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:42 +0100 Subject: [PATCH 125/435] mm/pgtable: make pte_next_pfn() independent of set_ptes() Let's provide pte_next_pfn(), independently of set_ptes(). This allows for using the generic pte_next_pfn() version in some arch-specific set_ptes() implementations, and prepares for reusing pte_next_pfn() in other context. Link: https://lkml.kernel.org/r/20240129124649.189745-9-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Christophe Leroy Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index f6d0e3513948..351cd9dc7194 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,7 +212,6 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif -#ifndef set_ptes #ifndef pte_next_pfn static inline pte_t pte_next_pfn(pte_t pte) @@ -221,6 +220,7 @@ static inline pte_t pte_next_pfn(pte_t pte) } #endif +#ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. * @mm: Address space to map the pages into. From e5ea320aec811c0e5cddefda17052579e0306415 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:43 +0100 Subject: [PATCH 126/435] arm/mm: use pte_next_pfn() in set_ptes() Let's use our handy helper now that it's available on all archs. Link: https://lkml.kernel.org/r/20240129124649.189745-10-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm/mm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 674ed71573a8..c24e29c0b9a4 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -1814,6 +1814,6 @@ void set_ptes(struct mm_struct *mm, unsigned long addr, if (--nr == 0) break; ptep++; - pte_val(pteval) += PAGE_SIZE; + pteval = pte_next_pfn(pteval); } } From 802cc2ab33b0d8a013c216ca7f4caa9034bfc257 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:44 +0100 Subject: [PATCH 127/435] powerpc/mm: use pte_next_pfn() in set_ptes() Let's use our handy new helper. Note that the implementation is slightly different, but shouldn't really make a difference in practice. Link: https://lkml.kernel.org/r/20240129124649.189745-11-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Christophe Leroy Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/mm/pgtable.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index a04ae4449a02..549a440ed7f6 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -220,10 +220,7 @@ void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, break; ptep++; addr += PAGE_SIZE; - /* - * increment the pfn. - */ - pte = pfn_pte(pte_pfn(pte) + 1, pte_pgprot((pte))); + pte = pte_next_pfn(pte); } } From 23ed190868a65525b8941370630fbb215f12ebe8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:45 +0100 Subject: [PATCH 128/435] mm/memory: factor out copying the actual PTE in copy_present_pte() Let's prepare for further changes. Link: https://lkml.kernel.org/r/20240129124649.189745-12-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/memory.c | 63 ++++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 00f3f4fbd131..bef6fd925d04 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -930,6 +930,29 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma return 0; } +static inline void __copy_present_pte(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, + pte_t pte, unsigned long addr) +{ + struct mm_struct *src_mm = src_vma->vm_mm; + + /* If it's a COW mapping, write protect it both processes. */ + if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) { + ptep_set_wrprotect(src_mm, addr, src_pte); + pte = pte_wrprotect(pte); + } + + /* If it's a shared mapping, mark it clean in the child. */ + if (src_vma->vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + + if (!userfaultfd_wp(dst_vma)) + pte = pte_clear_uffd_wp(pte); + + set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); +} + /* * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page * is required to copy this pte. @@ -939,23 +962,23 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, struct folio **prealloc) { - struct mm_struct *src_mm = src_vma->vm_mm; - unsigned long vm_flags = src_vma->vm_flags; pte_t pte = ptep_get(src_pte); struct page *page; struct folio *folio; page = vm_normal_page(src_vma, addr, pte); - if (page) - folio = page_folio(page); - if (page && folio_test_anon(folio)) { + if (unlikely(!page)) + goto copy_pte; + + folio = page_folio(page); + folio_get(folio); + if (folio_test_anon(folio)) { /* * If this page may have been pinned by the parent process, * copy the page immediately for the child so that we'll always * guarantee the pinned page won't be randomly replaced in the * future. */ - folio_get(folio); if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) { /* Page may be pinned, we have to copy. */ folio_put(folio); @@ -963,34 +986,14 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, addr, rss, prealloc, page); } rss[MM_ANONPAGES]++; - } else if (page) { - folio_get(folio); + VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); + } else { folio_dup_file_rmap_pte(folio, page); rss[mm_counter_file(folio)]++; } - /* - * If it's a COW mapping, write protect it both - * in the parent and the child - */ - if (is_cow_mapping(vm_flags) && pte_write(pte)) { - ptep_set_wrprotect(src_mm, addr, src_pte); - pte = pte_wrprotect(pte); - } - VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page)); - - /* - * If it's a shared mapping, mark it clean in - * the child - */ - if (vm_flags & VM_SHARED) - pte = pte_mkclean(pte); - pte = pte_mkold(pte); - - if (!userfaultfd_wp(dst_vma)) - pte = pte_clear_uffd_wp(pte); - - set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); +copy_pte: + __copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, pte, addr); return 0; } From 53723298ba436830fdf0744c19b57b2a18f44041 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:46 +0100 Subject: [PATCH 129/435] mm/memory: pass PTE to copy_present_pte() We already read it, let's just forward it. This patch is based on work by Ryan Roberts. [david@redhat.com: fix the hmm "exclusive_cow" selftest] Link: https://lkml.kernel.org/r/13f296b8-e882-47fd-b939-c2141dc28717@redhat.com Link: https://lkml.kernel.org/r/20240129124649.189745-13-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/memory.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index bef6fd925d04..ca888431680c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -959,10 +959,9 @@ static inline void __copy_present_pte(struct vm_area_struct *dst_vma, */ static inline int copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, - pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, - struct folio **prealloc) + pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, + int *rss, struct folio **prealloc) { - pte_t pte = ptep_get(src_pte); struct page *page; struct folio *folio; @@ -1094,6 +1093,8 @@ again: progress += 8; continue; } + ptent = ptep_get(src_pte); + VM_WARN_ON_ONCE(!pte_present(ptent)); /* * Device exclusive entry restored, continue by copying @@ -1103,7 +1104,7 @@ again: } /* copy_present_pte() will clear `*prealloc' if consumed */ ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, - addr, rss, &prealloc); + ptent, addr, rss, &prealloc); /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. From f8d937761d65c87e9987b88ea7beb7bddc333a0e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:47 +0100 Subject: [PATCH 130/435] mm/memory: optimize fork() with PTE-mapped THP Let's implement PTE batching when consecutive (present) PTEs map consecutive pages of the same large folio, and all other PTE bits besides the PFNs are equal. We will optimize folio_pte_batch() separately, to ignore selected PTE bits. This patch is based on work by Ryan Roberts. Use __always_inline for __copy_present_ptes() and keep the handling for single PTEs completely separate from the multi-PTE case: we really want the compiler to optimize for the single-PTE case with small folios, to not degrade performance. Note that PTE batching will never exceed a single page table and will always stay within VMA boundaries. Further, processing PTE-mapped THP that maybe pinned and have PageAnonExclusive set on at least one subpage should work as expected, but there is room for improvement: We will repeatedly (1) detect a PTE batch (2) detect that we have to copy a page (3) fall back and allocate a single page to copy a single page. For now we won't care as pinned pages are a corner case, and we should rather look into maintaining only a single PageAnonExclusive bit for large folios. Link: https://lkml.kernel.org/r/20240129124649.189745-14-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 31 +++++++++++ mm/memory.c | 112 +++++++++++++++++++++++++++++++++------- 2 files changed, 124 insertions(+), 19 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 351cd9dc7194..aab227e12493 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -650,6 +650,37 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } #endif +#ifndef wrprotect_ptes +/** + * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same + * folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to write-protect. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_set_wrprotect(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + for (;;) { + ptep_set_wrprotect(mm, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} +#endif + /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. It brings diff --git a/mm/memory.c b/mm/memory.c index ca888431680c..a7eb2301a1d1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -930,15 +930,15 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma return 0; } -static inline void __copy_present_pte(struct vm_area_struct *dst_vma, +static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, - pte_t pte, unsigned long addr) + pte_t pte, unsigned long addr, int nr) { struct mm_struct *src_mm = src_vma->vm_mm; /* If it's a COW mapping, write protect it both processes. */ if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) { - ptep_set_wrprotect(src_mm, addr, src_pte); + wrprotect_ptes(src_mm, addr, src_pte, nr); pte = pte_wrprotect(pte); } @@ -950,26 +950,93 @@ static inline void __copy_present_pte(struct vm_area_struct *dst_vma, if (!userfaultfd_wp(dst_vma)) pte = pte_clear_uffd_wp(pte); - set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); + set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr); } /* - * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page - * is required to copy this pte. + * Detect a PTE batch: consecutive (present) PTEs that map consecutive + * pages of the same folio. + * + * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN. + */ +static inline int folio_pte_batch(struct folio *folio, unsigned long addr, + pte_t *start_ptep, pte_t pte, int max_nr) +{ + unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); + const pte_t *end_ptep = start_ptep + max_nr; + pte_t expected_pte = pte_next_pfn(pte); + pte_t *ptep = start_ptep + 1; + + VM_WARN_ON_FOLIO(!pte_present(pte), folio); + + while (ptep != end_ptep) { + pte = ptep_get(ptep); + + if (!pte_same(pte, expected_pte)) + break; + + /* + * Stop immediately once we reached the end of the folio. In + * corner cases the next PFN might fall into a different + * folio. + */ + if (pte_pfn(pte) == folio_end_pfn) + break; + + expected_pte = pte_next_pfn(expected_pte); + ptep++; + } + + return ptep - start_ptep; +} + +/* + * Copy one present PTE, trying to batch-process subsequent PTEs that map + * consecutive pages of the same folio by copying them as well. + * + * Returns -EAGAIN if one preallocated page is required to copy the next PTE. + * Otherwise, returns the number of copied PTEs (at least 1). */ static inline int -copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, +copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, - int *rss, struct folio **prealloc) + int max_nr, int *rss, struct folio **prealloc) { struct page *page; struct folio *folio; + int err, nr; page = vm_normal_page(src_vma, addr, pte); if (unlikely(!page)) goto copy_pte; folio = page_folio(page); + + /* + * If we likely have to copy, just don't bother with batching. Make + * sure that the common "small folio" case is as fast as possible + * by keeping the batching logic separate. + */ + if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { + nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr); + folio_ref_add(folio, nr); + if (folio_test_anon(folio)) { + if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, + nr, src_vma))) { + folio_ref_sub(folio, nr); + return -EAGAIN; + } + rss[MM_ANONPAGES] += nr; + VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); + } else { + folio_dup_file_rmap_ptes(folio, page, nr); + rss[mm_counter_file(folio)] += nr; + } + __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, + addr, nr); + return nr; + } + folio_get(folio); if (folio_test_anon(folio)) { /* @@ -981,8 +1048,9 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) { /* Page may be pinned, we have to copy. */ folio_put(folio); - return copy_present_page(dst_vma, src_vma, dst_pte, src_pte, - addr, rss, prealloc, page); + err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, + addr, rss, prealloc, page); + return err ? err : 1; } rss[MM_ANONPAGES]++; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); @@ -992,8 +1060,8 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, } copy_pte: - __copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, pte, addr); - return 0; + __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1); + return 1; } static inline struct folio *folio_prealloc(struct mm_struct *src_mm, @@ -1030,10 +1098,11 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *src_pte, *dst_pte; pte_t ptent; spinlock_t *src_ptl, *dst_ptl; - int progress, ret = 0; + int progress, max_nr, ret = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; struct folio *prealloc = NULL; + int nr; again: progress = 0; @@ -1064,6 +1133,8 @@ again: arch_enter_lazy_mmu_mode(); do { + nr = 1; + /* * We are holding two locks at this point - either of them * could generate latencies in another task on another CPU. @@ -1102,9 +1173,10 @@ again: */ WARN_ON_ONCE(ret != -ENOENT); } - /* copy_present_pte() will clear `*prealloc' if consumed */ - ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, - ptent, addr, rss, &prealloc); + /* copy_present_ptes() will clear `*prealloc' if consumed */ + max_nr = (end - addr) / PAGE_SIZE; + ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, + ptent, addr, max_nr, rss, &prealloc); /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. @@ -1121,8 +1193,10 @@ again: folio_put(prealloc); prealloc = NULL; } - progress += 8; - } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + nr = ret; + progress += 8 * nr; + } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr, + addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(orig_src_pte, src_ptl); @@ -1143,7 +1217,7 @@ again: prealloc = folio_prealloc(src_mm, src_vma, addr, false); if (!prealloc) return -ENOMEM; - } else if (ret) { + } else if (ret < 0) { VM_WARN_ON_ONCE(1); } From 25365e10699aa0e320345d019194fbea9f37a4ae Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:48 +0100 Subject: [PATCH 131/435] mm/memory: ignore dirty/accessed/soft-dirty bits in folio_pte_batch() Let's always ignore the accessed/young bit: we'll always mark the PTE as old in our child process during fork, and upcoming users will similarly not care. Ignore the dirty bit only if we don't want to duplicate the dirty bit into the child process during fork. Maybe, we could just set all PTEs in the child dirty if any PTE is dirty. For now, let's keep the behavior unchanged, this can be optimized later if required. Ignore the soft-dirty bit only if the bit doesn't have any meaning in the src vma, and similarly won't have any in the copied dst vma. For now, we won't bother with the uffd-wp bit. Link: https://lkml.kernel.org/r/20240129124649.189745-15-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/memory.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index a7eb2301a1d1..ec5741f37c77 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -953,24 +953,44 @@ static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma, set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr); } +/* Flags for folio_pte_batch(). */ +typedef int __bitwise fpb_t; + +/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */ +#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0)) + +/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */ +#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1)) + +static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) +{ + if (flags & FPB_IGNORE_DIRTY) + pte = pte_mkclean(pte); + if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) + pte = pte_clear_soft_dirty(pte); + return pte_mkold(pte); +} + /* * Detect a PTE batch: consecutive (present) PTEs that map consecutive * pages of the same folio. * - * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN. + * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, + * the accessed bit, dirty bit (with FPB_IGNORE_DIRTY) and soft-dirty bit + * (with FPB_IGNORE_SOFT_DIRTY). */ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, - pte_t *start_ptep, pte_t pte, int max_nr) + pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags) { unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); const pte_t *end_ptep = start_ptep + max_nr; - pte_t expected_pte = pte_next_pfn(pte); + pte_t expected_pte = __pte_batch_clear_ignored(pte_next_pfn(pte), flags); pte_t *ptep = start_ptep + 1; VM_WARN_ON_FOLIO(!pte_present(pte), folio); while (ptep != end_ptep) { - pte = ptep_get(ptep); + pte = __pte_batch_clear_ignored(ptep_get(ptep), flags); if (!pte_same(pte, expected_pte)) break; @@ -1004,6 +1024,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma { struct page *page; struct folio *folio; + fpb_t flags = 0; int err, nr; page = vm_normal_page(src_vma, addr, pte); @@ -1018,7 +1039,12 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * by keeping the batching logic separate. */ if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { - nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr); + if (src_vma->vm_flags & VM_SHARED) + flags |= FPB_IGNORE_DIRTY; + if (!vma_soft_dirty_enabled(src_vma)) + flags |= FPB_IGNORE_SOFT_DIRTY; + + nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, From d7c0e5f722ab229153c22efc836bf220479bdce6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:49 +0100 Subject: [PATCH 132/435] mm/memory: ignore writable bit in folio_pte_batch() ... and conditionally return to the caller if any PTE except the first one is writable. fork() has to make sure to properly write-protect in case any PTE is writable. Other users (e.g., page unmaping) are expected to not care. Link: https://lkml.kernel.org/r/20240129124649.189745-16-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/memory.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index ec5741f37c77..e5e5056cb53f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -968,7 +968,7 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) pte = pte_mkclean(pte); if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) pte = pte_clear_soft_dirty(pte); - return pte_mkold(pte); + return pte_wrprotect(pte_mkold(pte)); } /* @@ -976,21 +976,32 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) * pages of the same folio. * * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, - * the accessed bit, dirty bit (with FPB_IGNORE_DIRTY) and soft-dirty bit - * (with FPB_IGNORE_SOFT_DIRTY). + * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and + * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY). + * + * If "any_writable" is set, it will indicate if any other PTE besides the + * first (given) PTE is writable. */ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, - pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags) + pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, + bool *any_writable) { unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); const pte_t *end_ptep = start_ptep + max_nr; pte_t expected_pte = __pte_batch_clear_ignored(pte_next_pfn(pte), flags); pte_t *ptep = start_ptep + 1; + bool writable; + + if (any_writable) + *any_writable = false; VM_WARN_ON_FOLIO(!pte_present(pte), folio); while (ptep != end_ptep) { - pte = __pte_batch_clear_ignored(ptep_get(ptep), flags); + pte = ptep_get(ptep); + if (any_writable) + writable = !!pte_write(pte); + pte = __pte_batch_clear_ignored(pte, flags); if (!pte_same(pte, expected_pte)) break; @@ -1003,6 +1014,9 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, if (pte_pfn(pte) == folio_end_pfn) break; + if (any_writable) + *any_writable |= writable; + expected_pte = pte_next_pfn(expected_pte); ptep++; } @@ -1024,6 +1038,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma { struct page *page; struct folio *folio; + bool any_writable; fpb_t flags = 0; int err, nr; @@ -1044,7 +1059,8 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma if (!vma_soft_dirty_enabled(src_vma)) flags |= FPB_IGNORE_SOFT_DIRTY; - nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags); + nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, + &any_writable); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, @@ -1058,6 +1074,8 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma folio_dup_file_rmap_ptes(folio, page, nr); rss[mm_counter_file(folio)] += nr; } + if (any_writable) + pte = pte_mkwrite(pte, src_vma); __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, nr); return nr; From d2d20f08e93a417199107abab795c5d40029eaae Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 29 Jan 2024 03:52:46 -0800 Subject: [PATCH 133/435] selftests/mm: run_vmtests.sh: add hugetlb test category The usage of run_vmtests.sh does not include hugetlb, which is a valid test category. Add the 'hugetlb' to the usage of run_vmtests.sh. Link: https://lkml.kernel.org/r/20240129115246.1234253-1-leitao@debian.org Signed-off-by: Breno Leitao Reviewed-by: Muhammad Usama Anjum Reviewed-by: Joel Savitz Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 040f27e21f47..81b5980886da 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -64,6 +64,8 @@ separated by spaces: test copy-on-write semantics - thp test transparent huge pages +- hugetlb + test hugetlbfs huge pages - migration invoke move_pages(2) to exercise the migration entry code paths in the kernel From 2c8b947416a9120d50fd165738de38400f9a0933 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Sat, 3 Feb 2024 09:46:32 +0800 Subject: [PATCH 134/435] mm/mmap: pass vma to vma_merge() These vma_merge() callers will pass mm, anon_vma and file, they all from the same vma. There is no need to pass three parameters at the same time. Pass vma instead of mm, anon_vma and file to vma_merge(), so that it can save two parameters. Link: https://lkml.kernel.org/r/20240203014632.2726545-1-yajun.deng@linux.dev Link: https://lore.kernel.org/lkml/20240125034922.1004671-2-yajun.deng@linux.dev/ Signed-off-by: Yajun Deng Reviewed-by: Liam R. Howlett Cc: Yajun Deng Signed-off-by: Andrew Morton --- mm/mmap.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 1f9e70242858..ccf377ee319f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -860,13 +860,15 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * area is returned, or the function will return NULL */ static struct vm_area_struct -*vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *prev, unsigned long addr, unsigned long end, - unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy, +*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev, + struct vm_area_struct *src, unsigned long addr, unsigned long end, + unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, struct anon_vma_name *anon_name) { + struct mm_struct *mm = src->vm_mm; + struct anon_vma *anon_vma = src->anon_vma; + struct file *file = src->vm_file; struct vm_area_struct *curr, *next, *res; struct vm_area_struct *vma, *adjust, *remove, *remove2; struct vm_area_struct *anon_dup = NULL; @@ -2426,9 +2428,8 @@ struct vm_area_struct *vma_modify(struct vma_iterator *vmi, pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); struct vm_area_struct *merged; - merged = vma_merge(vmi, vma->vm_mm, prev, start, end, vm_flags, - vma->anon_vma, vma->vm_file, pgoff, policy, - uffd_ctx, anon_name); + merged = vma_merge(vmi, prev, vma, start, end, vm_flags, + pgoff, policy, uffd_ctx, anon_name); if (merged) return merged; @@ -2458,9 +2459,8 @@ static struct vm_area_struct struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { - return vma_merge(vmi, vma->vm_mm, prev, start, end, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff, + vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); } /* @@ -2474,10 +2474,9 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma); /* vma is specified as prev, so case 1 or 2 will apply. */ - return vma_merge(vmi, vma->vm_mm, vma, vma->vm_end, vma->vm_end + delta, - vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, - vma_policy(vma), vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); + return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta, + vma->vm_flags, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); } /* From 287d5fedb377ddc232b216b882723305b27ae31a Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Fri, 2 Feb 2024 23:38:54 +0000 Subject: [PATCH 135/435] mm: memcg: use larger batches for proactive reclaim Before 388536ac291 ("mm:vmscan: fix inaccurate reclaim during proactive reclaim") we passed the number of pages for the reclaim request directly to try_to_free_mem_cgroup_pages, which could lead to significant overreclaim. After 0388536ac291 the number of pages was limited to a maximum 32 (SWAP_CLUSTER_MAX) to reduce the amount of overreclaim. However such a small batch size caused a regression in reclaim performance due to many more reclaim start/stop cycles inside memory_reclaim. The restart cost is amortized over more pages with larger batch sizes, and becomes a significant component of the runtime if the batch size is too small. Reclaim tries to balance nr_to_reclaim fidelity with fairness across nodes and cgroups over which the pages are spread. As such, the bigger the request, the bigger the absolute overreclaim error. Historic in-kernel users of reclaim have used fixed, small sized requests to approach an appropriate reclaim rate over time. When we reclaim a user request of arbitrary size, use decaying batch sizes to manage error while maintaining reasonable throughput. MGLRU enabled - memcg LRU used root - full reclaim pages/sec time (sec) pre-0388536ac291 : 68047 10.46 post-0388536ac291 : 13742 inf (reclaim-reclaimed)/4 : 67352 10.51 MGLRU enabled - memcg LRU not used /uid_0 - 1G reclaim pages/sec time (sec) overreclaim (MiB) pre-0388536ac291 : 258822 1.12 107.8 post-0388536ac291 : 105174 2.49 3.5 (reclaim-reclaimed)/4 : 233396 1.12 -7.4 MGLRU enabled - memcg LRU not used /uid_0 - full reclaim pages/sec time (sec) pre-0388536ac291 : 72334 7.09 post-0388536ac291 : 38105 14.45 (reclaim-reclaimed)/4 : 72914 6.96 [tjmercier@google.com: v4] Link: https://lkml.kernel.org/r/20240206175251.3364296-1-tjmercier@google.com Link: https://lkml.kernel.org/r/20240202233855.1236422-1-tjmercier@google.com Fixes: 0388536ac291 ("mm:vmscan: fix inaccurate reclaim during proactive reclaim") Signed-off-by: T.J. Mercier Reviewed-by: Yosry Ahmed Acked-by: Johannes Weiner Reviewed-by: Michal Koutny Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Muchun Song Cc: Efly Young Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/memcontrol.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 484a9d2862d4..cb216d30a221 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6981,6 +6981,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; while (nr_reclaimed < nr_to_reclaim) { + /* Will converge on zero, but reclaim enforces a minimum */ + unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; unsigned long reclaimed; if (signal_pending(current)) @@ -6995,8 +6997,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, lru_add_drain_all(); reclaimed = try_to_free_mem_cgroup_pages(memcg, - min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX), - GFP_KERNEL, reclaim_options); + batch_size, GFP_KERNEL, reclaim_options); if (!reclaimed && !nr_retries--) return -EAGAIN; From 09dacb7875395b8761cde921fff767a7cd3ab862 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 2 Feb 2024 22:23:18 +0100 Subject: [PATCH 136/435] mm: reduce dependencies on "page_counter.h" does not need . is enough to get LONG_MAX. Files that include page_counter.h are limited. They have been compile tested or checked. $ git grep page_counter\.h include/linux/hugetlb_cgroup.h: struct page_counter hugepage[HUGE_MAX_HSTATE]; --> all files that include it have been compile tested include/linux/memcontrol.h:#include --> has been added, to be safe include/net/sock.h:#include --> already include mm/hugetlb_cgroup.c:#include mm/memcontrol.c:#include mm/page_counter.c:#include --> compile tested Link: https://lkml.kernel.org/r/adfdbe21c4d06400d7bd802868762deb85cae8b6.1706908921.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 1 + include/linux/page_counter.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 20ff87f8e001..4e4caeaea404 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index c141ea9a95ef..8cd858d912c4 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include struct page_counter { From 4e76c8cc3378a20923965e3345f40f6b8ae0bdba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paul=20Heidekr=C3=BCger?= Date: Fri, 2 Feb 2024 11:32:59 +0000 Subject: [PATCH 137/435] kasan: add atomic tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test that KASan can detect some unsafe atomic accesses. As discussed in the linked thread below, these tests attempt to cover the most common uses of atomics and, therefore, aren't exhaustive. Link: https://lkml.kernel.org/r/20240202113259.3045705-1-paul.heidekrueger@tum.de Link: https://lore.kernel.org/all/20240131210041.686657-1-paul.heidekrueger@tum.de/T/#u Signed-off-by: Paul Heidekrüger Closes: https://bugzilla.kernel.org/show_bug.cgi?id=214055 Acked-by: Mark Rutland Cc: Marco Elver Cc: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/kasan_test.c | 79 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 971cfff4ca0b..318d9cec111a 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -697,6 +697,84 @@ static void kmalloc_uaf3(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]); } +static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe) +{ + int *i_unsafe = (int *)unsafe; + + KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*i_unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, WRITE_ONCE(*i_unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, smp_load_acquire(i_unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, smp_store_release(i_unsafe, 42)); + + KUNIT_EXPECT_KASAN_FAIL(test, atomic_read(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_set(unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_add(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_and(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_andnot(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_or(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_xor(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_xchg(unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_cmpxchg(unsafe, 21, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(unsafe, safe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(safe, unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub_and_test(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_and_test(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_and_test(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_negative(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_unless(unsafe, 21, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_not_zero(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_unless_negative(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_unless_positive(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_if_positive(unsafe)); + + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_read(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_set(unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_and(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_andnot(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_or(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_xor(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_xchg(unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_cmpxchg(unsafe, 21, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(unsafe, safe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(safe, unsafe, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub_and_test(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_and_test(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_and_test(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_negative(42, unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_unless(unsafe, 21, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_not_zero(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_unless_negative(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_unless_positive(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_if_positive(unsafe)); +} + +static void kasan_atomics(struct kunit *test) +{ + void *a1, *a2; + + /* + * Just as with kasan_bitops_tags(), we allocate 48 bytes of memory such + * that the following 16 bytes will make up the redzone. + */ + a1 = kzalloc(48, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a1); + a2 = kzalloc(sizeof(int), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a1); + + /* Use atomics to access the redzone. */ + kasan_atomics_helper(test, a1 + 48, a2); + + kfree(a1); + kfree(a2); +} + static void kmalloc_double_kzfree(struct kunit *test) { char *ptr; @@ -1883,6 +1961,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kasan_strings), KUNIT_CASE(kasan_bitops_generic), KUNIT_CASE(kasan_bitops_tags), + KUNIT_CASE(kasan_atomics), KUNIT_CASE(vmalloc_helpers_tags), KUNIT_CASE(vmalloc_oob), KUNIT_CASE(vmap_tags), From df7a6d1f64056aec572162c5d35ed9ff86ece6f3 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 5 Feb 2024 11:18:41 -0800 Subject: [PATCH 138/435] mm/hugetlb: restore the reservation if needed Patch series "mm/hugetlb: Restore the reservation", v2. This is a fix for a case where a backing huge page could stolen after madvise(MADV_DONTNEED). A full reproducer is in selftest. See https://lore.kernel.org/all/20240105155419.1939484-1-leitao@debian.org/ In order to test this patch, I instrumented the kernel with LOCKDEP and KASAN, and run the following tests, without any regression: * The self test that reproduces the problem * All mm hugetlb selftests SUMMARY: PASS=9 SKIP=0 FAIL=0 * All libhugetlbfs tests PASS: 0 86 FAIL: 0 0 This patch (of 2): Currently there is a bug that a huge page could be stolen, and when the original owner tries to fault in it, it causes a page fault. You can achieve that by: 1) Creating a single page echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages 2) mmap() the page above with MAP_HUGETLB into (void *ptr1). * This will mark the page as reserved 3) touch the page, which causes a page fault and allocates the page * This will move the page out of the free list. * It will also unreserved the page, since there is no more free page 4) madvise(MADV_DONTNEED) the page * This will free the page, but not mark it as reserved. 5) Allocate a secondary page with mmap(MAP_HUGETLB) into (void *ptr2). * it should fail, but, since there is no more available page. * But, since the page above is not reserved, this mmap() succeed. 6) Faulting at ptr1 will cause a SIGBUS * it will try to allocate a huge page, but there is none available A full reproducer is in selftest. See https://lore.kernel.org/all/20240105155419.1939484-1-leitao@debian.org/ Fix this by restoring the reserved page if necessary. These are the condition for the page restore: * The system is not using surplus pages. The goal is to reduce the surplus usage for this case. * If the VMA has the HPAGE_RESV_OWNER flag set, and is PRIVATE. This is safely checked using __vma_private_lock() * The page is anonymous Once this is scenario is found, set the `hugetlb_restore_reserve` bit in the folio. Then check if the resv reservations need to be adjusted later, done later, after the spinlock, since the vma_xxxx_reservation() might touch the file system lock. Link: https://lkml.kernel.org/r/20240205191843.4009640-1-leitao@debian.org Link: https://lkml.kernel.org/r/20240205191843.4009640-2-leitao@debian.org Signed-off-by: Breno Leitao Suggested-by: Rik van Riel Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ed1581b670d4..44f1e6366d04 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5585,6 +5585,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + bool adjust_reservation = false; unsigned long last_addr_mask; bool force_flush = false; @@ -5677,7 +5678,31 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, hugetlb_count_sub(pages_per_huge_page(h), mm); hugetlb_remove_rmap(page_folio(page)); + /* + * Restore the reservation for anonymous page, otherwise the + * backing page could be stolen by someone. + * If there we are freeing a surplus, do not set the restore + * reservation bit. + */ + if (!h->surplus_huge_pages && __vma_private_lock(vma) && + folio_test_anon(page_folio(page))) { + folio_set_hugetlb_restore_reserve(page_folio(page)); + /* Reservation to be adjusted after the spin lock */ + adjust_reservation = true; + } + spin_unlock(ptl); + + /* + * Adjust the reservation for the region that will have the + * reserve restored. Keep in mind that vma_needs_reservation() changes + * resv->adds_in_progress if it succeeds. If this is not done, + * do_exit() will not see it, and will keep the reservation + * forever. + */ + if (adjust_reservation && vma_needs_reservation(h, vma, address)) + vma_add_reservation(h, vma, address); + tlb_remove_page_size(tlb, page, huge_page_size(h)); /* * Bail out after unmapping reference page if supplied From f81ed7c4e1dea1f90aa2304548c59eb5edcc3bd7 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 5 Feb 2024 11:18:42 -0800 Subject: [PATCH 139/435] selftests/mm: run_vmtests.sh: add hugetlb_madv_vs_map hugetlb_madv_vs_map selftest was not part of the mm test-suite since we didn't have a fix for the problem it found. Now that the problem is already fixed (see previous commit), let's enable this selftest in the default test-suite. Link: https://lkml.kernel.org/r/20240205191843.4009640-3-leitao@debian.org Signed-off-by: Breno Leitao Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Rik van Riel Cc: Roman Gushchin Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 81b5980886da..de03d38907d6 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -264,6 +264,7 @@ nr_hugepages_tmp=$(cat /proc/sys/vm/nr_hugepages) # For this test, we need one and just one huge page echo 1 > /proc/sys/vm/nr_hugepages CATEGORY="hugetlb" run_test ./hugetlb_fault_after_madv +CATEGORY="hugetlb" run_test ./hugetlb_madv_vs_map # Restore the previous number of huge pages, since further tests rely on it echo "$nr_hugepages_tmp" > /proc/sys/vm/nr_hugepages From d8310914848223de7ec04d55bd15f013f0dad803 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 5 Feb 2024 14:09:21 +0800 Subject: [PATCH 140/435] kasan: docs: update descriptions about test file and module After commit f7e01ab828fd ("kasan: move tests to mm/kasan/"), the test file is renamed to mm/kasan/kasan_test.c and the test module is renamed to kasan_test.ko, so update the descriptions in the document. While at it, update the line number and testcase number when the tests kmalloc_large_oob_right and kmalloc_double_kzfree failed to sync with the current code in mm/kasan/kasan_test.c. Link: https://lkml.kernel.org/r/20240205060925.15594-2-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Acked-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/dev-tools/kasan.rst | 20 +++++++++---------- .../translations/zh_CN/dev-tools/kasan.rst | 20 +++++++++---------- .../translations/zh_TW/dev-tools/kasan.rst | 20 +++++++++---------- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 858c77fe7dc4..a5a6dbe9029f 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -169,7 +169,7 @@ Error reports A typical KASAN report looks like this:: ================================================================== - BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [test_kasan] + BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [kasan_test] Write of size 1 at addr ffff8801f44ec37b by task insmod/2760 CPU: 1 PID: 2760 Comm: insmod Not tainted 4.19.0-rc3+ #698 @@ -179,8 +179,8 @@ A typical KASAN report looks like this:: print_address_description+0x73/0x280 kasan_report+0x144/0x187 __asan_report_store1_noabort+0x17/0x20 - kmalloc_oob_right+0xa8/0xbc [test_kasan] - kmalloc_tests_init+0x16/0x700 [test_kasan] + kmalloc_oob_right+0xa8/0xbc [kasan_test] + kmalloc_tests_init+0x16/0x700 [kasan_test] do_one_initcall+0xa5/0x3ae do_init_module+0x1b6/0x547 load_module+0x75df/0x8070 @@ -200,8 +200,8 @@ A typical KASAN report looks like this:: save_stack+0x43/0xd0 kasan_kmalloc+0xa7/0xd0 kmem_cache_alloc_trace+0xe1/0x1b0 - kmalloc_oob_right+0x56/0xbc [test_kasan] - kmalloc_tests_init+0x16/0x700 [test_kasan] + kmalloc_oob_right+0x56/0xbc [kasan_test] + kmalloc_tests_init+0x16/0x700 [kasan_test] do_one_initcall+0xa5/0x3ae do_init_module+0x1b6/0x547 load_module+0x75df/0x8070 @@ -510,15 +510,15 @@ When a test passes:: When a test fails due to a failed ``kmalloc``:: - # kmalloc_large_oob_right: ASSERTION FAILED at lib/test_kasan.c:163 + # kmalloc_large_oob_right: ASSERTION FAILED at mm/kasan/kasan_test.c:245 Expected ptr is not null, but is - not ok 4 - kmalloc_large_oob_right + not ok 5 - kmalloc_large_oob_right When a test fails due to a missing KASAN report:: - # kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:974 + # kmalloc_double_kzfree: EXPECTATION FAILED at mm/kasan/kasan_test.c:709 KASAN failure expected in "kfree_sensitive(ptr)", but none occurred - not ok 44 - kmalloc_double_kzfree + not ok 28 - kmalloc_double_kzfree At the end the cumulative status of all KASAN tests is printed. On success:: @@ -534,7 +534,7 @@ There are a few ways to run KUnit-compatible KASAN tests. 1. Loadable module With ``CONFIG_KUNIT`` enabled, KASAN-KUnit tests can be built as a loadable - module and run by loading ``test_kasan.ko`` with ``insmod`` or ``modprobe``. + module and run by loading ``kasan_test.ko`` with ``insmod`` or ``modprobe``. 2. Built-In diff --git a/Documentation/translations/zh_CN/dev-tools/kasan.rst b/Documentation/translations/zh_CN/dev-tools/kasan.rst index 8fdb20c9665b..2b1e8f74904b 100644 --- a/Documentation/translations/zh_CN/dev-tools/kasan.rst +++ b/Documentation/translations/zh_CN/dev-tools/kasan.rst @@ -137,7 +137,7 @@ KASAN受到通用 ``panic_on_warn`` 命令行参数的影响。当它被启用 典型的KASAN报告如下所示:: ================================================================== - BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [test_kasan] + BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [kasan_test] Write of size 1 at addr ffff8801f44ec37b by task insmod/2760 CPU: 1 PID: 2760 Comm: insmod Not tainted 4.19.0-rc3+ #698 @@ -147,8 +147,8 @@ KASAN受到通用 ``panic_on_warn`` 命令行参数的影响。当它被启用 print_address_description+0x73/0x280 kasan_report+0x144/0x187 __asan_report_store1_noabort+0x17/0x20 - kmalloc_oob_right+0xa8/0xbc [test_kasan] - kmalloc_tests_init+0x16/0x700 [test_kasan] + kmalloc_oob_right+0xa8/0xbc [kasan_test] + kmalloc_tests_init+0x16/0x700 [kasan_test] do_one_initcall+0xa5/0x3ae do_init_module+0x1b6/0x547 load_module+0x75df/0x8070 @@ -168,8 +168,8 @@ KASAN受到通用 ``panic_on_warn`` 命令行参数的影响。当它被启用 save_stack+0x43/0xd0 kasan_kmalloc+0xa7/0xd0 kmem_cache_alloc_trace+0xe1/0x1b0 - kmalloc_oob_right+0x56/0xbc [test_kasan] - kmalloc_tests_init+0x16/0x700 [test_kasan] + kmalloc_oob_right+0x56/0xbc [kasan_test] + kmalloc_tests_init+0x16/0x700 [kasan_test] do_one_initcall+0xa5/0x3ae do_init_module+0x1b6/0x547 load_module+0x75df/0x8070 @@ -421,15 +421,15 @@ KASAN连接到vmap基础架构以懒清理未使用的影子内存。 当由于 ``kmalloc`` 失败而导致测试失败时:: - # kmalloc_large_oob_right: ASSERTION FAILED at lib/test_kasan.c:163 + # kmalloc_large_oob_right: ASSERTION FAILED at mm/kasan/kasan_test.c:245 Expected ptr is not null, but is - not ok 4 - kmalloc_large_oob_right + not ok 5 - kmalloc_large_oob_right 当由于缺少KASAN报告而导致测试失败时:: - # kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:974 + # kmalloc_double_kzfree: EXPECTATION FAILED at mm/kasan/kasan_test.c:709 KASAN failure expected in "kfree_sensitive(ptr)", but none occurred - not ok 44 - kmalloc_double_kzfree + not ok 28 - kmalloc_double_kzfree 最后打印所有KASAN测试的累积状态。成功:: @@ -445,7 +445,7 @@ KASAN连接到vmap基础架构以懒清理未使用的影子内存。 1. 可加载模块 启用 ``CONFIG_KUNIT`` 后,KASAN-KUnit测试可以构建为可加载模块,并通过使用 - ``insmod`` 或 ``modprobe`` 加载 ``test_kasan.ko`` 来运行。 + ``insmod`` 或 ``modprobe`` 加载 ``kasan_test.ko`` 来运行。 2. 内置 diff --git a/Documentation/translations/zh_TW/dev-tools/kasan.rst b/Documentation/translations/zh_TW/dev-tools/kasan.rst index 979eb84bc58f..ed342e67d8ed 100644 --- a/Documentation/translations/zh_TW/dev-tools/kasan.rst +++ b/Documentation/translations/zh_TW/dev-tools/kasan.rst @@ -137,7 +137,7 @@ KASAN受到通用 ``panic_on_warn`` 命令行參數的影響。當它被啓用 典型的KASAN報告如下所示:: ================================================================== - BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [test_kasan] + BUG: KASAN: slab-out-of-bounds in kmalloc_oob_right+0xa8/0xbc [kasan_test] Write of size 1 at addr ffff8801f44ec37b by task insmod/2760 CPU: 1 PID: 2760 Comm: insmod Not tainted 4.19.0-rc3+ #698 @@ -147,8 +147,8 @@ KASAN受到通用 ``panic_on_warn`` 命令行參數的影響。當它被啓用 print_address_description+0x73/0x280 kasan_report+0x144/0x187 __asan_report_store1_noabort+0x17/0x20 - kmalloc_oob_right+0xa8/0xbc [test_kasan] - kmalloc_tests_init+0x16/0x700 [test_kasan] + kmalloc_oob_right+0xa8/0xbc [kasan_test] + kmalloc_tests_init+0x16/0x700 [kasan_test] do_one_initcall+0xa5/0x3ae do_init_module+0x1b6/0x547 load_module+0x75df/0x8070 @@ -168,8 +168,8 @@ KASAN受到通用 ``panic_on_warn`` 命令行參數的影響。當它被啓用 save_stack+0x43/0xd0 kasan_kmalloc+0xa7/0xd0 kmem_cache_alloc_trace+0xe1/0x1b0 - kmalloc_oob_right+0x56/0xbc [test_kasan] - kmalloc_tests_init+0x16/0x700 [test_kasan] + kmalloc_oob_right+0x56/0xbc [kasan_test] + kmalloc_tests_init+0x16/0x700 [kasan_test] do_one_initcall+0xa5/0x3ae do_init_module+0x1b6/0x547 load_module+0x75df/0x8070 @@ -421,15 +421,15 @@ KASAN連接到vmap基礎架構以懶清理未使用的影子內存。 當由於 ``kmalloc`` 失敗而導致測試失敗時:: - # kmalloc_large_oob_right: ASSERTION FAILED at lib/test_kasan.c:163 + # kmalloc_large_oob_right: ASSERTION FAILED at mm/kasan/kasan_test.c:245 Expected ptr is not null, but is - not ok 4 - kmalloc_large_oob_right + not ok 5 - kmalloc_large_oob_right 當由於缺少KASAN報告而導致測試失敗時:: - # kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:974 + # kmalloc_double_kzfree: EXPECTATION FAILED at mm/kasan/kasan_test.c:709 KASAN failure expected in "kfree_sensitive(ptr)", but none occurred - not ok 44 - kmalloc_double_kzfree + not ok 28 - kmalloc_double_kzfree 最後打印所有KASAN測試的累積狀態。成功:: @@ -445,7 +445,7 @@ KASAN連接到vmap基礎架構以懶清理未使用的影子內存。 1. 可加載模塊 啓用 ``CONFIG_KUNIT`` 後,KASAN-KUnit測試可以構建爲可加載模塊,並通過使用 - ``insmod`` 或 ``modprobe`` 加載 ``test_kasan.ko`` 來運行。 + ``insmod`` 或 ``modprobe`` 加載 ``kasan_test.ko`` 來運行。 2. 內置 From be142b80803049423944ca77be0905564af9a46c Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 5 Feb 2024 14:09:22 +0800 Subject: [PATCH 141/435] kasan: rename test_kasan_module_init to kasan_test_module_init After commit f7e01ab828fd ("kasan: move tests to mm/kasan/"), the test module file is renamed from lib/test_kasan_module.c to mm/kasan/kasan_test_module.c, in order to keep consistent, rename test_kasan_module_init to kasan_test_module_init. Link: https://lkml.kernel.org/r/20240205060925.15594-3-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Acked-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/kasan/kasan_test_module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c index 8b7b3ea2c74e..27ec22767e42 100644 --- a/mm/kasan/kasan_test_module.c +++ b/mm/kasan/kasan_test_module.c @@ -62,7 +62,7 @@ static noinline void __init copy_user_test(void) kfree(kmem); } -static int __init test_kasan_module_init(void) +static int __init kasan_test_module_init(void) { /* * Temporarily enable multi-shot mode. Otherwise, KASAN would only @@ -77,5 +77,5 @@ static int __init test_kasan_module_init(void) return -EAGAIN; } -module_init(test_kasan_module_init); +module_init(kasan_test_module_init); MODULE_LICENSE("GPL"); From fe58582c0e362d5e52b97985142b0dcef920b745 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 5 Feb 2024 08:46:47 +0530 Subject: [PATCH 142/435] mm/cma: drop CONFIG_CMA_DEBUG All pr_debug() prints in (mm/cma.c) could be enabled via standard Makefile based method. Besides cma_debug_show_areas() should always be called during cma_alloc() failure path. This seemingly redundant config, CONFIG_CMA_DEBUG can be dropped without any problem. [lukas.bulwahn@gmail.com: remove debug code to removed CONFIG_CMA_DEBUG] Link: https://lkml.kernel.org/r/20240207143825.986-1-lukas.bulwahn@gmail.com Link: https://lkml.kernel.org/r/20240205031647.283510-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Signed-off-by: Lukas Bulwahn Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Signed-off-by: Andrew Morton --- kernel/dma/contiguous.c | 6 ------ mm/Kconfig | 9 --------- mm/cma.c | 9 --------- 3 files changed, 24 deletions(-) diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index f005c66f378c..055da410ac71 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -37,12 +37,6 @@ #define pr_fmt(fmt) "cma: " fmt -#ifdef CONFIG_CMA_DEBUG -#ifndef DEBUG -# define DEBUG -#endif -#endif - #include #include diff --git a/mm/Kconfig b/mm/Kconfig index ffc3a2ba3a8c..35fa9940e61f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -901,15 +901,6 @@ config CMA If unsure, say "n". -config CMA_DEBUG - bool "CMA debug messages (DEVELOPMENT)" - depends on DEBUG_KERNEL && CMA - help - Turns on debug messages in CMA. This produces KERN_DEBUG - messages for every CMA call as well as various messages while - processing calls such as dma_alloc_from_contiguous(). - This option does not affect warning and error messages. - config CMA_DEBUGFS bool "CMA debugfs interface" depends on CMA && DEBUG_FS diff --git a/mm/cma.c b/mm/cma.c index b6720930312d..4902bbfe24f1 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -14,11 +14,6 @@ #define pr_fmt(fmt) "cma: " fmt -#ifdef CONFIG_CMA_DEBUG -#ifndef DEBUG -# define DEBUG -#endif -#endif #define CREATE_TRACE_POINTS #include @@ -387,7 +382,6 @@ err: return ret; } -#ifdef CONFIG_CMA_DEBUG static void cma_debug_show_areas(struct cma *cma) { unsigned long next_zero_bit, next_set_bit, nr_zero; @@ -412,9 +406,6 @@ static void cma_debug_show_areas(struct cma *cma) pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count); spin_unlock_irq(&cma->lock); } -#else -static inline void cma_debug_show_areas(struct cma *cma) { } -#endif /** * cma_alloc() - allocate pages from contiguous area From 73307523c9bbc4e3b35f0058cdbc15e32bd83c52 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 5 Feb 2024 10:49:29 +0530 Subject: [PATCH 143/435] mm/cma: make MAX_CMA_AREAS = CONFIG_CMA_AREAS There is no real difference between the global area, and other additionally configured CMA areas via CONFIG_CMA_AREAS that always defaults without user input. This makes MAX_CMA_AREAS same as CONFIG_CMA_AREAS, also incrementing its default values, thus maintaining current default for MAX_CMA_AREAS both for UMA and NUMA systems. Link: https://lkml.kernel.org/r/20240205051929.298559-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton --- include/linux/cma.h | 6 +----- mm/Kconfig | 6 +++--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/include/linux/cma.h b/include/linux/cma.h index 63873b93deaa..9db877506ea8 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -6,12 +6,8 @@ #include #include -/* - * There is always at least global CMA area and a few optional - * areas configured in kernel .config. - */ #ifdef CONFIG_CMA_AREAS -#define MAX_CMA_AREAS (1 + CONFIG_CMA_AREAS) +#define MAX_CMA_AREAS CONFIG_CMA_AREAS #endif #define CMA_MAX_NAME 64 diff --git a/mm/Kconfig b/mm/Kconfig index 35fa9940e61f..88ba99d84ac3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -917,14 +917,14 @@ config CMA_SYSFS config CMA_AREAS int "Maximum count of the CMA areas" depends on CMA - default 19 if NUMA - default 7 + default 20 if NUMA + default 8 help CMA allows to create CMA areas for particular purpose, mainly, used as device private area. This parameter sets the maximum number of CMA area in the system. - If unsure, leave the default value "7" in UMA and "19" in NUMA. + If unsure, leave the default value "8" in UMA and "20" in NUMA. config MEM_SOFT_DIRTY bool "Track memory changes" From 98141718528526ef080a9bb78a40f79aca0774a7 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Mon, 5 Feb 2024 12:26:18 +0800 Subject: [PATCH 144/435] mm/vmscan: make too_many_isolated return bool too_many_isolated() should return bool as does the similar too_many_isolated() in mm/compaction.c. Link: https://lkml.kernel.org/r/20240205042618.108140-1-gehao@kylinos.cn Signed-off-by: Hao Ge Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/vmscan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8e52f8795d20..327bf904fdcd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1744,17 +1744,17 @@ bool folio_isolate_lru(struct folio *folio) * the LRU list will go small and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM. */ -static int too_many_isolated(struct pglist_data *pgdat, int file, +static bool too_many_isolated(struct pglist_data *pgdat, int file, struct scan_control *sc) { unsigned long inactive, isolated; bool too_many; if (current_is_kswapd()) - return 0; + return false; if (!writeback_throttling_sane(sc)) - return 0; + return false; if (file) { inactive = node_page_state(pgdat, NR_INACTIVE_FILE); From e374ae2be2f7cb4aad46e17e3fa5da7bbb0d2a09 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Sun, 4 Feb 2024 10:56:44 -0300 Subject: [PATCH 145/435] memory tier: make memory_tier_subsys const Now that the driver core can properly handle constant struct bus_type, move the memory_tier_subsys variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Link: https://lkml.kernel.org/r/20240204-bus_cleanup-mm-v1-1-00f49286f164@marliere.net Signed-off-by: Ricardo B. Marliere Suggested-by: Greg Kroah-Hartman Reviewed-by: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- mm/memory-tiers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 5462d9e3c84c..ed20f96bf89d 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -39,7 +39,7 @@ static LIST_HEAD(memory_tiers); static struct node_memory_type_map node_memory_types[MAX_NUMNODES]; struct memory_dev_type *default_dram_type; -static struct bus_type memory_tier_subsys = { +static const struct bus_type memory_tier_subsys = { .name = "memory_tiering", .dev_name = "memory_tier", }; From f9c0f1c32cb568e16ef0676d8e7827a3ad443742 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:05:59 +0000 Subject: [PATCH 146/435] mm/zswap: add more comments in shrink_memcg_cb() Patch series "mm/zswap: optimize zswap lru list", v2. This series is motivated when observe the zswap lru list shrinking, noted there are some unexpected cases in zswap_writeback_entry(). bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' There are some -ENOMEM because when the swap entry is freed to per-cpu swap pool, it doesn't invalidate/drop zswap entry. Then the shrinker encounter these trashy zswap entries, it can't be reclaimed and return -ENOMEM. So move the invalidation ahead to when swap entry freed to the per-cpu swap pool, since there is no any benefit to leave trashy zswap entries on the zswap tree and lru list. Another case is -EEXIST, which is seen more in the case of !zswap_exclusive_loads_enabled, in which case the swapin folio will leave compressed copy on the tree and lru list. And it can't be reclaimed until the folio is removed from swapcache. Changing to zswap_exclusive_loads_enabled mode will invalidate when folio swapin, which has its own drawback if that folio is still clean in swapcache and swapout again, we need to compress it again. Please see the commit for details on why we choose exclusive load as the default for zswap. Another optimization for -EEXIST is that we add LRU_STOP to support terminating the shrinking process to avoid evicting warmer region. Testing using kernel build in tmpfs, one 50GB swapfile and zswap shrinker_enabled, with memory.max set to 2GB. mm-unstable zswap-optimize real 63.90s 63.25s user 1064.05s 1063.40s sys 292.32s 270.94s The main optimization is in sys cpu, about 7% improvement. This patch (of 6): Add more comments in shrink_memcg_cb() to describe the deref dance which is implemented to fix race problem between lru writeback and swapoff, and the reason why we rotate the entry at the beginning. Also fix the stale comments in zswap_writeback_entry(), and add more comments to state that we only deref the tree after we get the swapcache reference. Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-0-99d4084260a0@bytedance.com Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-1-99d4084260a0@bytedance.com Signed-off-by: Johannes Weiner Signed-off-by: Chengming Zhou Suggested-by: Yosry Ahmed Suggested-by: Johannes Weiner Acked-by: Yosry Ahmed Reviewed-by: Nhat Pham Signed-off-by: Andrew Morton --- mm/zswap.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 2bf4bf1d356c..35da20d3617f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1207,10 +1207,12 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /* * folio is locked, and the swapcache is now secured against - * concurrent swapping to and from the slot. Verify that the - * swap entry hasn't been invalidated and recycled behind our - * backs (our zswap_entry reference doesn't prevent that), to - * avoid overwriting a new swap folio with old compressed data. + * concurrent swapping to and from the slot, and concurrent + * swapoff so we can safely dereference the zswap tree here. + * Verify that the swap entry hasn't been invalidated and recycled + * behind our backs, to avoid overwriting a new swap folio with + * old compressed data. Only when this is successful can the entry + * be dereferenced. */ tree = swap_zswap_tree(swpentry); spin_lock(&tree->lock); @@ -1263,22 +1265,29 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o int writeback_result; /* - * Rotate the entry to the tail before unlocking the LRU, - * so that in case of an invalidation race concurrent - * reclaimers don't waste their time on it. + * As soon as we drop the LRU lock, the entry can be freed by + * a concurrent invalidation. This means the following: * - * If writeback succeeds, or failure is due to the entry - * being invalidated by the swap subsystem, the invalidation - * will unlink and free it. + * 1. We extract the swp_entry_t to the stack, allowing + * zswap_writeback_entry() to pin the swap entry and + * then validate the zwap entry against that swap entry's + * tree using pointer value comparison. Only when that + * is successful can the entry be dereferenced. * - * Temporary failures, where the same entry should be tried - * again immediately, almost never happen for this shrinker. - * We don't do any trylocking; -ENOMEM comes closest, - * but that's extremely rare and doesn't happen spuriously - * either. Don't bother distinguishing this case. + * 2. Usually, objects are taken off the LRU for reclaim. In + * this case this isn't possible, because if reclaim fails + * for whatever reason, we have no means of knowing if the + * entry is alive to put it back on the LRU. * - * But since they do exist in theory, the entry cannot just - * be unlinked, or we could leak it. Hence, rotate. + * So rotate it before dropping the lock. If the entry is + * written back or invalidated, the free path will unlink + * it. For failures, rotation is the right thing as well. + * + * Temporary failures, where the same entry should be tried + * again immediately, almost never happen for this shrinker. + * We don't do any trylocking; -ENOMEM comes closest, + * but that's extremely rare and doesn't happen spuriously + * either. Don't bother distinguishing this case. */ list_move_tail(item, &l->list); From 0827a1fb143fae588cb6f5b9a97c405d6c2ddec9 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:06:00 +0000 Subject: [PATCH 147/435] mm/zswap: invalidate zswap entry when swap entry free During testing I found there are some times the zswap_writeback_entry() return -ENOMEM, which is not we expected: bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' @[-12]: 1563 @[0]: 277221 The reason is that __read_swap_cache_async() return NULL because swapcache_prepare() failed. The reason is that we won't invalidate zswap entry when swap entry freed to the per-cpu pool, these zswap entries are still on the zswap tree and lru list. This patch moves the invalidation ahead to when swap entry freed to the per-cpu pool, since there is no any benefit to leave trashy zswap entry on the tree and lru list. With this patch: bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' @[0]: 259744 Note: large folio can't have zswap entry for now, so don't bother to add zswap entry invalidation in the large folio swap free path. Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-2-99d4084260a0@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Nhat Pham Acked-by: Johannes Weiner Acked-by: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/zswap.h | 4 ++-- mm/swap_slots.c | 3 +++ mm/swapfile.c | 1 - mm/zswap.c | 5 +++-- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 91895ce1fdbc..341aea490070 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -29,7 +29,7 @@ struct zswap_lruvec_state { bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); -void zswap_invalidate(int type, pgoff_t offset); +void zswap_invalidate(swp_entry_t swp); int zswap_swapon(int type, unsigned long nr_pages); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); @@ -50,7 +50,7 @@ static inline bool zswap_load(struct folio *folio) return false; } -static inline void zswap_invalidate(int type, pgoff_t offset) {} +static inline void zswap_invalidate(swp_entry_t swp) {} static inline int zswap_swapon(int type, unsigned long nr_pages) { return 0; diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 0bec1f705f8e..90973ce7881d 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -273,6 +273,9 @@ void free_swap_slot(swp_entry_t entry) { struct swap_slots_cache *cache; + /* Large folio swap slot is not covered. */ + zswap_invalidate(entry); + cache = raw_cpu_ptr(&swp_slots); if (likely(use_swap_slot_cache && cache->slots_ret)) { spin_lock_irq(&cache->free_lock); diff --git a/mm/swapfile.c b/mm/swapfile.c index a8edaf4e5b8a..d1bd8d1e17bd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -744,7 +744,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_slot_free_notify = NULL; while (offset <= end) { arch_swap_invalidate_page(si->type, offset); - zswap_invalidate(si->type, offset); if (swap_slot_free_notify) swap_slot_free_notify(si->bdev, offset); offset++; diff --git a/mm/zswap.c b/mm/zswap.c index 35da20d3617f..ef41a7bd81f2 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1739,9 +1739,10 @@ bool zswap_load(struct folio *folio) return true; } -void zswap_invalidate(int type, pgoff_t offset) +void zswap_invalidate(swp_entry_t swp) { - struct zswap_tree *tree = swap_zswap_tree(swp_entry(type, offset)); + pgoff_t offset = swp_offset(swp); + struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry; spin_lock(&tree->lock); From b49547ade38a63ff39c9fbc53fb38622cb63854a Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:06:01 +0000 Subject: [PATCH 148/435] mm/zswap: stop lru list shrinking when encounter warm region When the shrinker encounter an existing folio in swap cache, it means we are shrinking into the warmer region. We should terminate shrinking if we're in the dynamic shrinker context. This patch add LRU_STOP to support this, to avoid overshrinking. Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-3-99d4084260a0@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Acked-by: Nhat Pham Reviewed-by: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 2 ++ mm/list_lru.c | 3 +++ mm/zswap.c | 4 +++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index f2882a820690..792b67ceb631 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -24,6 +24,8 @@ enum lru_status { LRU_SKIP, /* item cannot be locked, skip */ LRU_RETRY, /* item not freeable. May drop the lock internally, but has to return locked. */ + LRU_STOP, /* stop lru list walking. May drop the lock + internally, but has to return locked. */ }; struct list_lru_one { diff --git a/mm/list_lru.c b/mm/list_lru.c index 61f3b6b1134f..3fd64736bc45 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -243,6 +243,9 @@ restart: */ assert_spin_locked(&nlru->lock); goto restart; + case LRU_STOP: + assert_spin_locked(&nlru->lock); + goto out; default: BUG(); } diff --git a/mm/zswap.c b/mm/zswap.c index ef41a7bd81f2..f8a4ac389118 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1315,8 +1315,10 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o * into the warmer region. We should terminate shrinking (if we're in the dynamic * shrinker context). */ - if (writeback_result == -EEXIST && encountered_page_in_swapcache) + if (writeback_result == -EEXIST && encountered_page_in_swapcache) { + ret = LRU_STOP; *encountered_page_in_swapcache = true; + } } else { zswap_written_back_pages++; } From 3b631bd06550040b37cd2ebb00e2374404db0360 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:06:02 +0000 Subject: [PATCH 149/435] mm/zswap: remove duplicate_entry debug value cat /sys/kernel/debug/zswap/duplicate_entry 2086447 When testing, the duplicate_entry value is very high, but no warning message in the kernel log. From the comment of duplicate_entry "Duplicate store was encountered (rare)", it seems something goes wrong. Actually it's incremented in the beginning of zswap_store(), which found its zswap entry has already on the tree. And this is a normal case, since the folio could leave zswap entry on the tree after swapin, later it's dirtied and swapout/zswap_store again, found its original zswap entry. So duplicate_entry should be only incremented in the real bug case, which already have "WARN_ON(1)", it looks redundant to count bug case, so this patch just remove it. Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-4-99d4084260a0@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index f8a4ac389118..91df925359c8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -71,8 +71,6 @@ static u64 zswap_reject_compress_poor; static u64 zswap_reject_alloc_fail; /* Store failed because the entry metadata could not be allocated (rare) */ static u64 zswap_reject_kmemcache_fail; -/* Duplicate store was encountered (rare) */ -static u64 zswap_duplicate_entry; /* Shrinker work queue */ static struct workqueue_struct *shrink_wq; @@ -1568,10 +1566,8 @@ bool zswap_store(struct folio *folio) */ spin_lock(&tree->lock); entry = zswap_rb_search(&tree->rbroot, offset); - if (entry) { + if (entry) zswap_invalidate_entry(tree, entry); - zswap_duplicate_entry++; - } spin_unlock(&tree->lock); if (!zswap_enabled) @@ -1662,7 +1658,6 @@ insert_entry: */ while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { WARN_ON(1); - zswap_duplicate_entry++; zswap_invalidate_entry(tree, dupentry); } if (entry->length) { @@ -1823,8 +1818,6 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, &zswap_reject_compress_poor); debugfs_create_u64("written_back_pages", 0444, zswap_debugfs_root, &zswap_written_back_pages); - debugfs_create_u64("duplicate_entry", 0444, - zswap_debugfs_root, &zswap_duplicate_entry); debugfs_create_u64("pool_total_size", 0444, zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", 0444, From c2e2ba770200b379069011a1fdeeb41e4569c486 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:06:03 +0000 Subject: [PATCH 150/435] mm/zswap: only support zswap_exclusive_loads_enabled The !zswap_exclusive_loads_enabled mode will leave compressed copy in the zswap tree and lru list after the folio swapin. There are some disadvantages in this mode: 1. It's a waste of memory since there are two copies of data, one is folio, the other one is compressed data in zswap. And it's unlikely the compressed data is useful in the near future. 2. If that folio is dirtied, the compressed data must be not useful, but we don't know and don't invalidate the trashy memory in zswap. 3. It's not reclaimable from zswap shrinker since zswap_writeback_entry() will always return -EEXIST and terminate the shrinking process. On the other hand, the only downside of zswap_exclusive_loads_enabled is a little more cpu usage/latency when compression, and the same if the folio is removed from swapcache or dirtied. More explanation by Johannes on why we should consider exclusive load as the default for zswap: Caching "swapout work" is helpful when the system is thrashing. Then recently swapped in pages might get swapped out again very soon. It certainly makes sense with conventional swap, because keeping a clean copy on the disk saves IO work and doesn't cost any additional memory. But with zswap, it's different. It saves some compression work on a thrashing page. But the act of keeping compressed memory contributes to a higher rate of thrashing. And that can cause IO in other places like zswap writeback and file memory. And the A/B test results of the kernel build in tmpfs with limited memory can support this theory: !exclusive exclusive real 63.80 63.01 user 1063.83 1061.32 sys 290.31 266.15 workingset_refault_anon 2383084.40 1976397.40 workingset_refault_file 44134.00 45689.40 workingset_activate_anon 837878.00 728441.20 workingset_activate_file 4710.00 4085.20 workingset_restore_anon 732622.60 639428.40 workingset_restore_file 1007.00 926.80 workingset_nodereclaim 0.00 0.00 pgscan 14343003.40 12409570.20 pgscan_kswapd 0.00 0.00 pgscan_direct 14343003.40 12409570.20 pgscan_khugepaged 0.00 0.00 Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-5-99d4084260a0@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Yosry Ahmed Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Signed-off-by: Andrew Morton --- mm/Kconfig | 16 ---------------- mm/zswap.c | 14 +++----------- 2 files changed, 3 insertions(+), 27 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 88ba99d84ac3..2b267553f793 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -45,22 +45,6 @@ config ZSWAP_DEFAULT_ON The selection made here can be overridden by using the kernel command line 'zswap.enabled=' option. -config ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON - bool "Invalidate zswap entries when pages are loaded" - depends on ZSWAP - help - If selected, exclusive loads for zswap will be enabled at boot, - otherwise it will be disabled. - - If exclusive loads are enabled, when a page is loaded from zswap, - the zswap entry is invalidated at once, as opposed to leaving it - in zswap until the swap entry is freed. - - This avoids having two copies of the same page in memory - (compressed and uncompressed) after faulting in a page from zswap. - The cost is that if the page was never dirtied and needs to be - swapped out again, it will be re-compressed. - config ZSWAP_SHRINKER_DEFAULT_ON bool "Shrink the zswap pool on memory pressure" depends on ZSWAP diff --git a/mm/zswap.c b/mm/zswap.c index 91df925359c8..7a69142817cb 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -139,10 +139,6 @@ static bool zswap_non_same_filled_pages_enabled = true; module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, bool, 0644); -static bool zswap_exclusive_loads_enabled = IS_ENABLED( - CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON); -module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); - /* Number of zpools in zswap_pool (empirically determined for scalability) */ #define ZSWAP_NR_ZPOOLS 32 @@ -1723,16 +1719,12 @@ bool zswap_load(struct folio *folio) count_objcg_event(entry->objcg, ZSWPIN); spin_lock(&tree->lock); - if (zswap_exclusive_loads_enabled) { - zswap_invalidate_entry(tree, entry); - folio_mark_dirty(folio); - } else if (entry->length) { - zswap_lru_del(&entry->pool->list_lru, entry); - zswap_lru_add(&entry->pool->list_lru, entry); - } + zswap_invalidate_entry(tree, entry); zswap_entry_put(entry); spin_unlock(&tree->lock); + folio_mark_dirty(folio); + return true; } From a230c20e63efef3daab510979898b25a0e446b36 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:06:04 +0000 Subject: [PATCH 151/435] mm/zswap: zswap entry doesn't need refcount anymore Since we don't need to leave zswap entry on the zswap tree anymore, we should remove it from tree once we find it from the tree. Then after using it, we can directly free it, no concurrent path can find it from tree. Only the shrinker can see it from lru list, which will also double check under tree lock, so no race problem. So we don't need refcount in zswap entry anymore and don't need to take the spinlock for the second time to invalidate it. The side effect is that zswap_entry_free() maybe not happen in tree spinlock, but it's ok since nothing need to be protected by the lock. Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-6-99d4084260a0@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Nhat Pham Acked-by: Johannes Weiner Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 63 ++++++++++-------------------------------------------- 1 file changed, 11 insertions(+), 52 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 7a69142817cb..96664cdee207 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -193,12 +193,6 @@ struct zswap_pool { * * rbnode - links the entry into red-black tree for the appropriate swap type * swpentry - associated swap entry, the offset indexes into the red-black tree - * refcount - the number of outstanding reference to the entry. This is needed - * to protect against premature freeing of the entry by code - * concurrent calls to load, invalidate, and writeback. The lock - * for the zswap_tree structure that contains the entry must - * be held while changing the refcount. Since the lock must - * be held, there is no reason to also make refcount atomic. * length - the length in bytes of the compressed page data. Needed during * decompression. For a same value filled page length is 0, and both * pool and lru are invalid and must be ignored. @@ -211,7 +205,6 @@ struct zswap_pool { struct zswap_entry { struct rb_node rbnode; swp_entry_t swpentry; - int refcount; unsigned int length; struct zswap_pool *pool; union { @@ -222,11 +215,6 @@ struct zswap_entry { struct list_head lru; }; -/* - * The tree lock in the zswap_tree struct protects a few things: - * - the rbtree - * - the refcount field of each entry in the tree - */ struct zswap_tree { struct rb_root rbroot; spinlock_t lock; @@ -890,14 +878,10 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, return 0; } -static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) +static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) { - if (!RB_EMPTY_NODE(&entry->rbnode)) { - rb_erase(&entry->rbnode, root); - RB_CLEAR_NODE(&entry->rbnode); - return true; - } - return false; + rb_erase(&entry->rbnode, root); + RB_CLEAR_NODE(&entry->rbnode); } /********************************* @@ -911,7 +895,6 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); if (!entry) return NULL; - entry->refcount = 1; RB_CLEAR_NODE(&entry->rbnode); return entry; } @@ -954,33 +937,15 @@ static void zswap_entry_free(struct zswap_entry *entry) zswap_update_total_size(); } -/* caller must hold the tree lock */ -static void zswap_entry_get(struct zswap_entry *entry) -{ - WARN_ON_ONCE(!entry->refcount); - entry->refcount++; -} - -/* caller must hold the tree lock */ -static void zswap_entry_put(struct zswap_entry *entry) -{ - WARN_ON_ONCE(!entry->refcount); - if (--entry->refcount == 0) { - WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); - zswap_entry_free(entry); - } -} - /* - * If the entry is still valid in the tree, drop the initial ref and remove it - * from the tree. This function must be called with an additional ref held, - * otherwise it may race with another invalidation freeing the entry. + * The caller hold the tree lock and search the entry from the tree, + * so it must be on the tree, remove it from the tree and free it. */ static void zswap_invalidate_entry(struct zswap_tree *tree, struct zswap_entry *entry) { - if (zswap_rb_erase(&tree->rbroot, entry)) - zswap_entry_put(entry); + zswap_rb_erase(&tree->rbroot, entry); + zswap_entry_free(entry); } /********************************* @@ -1219,7 +1184,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, } /* Safe to deref entry after the entry is verified above. */ - zswap_entry_get(entry); + zswap_rb_erase(&tree->rbroot, entry); spin_unlock(&tree->lock); zswap_decompress(entry, &folio->page); @@ -1228,10 +1193,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, if (entry->objcg) count_objcg_event(entry->objcg, ZSWPWB); - spin_lock(&tree->lock); - zswap_invalidate_entry(tree, entry); - zswap_entry_put(entry); - spin_unlock(&tree->lock); + zswap_entry_free(entry); /* folio is up to date */ folio_mark_uptodate(folio); @@ -1703,7 +1665,7 @@ bool zswap_load(struct folio *folio) spin_unlock(&tree->lock); return false; } - zswap_entry_get(entry); + zswap_rb_erase(&tree->rbroot, entry); spin_unlock(&tree->lock); if (entry->length) @@ -1718,10 +1680,7 @@ bool zswap_load(struct folio *folio) if (entry->objcg) count_objcg_event(entry->objcg, ZSWPIN); - spin_lock(&tree->lock); - zswap_invalidate_entry(tree, entry); - zswap_entry_put(entry); - spin_unlock(&tree->lock); + zswap_entry_free(entry); folio_mark_dirty(folio); From 055267feaecc9c6c53f128aa50746f64c4ef5ca2 Mon Sep 17 00:00:00 2001 From: Paul Gofman Date: Tue, 6 Feb 2024 13:48:01 +0500 Subject: [PATCH 152/435] mm/migrate: preserve exact soft-dirty state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pte_mkdirty() sets both _PAGE_DIRTY and _PAGE_SOFT_DIRTY bits. The _PAGE_SOFT_DIRTY can get set even if it wasn't set on original page before migration. This makes non-soft-dirty pages soft-dirty just because of migration/compaction. Clear the _PAGE_SOFT_DIRTY flag if it wasn't set on original page. By definition of soft-dirty feature, there can be spurious soft-dirty pages because of kernel's internal activity such as VMA merging or migration/compaction. This patch is eliminating the spurious soft-dirty pages because of migration/compaction. Link: https://lkml.kernel.org/r/20240206084838.34560-1-usama.anjum@collabora.com Signed-off-by: Paul Gofman Signed-off-by: Muhammad Usama Anjum Acked-by: Andrei Vagin Cc: Michał Mirosław Signed-off-by: Andrew Morton --- mm/migrate.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index cc9f2bcd73b4..05d6ca437321 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -211,14 +211,17 @@ static bool remove_migration_pte(struct folio *folio, folio_get(folio); pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); old_pte = ptep_get(pvmw.pte); - if (pte_swp_soft_dirty(old_pte)) - pte = pte_mksoft_dirty(pte); entry = pte_to_swp_entry(old_pte); if (!is_migration_entry_young(entry)) pte = pte_mkold(pte); if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) pte = pte_mkdirty(pte); + if (pte_swp_soft_dirty(old_pte)) + pte = pte_mksoft_dirty(pte); + else + pte = pte_clear_soft_dirty(pte); + if (is_writable_migration_entry(entry)) pte = pte_mkwrite(pte, vma); else if (pte_swp_uffd_wp(old_pte)) From 831bc31a5e82dc189040111fa2ab2f81b8521a9b Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 6 Feb 2024 11:08:11 +0800 Subject: [PATCH 153/435] mm: hugetlb: improve the handling of hugetlb allocation failure for freed or in-use hugetlb alloc_and_dissolve_hugetlb_folio() preallocates a new hugetlb page before it takes hugetlb_lock. In 3 out of 4 cases the page is not really used and therefore the newly allocated page is just freed right away. This is wasteful and it might cause pre-mature failures in those cases. Address that by moving the allocation down to the only case (hugetlb page is really in the free pages pool). We need to drop hugetlb_lock to do so and therefore need to recheck the page state after regaining it. The patch is more of a cleanup than an actual fix to an existing problem. There are no known reports about pre-mature failures. Link: https://lkml.kernel.org/r/62890fd60b1ecd5bf1cdc476c973f60fe37aa0cb.1707181934.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: Michal Hocko Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/hugetlb.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 44f1e6366d04..3d651fc1dd66 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3029,21 +3029,9 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nid = folio_nid(old_folio); - struct folio *new_folio; + struct folio *new_folio = NULL; int ret = 0; - /* - * Before dissolving the folio, we need to allocate a new one for the - * pool to remain stable. Here, we allocate the folio and 'prep' it - * by doing everything but actually updating counters and adding to - * the pool. This simplifies and let us do most of the processing - * under the lock. - */ - new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL); - if (!new_folio) - return -ENOMEM; - __prep_new_hugetlb_folio(h, new_folio); - retry: spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(old_folio)) { @@ -3073,6 +3061,16 @@ retry: cond_resched(); goto retry; } else { + if (!new_folio) { + spin_unlock_irq(&hugetlb_lock); + new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, + NULL, NULL); + if (!new_folio) + return -ENOMEM; + __prep_new_hugetlb_folio(h, new_folio); + goto retry; + } + /* * Ok, old_folio is still a genuine free hugepage. Remove it from * the freelist and decrease the counters. These will be @@ -3100,9 +3098,11 @@ retry: free_new: spin_unlock_irq(&hugetlb_lock); - /* Folio has a zero ref count, but needs a ref to be freed */ - folio_ref_unfreeze(new_folio, 1); - update_and_free_hugetlb_folio(h, new_folio, false); + if (new_folio) { + /* Folio has a zero ref count, but needs a ref to be freed */ + folio_ref_unfreeze(new_folio, 1); + update_and_free_hugetlb_folio(h, new_folio, false); + } return ret; } From 6a080670d68834d22d47506033e8940b16da4322 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Feb 2024 18:51:58 -0800 Subject: [PATCH 154/435] mm/damon/sysfs: handle 'state' file inputs for every sampling interval if possible DAMON sysfs interface need to access kdamond-touching data for some of kdamond user commands. It uses ->after_aggregation() kdamond callback to safely access the data in the case. It had to use the aggregation interval callback because that was the only callback that users can access complete monitoring results. Since patch series "mm/damon: provide pseudo-moving sum based access rate", which starts from commit 78fbfb155d20 ("mm/damon/core: define and use a dedicated function for region access rate update"), DAMON provides good-to-use quality moitoring results for every sampling interval. It aims to help users who need to quickly retrieve the monitoring results. When the aggregation interval is set too long and therefore waiting for the aggregation interval can degrade user experience, or when the access pattern is expected to be significantly changed[1] could be such cases. However, because DAMON sysfs interface is still handling the commands per aggregation interval, the end user cannot get the benefit. Update DAMON sysfs interface to handle kdamond commands for every sampling interval if applicable. Specifically, all kdamond data accessing commands except 'commit' command are applicable. [1] https://lore.kernel.org/r/20240129121316.GA9706@cuiyangpei Link: https://lkml.kernel.org/r/20240206025158.203097-1-sj@kernel.org Signed-off-by: SeongJae Park Cc: xiongping1 Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.h | 2 ++ mm/damon/sysfs-schemes.c | 22 +++++++++------------- mm/damon/sysfs.c | 21 ++++++++++++++++++--- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 4c37a166eb81..ec0703e1e90b 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -49,6 +49,8 @@ int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx, bool total_bytes_only); +void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx); + bool damos_sysfs_regions_upd_done(void); int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index ae0f0b314f3a..f6c7f43f06cc 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -127,17 +127,17 @@ static const struct kobj_type damon_sysfs_scheme_region_ktype = { * * Once the tried regions update request is received, the request handling * start function (damon_sysfs_scheme_update_regions_start()) sets the status - * of all schemes as 'idle' again, and register ->before_damos_apply() and - * ->after_sampling() callbacks. + * of all schemes as 'idle' again, and register ->before_damos_apply() + * callback. * * Then, the first followup ->before_damos_apply() callback * (damon_sysfs_before_damos_apply()) sets the status 'started'. The first - * ->after_sampling() callback (damon_sysfs_after_sampling()) after the call - * is called only after the scheme is completely applied - * to the given snapshot. Hence the callback knows the situation by showing - * 'started' status, and sets the status as 'finished'. Then, - * damon_sysfs_before_damos_apply() understands the situation by showing the - * 'finished' status and do nothing. + * ->after_sampling() or ->after_aggregation() callback + * (damon_sysfs_cmd_request_callback()) after the call is called only after + * the scheme is completely applied to the given snapshot. Hence the callback + * knows the situation by showing 'started' status, and sets the status as + * 'finished'. Then, damon_sysfs_before_damos_apply() understands the + * situation by showing the 'finished' status and do nothing. * * If DAMOS is not applied to any region due to any reasons including the * access pattern, the watermarks, the quotas, and the filters, @@ -2122,7 +2122,7 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, * callback is registered, damon_sysfs_lock should be held to ensure the * regions directories exist. */ -static int damon_sysfs_after_sampling(struct damon_ctx *ctx) +void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx) { struct damon_sysfs_schemes *sysfs_schemes = damon_sysfs_schemes_for_damos_callback; @@ -2138,8 +2138,6 @@ static int damon_sysfs_after_sampling(struct damon_ctx *ctx) sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_FINISHED; } - - return 0; } /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ @@ -2212,7 +2210,6 @@ int damon_sysfs_schemes_update_regions_start( damos_tried_regions_init_upd_status(sysfs_schemes, ctx); damos_regions_upd_total_bytes_only = total_bytes_only; ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; - ctx->callback.after_sampling = damon_sysfs_after_sampling; return 0; } @@ -2241,7 +2238,6 @@ int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx) { damon_sysfs_schemes_for_damos_callback = NULL; ctx->callback.before_damos_apply = NULL; - ctx->callback.after_sampling = NULL; damon_sysfs_schemes_region_idx = 0; return 0; } diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 1f891e18b4ee..678de97fcc88 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1379,11 +1379,13 @@ static int damon_sysfs_commit_schemes_quota_goals( * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests. * @c: The DAMON context of the callback. * @active: Whether @c is not deactivated due to watermarks. + * @after_aggr: Whether this is called from after_aggregation() callback. * * This function is periodically called back from the kdamond thread for @c. * Then, it checks if there is a waiting DAMON sysfs request and handles it. */ -static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active) +static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active, + bool after_aggregation) { struct damon_sysfs_kdamond *kdamond; bool total_bytes_only = false; @@ -1401,6 +1403,8 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active) err = damon_sysfs_upd_schemes_stats(kdamond); break; case DAMON_SYSFS_CMD_COMMIT: + if (!after_aggregation) + goto out; err = damon_sysfs_commit_input(kdamond); break; case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS: @@ -1418,6 +1422,7 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active) goto keep_lock_out; } } else { + damos_sysfs_mark_finished_regions_updates(c); /* * Continue regions updating if DAMON is till * active and the update for all schemes is not @@ -1450,7 +1455,16 @@ static int damon_sysfs_after_wmarks_check(struct damon_ctx *c) * after_wmarks_check() is called back while the context is deactivated * by watermarks. */ - return damon_sysfs_cmd_request_callback(c, false); + return damon_sysfs_cmd_request_callback(c, false, false); +} + +static int damon_sysfs_after_sampling(struct damon_ctx *c) +{ + /* + * after_sampling() is called back only while the context is not + * deactivated by watermarks. + */ + return damon_sysfs_cmd_request_callback(c, true, false); } static int damon_sysfs_after_aggregation(struct damon_ctx *c) @@ -1459,7 +1473,7 @@ static int damon_sysfs_after_aggregation(struct damon_ctx *c) * after_aggregation() is called back only while the context is not * deactivated by watermarks. */ - return damon_sysfs_cmd_request_callback(c, true); + return damon_sysfs_cmd_request_callback(c, true, true); } static struct damon_ctx *damon_sysfs_build_ctx( @@ -1478,6 +1492,7 @@ static struct damon_ctx *damon_sysfs_build_ctx( } ctx->callback.after_wmarks_check = damon_sysfs_after_wmarks_check; + ctx->callback.after_sampling = damon_sysfs_after_sampling; ctx->callback.after_aggregation = damon_sysfs_after_aggregation; ctx->callback.before_terminate = damon_sysfs_before_terminate; return ctx; From 601e793a749d6798764cd2eb7f459452f971ee61 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Feb 2024 10:01:51 +0800 Subject: [PATCH 155/435] mm/demotion: print demotion targets Currently, when a demotion occurs, it will prioritize selecting a node from the preferred targets as the destination node for the demotion. If the preferred node does not meet the requirements, it will try from all the lower memory tier nodes until it finds a suitable demotion destination node or ultimately fails. However, the demotion target information isn't exposed to the users, especially the preferred target information, which relies on more factors. This makes it hard for users to understand the exact demotion behavior. Rather than having a new sysfs interface to expose this information, printing directly to kernel messages, just like the current page allocation fallback order does. A dmesg example with this patch is as follows: [ 0.704860] Demotion targets for Node 0: null [ 0.705456] Demotion targets for Node 1: null // node 2 is onlined [ 32.259775] Demotion targets for Node 0: perferred: 2, fallback: 2 [ 32.261290] Demotion targets for Node 1: perferred: 2, fallback: 2 [ 32.262726] Demotion targets for Node 2: null // node 3 is onlined [ 42.448809] Demotion targets for Node 0: perferred: 2, fallback: 2-3 [ 42.450704] Demotion targets for Node 1: perferred: 2, fallback: 2-3 [ 42.452556] Demotion targets for Node 2: perferred: 3, fallback: 3 [ 42.454136] Demotion targets for Node 3: null // node 4 is onlined [ 52.676833] Demotion targets for Node 0: perferred: 2, fallback: 2-4 [ 52.678735] Demotion targets for Node 1: perferred: 2, fallback: 2-4 [ 52.680493] Demotion targets for Node 2: perferred: 4, fallback: 3-4 [ 52.682154] Demotion targets for Node 3: null [ 52.683405] Demotion targets for Node 4: null // node 5 is onlined [ 62.931902] Demotion targets for Node 0: perferred: 2, fallback: 2-5 [ 62.938266] Demotion targets for Node 1: perferred: 5, fallback: 2-5 [ 62.943515] Demotion targets for Node 2: perferred: 4, fallback: 3-4 [ 62.947471] Demotion targets for Node 3: null [ 62.949908] Demotion targets for Node 4: null [ 62.952137] Demotion targets for Node 5: perferred: 3, fallback: 3-4 Regarding this requirement, we have previously discussed [1]. The initial proposal involved introducing a new sysfs interface. However, due to concerns about potential changes and compatibility issues with the interface in the future, a consensus was not reached with the community. Therefore, this time, we are directly printing out the information. [1] https://lore.kernel.org/all/d1d5add8-8f4a-4578-8bf0-2cbe79b09989@fujitsu.com/ Link: https://lkml.kernel.org/r/20240206020151.605516-1-lizhijian@fujitsu.com Signed-off-by: Li Zhijian Reviewed-by: "Huang, Ying" Cc: Aneesh Kumar K.V Signed-off-by: Andrew Morton --- mm/memory-tiers.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index ed20f96bf89d..0537664620e5 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -359,6 +359,26 @@ static void disable_all_demotion_targets(void) synchronize_rcu(); } +static void dump_demotion_targets(void) +{ + int node; + + for_each_node_state(node, N_MEMORY) { + struct memory_tier *memtier = __node_get_memory_tier(node); + nodemask_t preferred = node_demotion[node].preferred; + + if (!memtier) + continue; + + if (nodes_empty(preferred)) + pr_info("Demotion targets for Node %d: null\n", node); + else + pr_info("Demotion targets for Node %d: preferred: %*pbl, fallback: %*pbl\n", + node, nodemask_pr_args(&preferred), + nodemask_pr_args(&memtier->lower_tier_mask)); + } +} + /* * Find an automatic demotion target for all memory * nodes. Failing here is OK. It might just indicate @@ -443,7 +463,7 @@ static void establish_demotion_targets(void) * Now build the lower_tier mask for each node collecting node mask from * all memory tier below it. This allows us to fallback demotion page * allocation to a set of nodes that is closer the above selected - * perferred node. + * preferred node. */ lower_tier = node_states[N_MEMORY]; list_for_each_entry(memtier, &memory_tiers, list) { @@ -456,6 +476,8 @@ static void establish_demotion_targets(void) nodes_andnot(lower_tier, lower_tier, tier_nodes); memtier->lower_tier_mask = lower_tier; } + + dump_demotion_targets(); } #else From 80ba4caf8ba92bb8f852b5e09251274c11e4b647 Mon Sep 17 00:00:00 2001 From: Mark-PK Tsai Date: Sat, 7 Oct 2023 15:05:53 +0800 Subject: [PATCH 156/435] zram: use copy_page for full page copy Some architectures, such as arm, have implemented optimized copy_page for full page copying. Replace the full page memcpy with copy_page to take advantage of the optimization. Link: https://lkml.kernel.org/r/20231007070554.8657-1-mark-pk.tsai@mediatek.com Signed-off-by: Mark-PK Tsai Reviewed-by: Sergey Senozhatsky Cc: AngeloGioacchino Del Regno Cc: Jens Axboe Cc: Matthias Brugger Cc: Minchan Kim Cc: YJ Chiang Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6772e0c654fa..242a1fece18d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1337,7 +1337,7 @@ static int zram_read_from_zspool(struct zram *zram, struct page *page, src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); if (size == PAGE_SIZE) { dst = kmap_local_page(page); - memcpy(dst, src, PAGE_SIZE); + copy_page(dst, src); kunmap_local(dst); ret = 0; } else { From 0c32c9f7a58e7736b27f9d6766e9f21d34a26eff Mon Sep 17 00:00:00 2001 From: John Groves Date: Mon, 5 Feb 2024 18:57:37 -0600 Subject: [PATCH 157/435] memremap.h: correct an error in a comment It tried to send me off to memory_hotplug.h for an enum that is a few lines above... Link: https://lkml.kernel.org/r/dba0f5f01162d6fa16e4da2a9fede7f97080e92d.1707179960.git.john@groves.net Signed-off-by: John Groves Reviewed-by: Dan Williams Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/memremap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 9837f3e6fb95..3f7143ade32c 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -109,7 +109,7 @@ struct dev_pagemap_ops { * @altmap: pre-allocated/reserved memory for vmemmap allocations * @ref: reference count that pins the devm_memremap_pages() mapping * @done: completion for @ref - * @type: memory type: see MEMORY_* in memory_hotplug.h + * @type: memory type: see MEMORY_* above in memremap.h * @flags: PGMAP_* flags to specify defailed behavior * @vmemmap_shift: structural definition of how the vmemmap page metadata * is populated, specifically the metadata page order. From faf4977ef0842538f85edf9d55be1209d8185e33 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 7 Feb 2024 12:31:27 -0800 Subject: [PATCH 158/435] selftests/damon/_damon_sysfs: support DAMOS quota Patch series "selftests/damon: add more tests for core functionalities and corner cases". Continue DAMON selftests' test coverage improvement works with a trivial improvement of the test code itself. The sequence of the patches in patchset is as follows. The first five patches add two DAMON core functionalities tests. Those begins with three patches (patches 1-3) that update the test-purpose DAMON sysfs interface wrapper to support DAMOS quota, stats, and apply interval features, respectively. The fourth patch implements and adds a selftest for DAMOS quota feature, using the DAMON sysfs interface wrapper's newly added support of the quota and the stats feature. The fifth patch further implements and adds a selftest for DAMOS apply interval using the DAMON sysfs interface wrapper's newly added support of the apply interval and the stats feature. Two patches (patches 6 and 7) for implementing and adding two corner cases handling selftests follow. Those try to avoid two previously fixed bugs from recurring. Finally, a patch for making DAMON debugfs selftests dependency checker to use /proc/mounts instead of the hard-coded mount point assumption follows. This patch (of 8): Update the test-purpose DAMON sysfs control Python module to support DAMOS quota. Link: https://lkml.kernel.org/r/20240207203134.69976-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240207203134.69976-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 42 +++++++++++++++---- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index e98cf4b6a4b7..b4f6e385c564 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -70,18 +70,48 @@ class DamosAccessPattern: if err != None: return err +class DamosQuota: + sz = None # size quota, in bytes + ms = None # time quota + reset_interval_ms = None # quota reset interval + scheme = None # owner scheme + + def __init__(self, sz=0, ms=0, reset_interval_ms=0): + self.sz = sz + self.ms = ms + self.reset_interval_ms = reset_interval_ms + + def sysfs_dir(self): + return os.path.join(self.scheme.sysfs_dir(), 'quotas') + + def stage(self): + err = write_file(os.path.join(self.sysfs_dir(), 'bytes'), self.sz) + if err != None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'ms'), self.ms) + if err != None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'reset_interval_ms'), + self.reset_interval_ms) + if err != None: + return err + class Damos: action = None access_pattern = None - # todo: Support quotas, watermarks, stats, tried_regions + quota = None + # todo: Support watermarks, stats, tried_regions idx = None context = None tried_bytes = None - def __init__(self, action='stat', access_pattern=DamosAccessPattern()): + def __init__(self, action='stat', access_pattern=DamosAccessPattern(), + quota=DamosQuota()): self.action = action self.access_pattern = access_pattern self.access_pattern.scheme = self + self.quota = quota + self.quota.scheme = self def sysfs_dir(self): return os.path.join( @@ -94,13 +124,7 @@ class Damos: err = self.access_pattern.stage() if err != None: return err - - # disable quotas - err = write_file(os.path.join(self.sysfs_dir(), 'quotas', 'ms'), '0') - if err != None: - return err - err = write_file( - os.path.join(self.sysfs_dir(), 'quotas', 'bytes'), '0') + err = self.quota.stage() if err != None: return err From a0f87454c0e397431d1eb0bb1d9e6c0fab0fbe2b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 7 Feb 2024 12:31:28 -0800 Subject: [PATCH 159/435] selftests/damon/_damon_sysfs: support DAMOS stats Update the test-purpose DAMON sysfs control Python module to support DAMOS stats. Link: https://lkml.kernel.org/r/20240207203134.69976-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index b4f6e385c564..a75244451684 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -96,6 +96,20 @@ class DamosQuota: if err != None: return err +class DamosStats: + nr_tried = None + sz_tried = None + nr_applied = None + sz_applied = None + qt_exceeds = None + + def __init__(self, nr_tried, sz_tried, nr_applied, sz_applied, qt_exceeds): + self.nr_tried = nr_tried + self.sz_tried = sz_tried + self.nr_applied = nr_applied + self.sz_applied = sz_applied + self.qt_exceeds = qt_exceeds + class Damos: action = None access_pattern = None @@ -104,6 +118,7 @@ class Damos: idx = None context = None tried_bytes = None + stats = None def __init__(self, action='stat', access_pattern=DamosAccessPattern(), quota=DamosQuota()): @@ -322,6 +337,23 @@ class Kdamond: return err scheme.tried_bytes = int(content) + def update_schemes_stats(self): + err = write_file(os.path.join(self.sysfs_dir(), 'state'), + 'update_schemes_stats') + if err != None: + return err + for context in self.contexts: + for scheme in context.schemes: + stat_values = [] + for stat in ['nr_tried', 'sz_tried', 'nr_applied', + 'sz_applied', 'qt_exceeds']: + content, err = read_file( + os.path.join(scheme.sysfs_dir(), 'stats', stat)) + if err != None: + return err + stat_values.append(int(content)) + scheme.stats = DamosStats(*stat_values) + class Kdamonds: kdamonds = [] From a8622625bf645bc095cb4676d36ff12b53d5d0a4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 7 Feb 2024 12:31:29 -0800 Subject: [PATCH 160/435] selftests/damon/_damon_sysfs: support DAMOS apply interval Update the test-purpose DAMON sysfs control Python module to support DAMOS apply interval. Link: https://lkml.kernel.org/r/20240207203134.69976-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index a75244451684..d23d7398a27a 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -114,6 +114,7 @@ class Damos: action = None access_pattern = None quota = None + apply_interval_us = None # todo: Support watermarks, stats, tried_regions idx = None context = None @@ -121,12 +122,13 @@ class Damos: stats = None def __init__(self, action='stat', access_pattern=DamosAccessPattern(), - quota=DamosQuota()): + quota=DamosQuota(), apply_interval_us=0): self.action = action self.access_pattern = access_pattern self.access_pattern.scheme = self self.quota = quota self.quota.scheme = self + self.apply_interval_us = apply_interval_us def sysfs_dir(self): return os.path.join( @@ -139,6 +141,11 @@ class Damos: err = self.access_pattern.stage() if err != None: return err + err = write_file(os.path.join(self.sysfs_dir(), 'apply_interval_us'), + '%d' % self.apply_interval_us) + if err != None: + return err + err = self.quota.stage() if err != None: return err From 51f58c9da14b1680e352f28a16365803c125af6d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 7 Feb 2024 12:31:30 -0800 Subject: [PATCH 161/435] selftests/damon: add a test for DAMOS quota Add a selftest for verifying the DAMOS quota feature. The test is very similar to sysfs_update_schemes_tried_regions_wss_estimation.py. It starts an artificial workload of 20 MiB working set, run DAMON to find the working set size, but with 1 MiB/100 ms size quota. Then, it collect the DAMON-found working set size every 100 ms and check if the quota was always applied as expected. For the confirmation, the tests shows the stat-applied region size and the qt_exceeds stat. Link: https://lkml.kernel.org/r/20240207203134.69976-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 1 + tools/testing/selftests/damon/damos_quota.py | 67 ++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 tools/testing/selftests/damon/damos_quota.py diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 8a1cc2bf1864..9c3783f1a39d 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -12,6 +12,7 @@ TEST_PROGS += debugfs_rm_non_contexts.sh TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py +TEST_PROGS += damos_quota.py TEST_PROGS += reclaim.sh lru_sort.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/damos_quota.py b/tools/testing/selftests/damon/damos_quota.py new file mode 100644 index 000000000000..7d4c6bb2e3cd --- /dev/null +++ b/tools/testing/selftests/damon/damos_quota.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +import time + +import _damon_sysfs + +def main(): + # access two 10 MiB memory regions, 2 second per each + sz_region = 10 * 1024 * 1024 + proc = subprocess.Popen(['./access_memory', '2', '%d' % sz_region, '2000']) + + # Set quota up to 1 MiB per 100 ms + sz_quota = 1024 * 1024 # 1 MiB + quota_reset_interval = 100 # 100 ms + kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond( + contexts=[_damon_sysfs.DamonCtx( + ops='vaddr', + targets=[_damon_sysfs.DamonTarget(pid=proc.pid)], + schemes=[_damon_sysfs.Damos( + access_pattern=_damon_sysfs.DamosAccessPattern( + # >= 25% access rate, >= 200ms age + nr_accesses=[5, 20], age=[2, 2**64 - 1]), + quota=_damon_sysfs.DamosQuota( + sz=sz_quota, reset_interval_ms=quota_reset_interval) + )] # schemes + )] # contexts + )]) # kdamonds + + err = kdamonds.start() + if err != None: + print('kdamond start failed: %s' % err) + exit(1) + + wss_collected = [] + nr_quota_exceeds = 0 + while proc.poll() == None: + time.sleep(0.1) + err = kdamonds.kdamonds[0].update_schemes_tried_bytes() + if err != None: + print('tried bytes update failed: %s' % err) + exit(1) + err = kdamonds.kdamonds[0].update_schemes_stats() + if err != None: + print('stats update failed: %s' % err) + exit(1) + + scheme = kdamonds.kdamonds[0].contexts[0].schemes[0] + wss_collected.append(scheme.tried_bytes) + nr_quota_exceeds = scheme.stats.qt_exceeds + + wss_collected.sort() + for wss in wss_collected: + if wss > sz_quota: + print('quota is not kept: %s > %s' % (wss, sz_quota)) + print('collected samples are as below') + print('\n'.join(['%d' % wss for wss in wss_collected])) + exit(1) + + if nr_quota_exceeds < len(wss_collected): + print('quota is not always exceeded: %d > %d' % + (len(wss_collected), nr_quota_exceeds)) + exit(1) + +if __name__ == '__main__': + main() From ce7a2834659fd18eda042574696fe4191f069e31 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 7 Feb 2024 12:31:31 -0800 Subject: [PATCH 162/435] selftests/damon: add a test for DAMOS apply intervals Add a selftest for DAMOS apply intervals. It runs two schemes having different apply interval agains an artificial memory access workload, and check if the scheme with smaller apply interval was applied more frequently. Link: https://lkml.kernel.org/r/20240207203134.69976-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 2 +- .../selftests/damon/damos_apply_interval.py | 67 +++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/damon/damos_apply_interval.py diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 9c3783f1a39d..b545fedafb3b 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -12,7 +12,7 @@ TEST_PROGS += debugfs_rm_non_contexts.sh TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py -TEST_PROGS += damos_quota.py +TEST_PROGS += damos_quota.py damos_apply_interval.py TEST_PROGS += reclaim.sh lru_sort.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/damos_apply_interval.py b/tools/testing/selftests/damon/damos_apply_interval.py new file mode 100644 index 000000000000..f04d43702481 --- /dev/null +++ b/tools/testing/selftests/damon/damos_apply_interval.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +import time + +import _damon_sysfs + +def main(): + # access two 10 MiB memory regions, 2 second per each + sz_region = 10 * 1024 * 1024 + proc = subprocess.Popen(['./access_memory', '2', '%d' % sz_region, '2000']) + + # Set quota up to 1 MiB per 100 ms + kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond( + contexts=[_damon_sysfs.DamonCtx( + ops='vaddr', + targets=[_damon_sysfs.DamonTarget(pid=proc.pid)], + schemes=[ + _damon_sysfs.Damos( + access_pattern=_damon_sysfs.DamosAccessPattern( + # >= 25% access rate, >= 200ms age + nr_accesses=[5, 20], age=[2, 2**64 - 1]), + # aggregation interval (100 ms) is used + apply_interval_us=0), + # use 10ms apply interval + _damon_sysfs.Damos( + access_pattern=_damon_sysfs.DamosAccessPattern( + # >= 25% access rate, >= 200ms age + nr_accesses=[5, 20], age=[2, 2**64 - 1]), + # explicitly set 10 ms apply interval + apply_interval_us=10 * 1000) + ] # schemes + )] # contexts + )]) # kdamonds + + err = kdamonds.start() + if err != None: + print('kdamond start failed: %s' % err) + exit(1) + + wss_collected = [] + nr_quota_exceeds = 0 + while proc.poll() == None: + time.sleep(0.1) + err = kdamonds.kdamonds[0].update_schemes_stats() + if err != None: + print('stats update failed: %s' % err) + exit(1) + schemes = kdamonds.kdamonds[0].contexts[0].schemes + nr_tried_stats = [s.stats.nr_tried for s in schemes] + if nr_tried_stats[0] == 0 or nr_tried_stats[1] == 0: + print('scheme(s) are not tried') + exit(1) + + # Because the second scheme was having the apply interval that is ten times + # lower than that of the first scheme, the second scheme should be tried + # about ten times more frequently than the first scheme. For possible + # timing errors, check if it was at least nine times more freuqnetly tried. + ratio = nr_tried_stats[1] / nr_tried_stats[0] + if ratio < 9: + print('%d / %d = %f (< 9)' % + (nr_tried_stats[1], nr_tried_stats[0], ratio)) + exit(1) + +if __name__ == '__main__': + main() From e6255a297628ddd7ead9cf8fb2ce11168595c89d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 7 Feb 2024 12:31:32 -0800 Subject: [PATCH 163/435] selftests/damon: add a test for a race between target_ids_read() and dbgfs_before_terminate() commit 34796417964b ("mm/damon/dbgfs: protect targets destructions with kdamond_lock") fixed a race of DAMON debugfs interface. Specifically, the race was happening between target_ids_read() and dbgfs_before_terminate(). Add a test for the issue to prevent the problem from accidentally recurring. Link: https://lkml.kernel.org/r/20240207203134.69976-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/.gitignore | 1 + tools/testing/selftests/damon/Makefile | 2 + ...fs_target_ids_read_before_terminate_race.c | 80 +++++++++++++++++++ ...s_target_ids_read_before_terminate_race.sh | 14 ++++ 4 files changed, 97 insertions(+) create mode 100644 tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c create mode 100644 tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh diff --git a/tools/testing/selftests/damon/.gitignore b/tools/testing/selftests/damon/.gitignore index c6c2965a6607..7d6c6e062be7 100644 --- a/tools/testing/selftests/damon/.gitignore +++ b/tools/testing/selftests/damon/.gitignore @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only huge_count_read_write +debugfs_target_ids_read_before_terminate_race diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index b545fedafb3b..8a3a8df003db 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -2,6 +2,7 @@ # Makefile for damon selftests TEST_GEN_FILES += huge_count_read_write +TEST_GEN_FILES += debugfs_target_ids_read_before_terminate_race TEST_GEN_FILES += access_memory TEST_FILES = _chk_dependency.sh _debugfs_common.sh @@ -9,6 +10,7 @@ TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += debugfs_rm_non_contexts.sh +TEST_PROGS += debugfs_target_ids_read_before_terminate_race.sh TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py diff --git a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c new file mode 100644 index 000000000000..b06f52a8ce2d --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author: SeongJae Park + */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DBGFS_MONITOR_ON "/sys/kernel/debug/damon/monitor_on_DEPRECATED" +#define DBGFS_TARGET_IDS "/sys/kernel/debug/damon/target_ids" + +static void turn_damon_on_exit(void) +{ + int target_ids_fd = open(DBGFS_TARGET_IDS, O_RDWR); + int monitor_on_fd = open(DBGFS_MONITOR_ON, O_RDWR); + char pid_str[128]; + + snprintf(pid_str, sizeof(pid_str), "%d", getpid()); + write(target_ids_fd, pid_str, sizeof(pid_str)); + write(monitor_on_fd, "on\n", 3); + close(target_ids_fd); + close(monitor_on_fd); + usleep(1000); + exit(0); +} + +static void try_race(void) +{ + int target_ids_fd = open(DBGFS_TARGET_IDS, O_RDWR); + int pid = fork(); + int buf[256]; + + if (pid < 0) { + fprintf(stderr, "fork() failed\n"); + exit(1); + } + if (pid == 0) + turn_damon_on_exit(); + while (true) { + int status; + + read(target_ids_fd, buf, sizeof(buf)); + if (waitpid(-1, &status, WNOHANG) == pid) + break; + } + close(target_ids_fd); +} + +static inline uint64_t ts_to_ms(struct timespec *ts) +{ + return (uint64_t)ts->tv_sec * 1000 + (uint64_t)ts->tv_nsec / 1000000; +} + +int main(int argc, char *argv[]) +{ + struct timespec start_time, now; + int runtime_ms; + + if (argc != 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + exit(1); + } + runtime_ms = atoi(argv[1]); + clock_gettime(CLOCK_MONOTONIC, &start_time); + while (true) { + try_race(); + clock_gettime(CLOCK_MONOTONIC, &now); + if (ts_to_ms(&now) - ts_to_ms(&start_time) > runtime_ms) + break; + } + return 0; +} diff --git a/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh new file mode 100644 index 000000000000..fc793c4c9aea --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_target_ids_read_before_terminate_race.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +dmesg -C + +./debugfs_target_ids_read_before_terminate_race 5000 + +if dmesg | grep -q dbgfs_target_ids_read +then + dmesg + exit 1 +else + exit 0 +fi From f08db42b1c3a6c62ab13ac466f5d8d72d3dd79c5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 7 Feb 2024 12:31:33 -0800 Subject: [PATCH 164/435] selftests/damon: add a test for the pid leak of dbgfs_target_ids_write() Commit ebb3f994dd92 ("mm/damon/dbgfs: fix 'struct pid' leaks in 'dbgfs_target_ids_write()'") fixes a pid leak bug in DAMON debugfs interface, namely dbgfs_target_ids_write() function. Add a selftest for the issue to prevent the problem from mistakenly recurring. Link: https://lkml.kernel.org/r/20240207203134.69976-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/.gitignore | 1 + tools/testing/selftests/damon/Makefile | 2 + .../damon/debugfs_target_ids_pid_leak.c | 68 +++++++++++++++++++ .../damon/debugfs_target_ids_pid_leak.sh | 22 ++++++ 4 files changed, 93 insertions(+) create mode 100644 tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c create mode 100644 tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh diff --git a/tools/testing/selftests/damon/.gitignore b/tools/testing/selftests/damon/.gitignore index 7d6c6e062be7..d861701f0327 100644 --- a/tools/testing/selftests/damon/.gitignore +++ b/tools/testing/selftests/damon/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only huge_count_read_write debugfs_target_ids_read_before_terminate_race +debugfs_target_ids_pid_leak diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 8a3a8df003db..789d6949c247 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -3,6 +3,7 @@ TEST_GEN_FILES += huge_count_read_write TEST_GEN_FILES += debugfs_target_ids_read_before_terminate_race +TEST_GEN_FILES += debugfs_target_ids_pid_leak TEST_GEN_FILES += access_memory TEST_FILES = _chk_dependency.sh _debugfs_common.sh @@ -11,6 +12,7 @@ TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += debugfs_rm_non_contexts.sh TEST_PROGS += debugfs_target_ids_read_before_terminate_race.sh +TEST_PROGS += debugfs_target_ids_pid_leak.sh TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py diff --git a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c new file mode 100644 index 000000000000..0cc2eef7d142 --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author: SeongJae Park + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DBGFS_TARGET_IDS "/sys/kernel/debug/damon/target_ids" + +static void write_targetid_exit(void) +{ + int target_ids_fd = open(DBGFS_TARGET_IDS, O_RDWR); + char pid_str[128]; + + snprintf(pid_str, sizeof(pid_str), "%d", getpid()); + write(target_ids_fd, pid_str, sizeof(pid_str)); + close(target_ids_fd); + exit(0); +} + +unsigned long msec_timestamp(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000UL + tv.tv_usec / 1000; +} + +int main(int argc, char *argv[]) +{ + unsigned long start_ms; + int time_to_run, nr_forks = 0; + + if (argc != 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + exit(1); + } + time_to_run = atoi(argv[1]); + + start_ms = msec_timestamp(); + while (true) { + int pid = fork(); + + if (pid < 0) { + fprintf(stderr, "fork() failed\n"); + exit(1); + } + if (pid == 0) + write_targetid_exit(); + wait(NULL); + nr_forks++; + + if (msec_timestamp() - start_ms > time_to_run) + break; + } + printf("%d\n", nr_forks); + return 0; +} diff --git a/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh new file mode 100644 index 000000000000..31fe33c2b032 --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_target_ids_pid_leak.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +before=$(grep "^pid " /proc/slabinfo | awk '{print $2}') + +nr_leaks=$(./debugfs_target_ids_pid_leak 1000) +expected_after_max=$((before + nr_leaks / 2)) + +after=$(grep "^pid " /proc/slabinfo | awk '{print $2}') + +echo > /sys/kernel/debug/damon/target_ids + +echo "tried $nr_leaks pid leak" +echo "number of active pid slabs: $before -> $after" +echo "(up to $expected_after_max expected)" +if [ $after -gt $expected_after_max ] +then + echo "maybe pids are leaking" + exit 1 +else + exit 0 +fi From 501e3dc505f7c8ce8da0532db81e99fae6688074 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 7 Feb 2024 12:31:34 -0800 Subject: [PATCH 165/435] selftests/damon/_chk_dependency: get debugfs mount point from /proc/mounts DAMON debugfs selftests dependency checker assumes debugfs would be mounted at /sys/kernel/debug. That would be ok for many cases, but some systems might mounted the file system on some different places. Parse the real mount point using /proc/mounts file. Link: https://lkml.kernel.org/r/20240207203134.69976-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_chk_dependency.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/damon/_chk_dependency.sh b/tools/testing/selftests/damon/_chk_dependency.sh index 350f8c2b071d..dda3a87dc00a 100644 --- a/tools/testing/selftests/damon/_chk_dependency.sh +++ b/tools/testing/selftests/damon/_chk_dependency.sh @@ -4,7 +4,14 @@ # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 -DBGFS=/sys/kernel/debug/damon +DBGFS=$(grep debugfs /proc/mounts --max-count 1 | awk '{print $2}') +if [ "$DBGFS" = "" ] +then + echo "debugfs not mounted" + exit $ksft_skip +fi + +DBGFS+="/damon" if [ $EUID -ne 0 ]; then From b9ad003af13a1fe34319da6c2082038bce833831 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 6 Feb 2024 10:27:31 +0530 Subject: [PATCH 166/435] mm/cma: add sysfs file 'release_pages_success' This adds the following new sysfs file tracking the number of successfully released pages from a given CMA heap area. This file will be available via CONFIG_CMA_SYSFS and help in determining active CMA pages available on the CMA heap area. This adds a new 'nr_pages_released' (CONFIG_CMA_SYSFS) into 'struct cma' which gets updated during cma_release(). /sys/kernel/mm/cma//release_pages_success After this change, an user will be able to find active CMA pages available in a given CMA heap area via the following method. Active pages = alloc_pages_success - release_pages_success That's valuable information for both software designers, and system admins as it allows them to tune the number of CMA pages available in the system. This increases user visibility for allocated CMA area and its utilization. Link: https://lkml.kernel.org/r/20240206045731.472759-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-cma | 6 ++++++ mm/cma.c | 1 + mm/cma.h | 5 +++++ mm/cma_sysfs.c | 15 +++++++++++++++ 4 files changed, 27 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cma b/Documentation/ABI/testing/sysfs-kernel-mm-cma index 02b2bb60c296..dfd755201142 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-cma +++ b/Documentation/ABI/testing/sysfs-kernel-mm-cma @@ -23,3 +23,9 @@ Date: Feb 2021 Contact: Minchan Kim Description: the number of pages CMA API failed to allocate + +What: /sys/kernel/mm/cma//release_pages_success +Date: Feb 2024 +Contact: Anshuman Khandual +Description: + the number of pages CMA API succeeded to release diff --git a/mm/cma.c b/mm/cma.c index 4902bbfe24f1..01f5a8f71ddf 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -562,6 +562,7 @@ bool cma_release(struct cma *cma, const struct page *pages, free_contig_range(pfn, count); cma_clear_bitmap(cma, pfn, count); + cma_sysfs_account_release_pages(cma, count); trace_cma_release(cma->name, pfn, pages, count); return true; diff --git a/mm/cma.h b/mm/cma.h index 88a0595670b7..ad61cc6dd439 100644 --- a/mm/cma.h +++ b/mm/cma.h @@ -27,6 +27,8 @@ struct cma { atomic64_t nr_pages_succeeded; /* the number of CMA page allocation failures */ atomic64_t nr_pages_failed; + /* the number of CMA page released */ + atomic64_t nr_pages_released; /* kobject requires dynamic object */ struct cma_kobject *cma_kobj; #endif @@ -44,10 +46,13 @@ static inline unsigned long cma_bitmap_maxno(struct cma *cma) #ifdef CONFIG_CMA_SYSFS void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages); void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages); +void cma_sysfs_account_release_pages(struct cma *cma, unsigned long nr_pages); #else static inline void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages) {}; static inline void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages) {}; +static inline void cma_sysfs_account_release_pages(struct cma *cma, + unsigned long nr_pages) {}; #endif #endif diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c index 56347d15b7e8..f50db3973171 100644 --- a/mm/cma_sysfs.c +++ b/mm/cma_sysfs.c @@ -24,6 +24,11 @@ void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages) atomic64_add(nr_pages, &cma->nr_pages_failed); } +void cma_sysfs_account_release_pages(struct cma *cma, unsigned long nr_pages) +{ + atomic64_add(nr_pages, &cma->nr_pages_released); +} + static inline struct cma *cma_from_kobj(struct kobject *kobj) { return container_of(kobj, struct cma_kobject, kobj)->cma; @@ -48,6 +53,15 @@ static ssize_t alloc_pages_fail_show(struct kobject *kobj, } CMA_ATTR_RO(alloc_pages_fail); +static ssize_t release_pages_success_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct cma *cma = cma_from_kobj(kobj); + + return sysfs_emit(buf, "%llu\n", atomic64_read(&cma->nr_pages_released)); +} +CMA_ATTR_RO(release_pages_success); + static void cma_kobj_release(struct kobject *kobj) { struct cma *cma = cma_from_kobj(kobj); @@ -60,6 +74,7 @@ static void cma_kobj_release(struct kobject *kobj) static struct attribute *cma_attrs[] = { &alloc_pages_success_attr.attr, &alloc_pages_fail_attr.attr, + &release_pages_success_attr.attr, NULL, }; ATTRIBUTE_GROUPS(cma); From 3e40b3f41723c67c989bec03e704268ac432b759 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 8 Feb 2024 09:36:07 +0800 Subject: [PATCH 167/435] mm: compaction: refactor compact_node() Refactor compact_node() to handle both proactive and synchronous compact memory, which cleanups code a bit. Link: https://lkml.kernel.org/r/20240208013607.1731817-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Baolin Wang Signed-off-by: Andrew Morton --- mm/compaction.c | 65 ++++++++++++++++--------------------------------- 1 file changed, 21 insertions(+), 44 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 0ea5519e2833..12709d0bcf40 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2796,25 +2796,27 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, } /* - * Compact all zones within a node till each zone's fragmentation score - * reaches within proactive compaction thresholds (as determined by the - * proactiveness tunable). + * compact_node() - compact all zones within a node + * @pgdat: The node page data + * @proactive: Whether the compaction is proactive * - * It is possible that the function returns before reaching score targets - * due to various back-off conditions, such as, contention on per-node or - * per-zone locks. + * For proactive compaction, compact till each zone's fragmentation score + * reaches within proactive compaction thresholds (as determined by the + * proactiveness tunable), it is possible that the function returns before + * reaching score targets due to various back-off conditions, such as, + * contention on per-node or per-zone locks. */ -static void proactive_compact_node(pg_data_t *pgdat) +static void compact_node(pg_data_t *pgdat, bool proactive) { int zoneid; struct zone *zone; struct compact_control cc = { .order = -1, - .mode = MIGRATE_SYNC_LIGHT, + .mode = proactive ? MIGRATE_SYNC_LIGHT : MIGRATE_SYNC, .ignore_skip_hint = true, .whole_zone = true, .gfp_mask = GFP_KERNEL, - .proactive_compaction = true, + .proactive_compaction = proactive, }; for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { @@ -2826,41 +2828,16 @@ static void proactive_compact_node(pg_data_t *pgdat) compact_zone(&cc, NULL); - count_compact_events(KCOMPACTD_MIGRATE_SCANNED, - cc.total_migrate_scanned); - count_compact_events(KCOMPACTD_FREE_SCANNED, - cc.total_free_scanned); + if (proactive) { + count_compact_events(KCOMPACTD_MIGRATE_SCANNED, + cc.total_migrate_scanned); + count_compact_events(KCOMPACTD_FREE_SCANNED, + cc.total_free_scanned); + } } } -/* Compact all zones within a node */ -static void compact_node(int nid) -{ - pg_data_t *pgdat = NODE_DATA(nid); - int zoneid; - struct zone *zone; - struct compact_control cc = { - .order = -1, - .mode = MIGRATE_SYNC, - .ignore_skip_hint = true, - .whole_zone = true, - .gfp_mask = GFP_KERNEL, - }; - - - for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { - - zone = &pgdat->node_zones[zoneid]; - if (!populated_zone(zone)) - continue; - - cc.zone = zone; - - compact_zone(&cc, NULL); - } -} - -/* Compact all nodes in the system */ +/* Compact all zones of all nodes in the system */ static void compact_nodes(void) { int nid; @@ -2869,7 +2846,7 @@ static void compact_nodes(void) lru_add_drain_all(); for_each_online_node(nid) - compact_node(nid); + compact_node(NODE_DATA(nid), false); } static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, @@ -2931,7 +2908,7 @@ static ssize_t compact_store(struct device *dev, /* Flush pending updates to the LRU lists */ lru_add_drain_all(); - compact_node(nid); + compact_node(NODE_DATA(nid), false); } return count; @@ -3140,7 +3117,7 @@ static int kcompactd(void *p) unsigned int prev_score, score; prev_score = fragmentation_score_node(pgdat); - proactive_compact_node(pgdat); + compact_node(pgdat, true); score = fragmentation_score_node(pgdat); /* * Defer proactive compaction if the fragmentation From 9c1490d911f8a79b24da0bc1742588fce0e5db41 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 9 Feb 2024 14:30:03 +0000 Subject: [PATCH 168/435] selftests/mm: log skipped compaction test as a skip Patch series "selftests/mm: Output cleanups for the compaction test". A couple of small updates for the check_compaction selftest which make it play more nicely with test automation systems. This patch (of 2): When the compaction test is run it checks to make sure that prerequistives the test requires are available and skips the tests if not. When this happens we log the test as a pass rather than a skip, log as a skip so that the distinction is clear and automation can see unexpected skips. Link: https://lkml.kernel.org/r/20240209-kselftest-mm-cleanup-v1-0-a3c0386496b5@kernel.org Link: https://lkml.kernel.org/r/20240209-kselftest-mm-cleanup-v1-1-a3c0386496b5@kernel.org Signed-off-by: Mark Brown Cc: Muhammad Usama Anjum Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/compaction_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c index 656afba02dbc..30150929c8c5 100644 --- a/tools/testing/selftests/mm/compaction_test.c +++ b/tools/testing/selftests/mm/compaction_test.c @@ -174,7 +174,7 @@ int main(int argc, char **argv) ksft_print_header(); if (prereq() || geteuid()) - return ksft_exit_pass(); + return ksft_exit_skip("Prerequisites unsatisfied\n"); ksft_set_plan(1); From f3b7568c49420d2dcd251032c9ca1e069ec8a6c9 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 9 Feb 2024 14:30:04 +0000 Subject: [PATCH 169/435] selftests/mm: log a consistent test name for check_compaction Every test result report in the compaction test prints a distinct log messae, and some of the reports print a name that varies at runtime. This causes problems for automation since a lot of automation software uses the printed string as the name of the test, if the name varies from run to run and from pass to fail then the automation software can't identify that a test changed result or that the same tests are being run. Refactor the logging to use a consistent name when printing the result of the test, printing the existing messages as diagnostic information instead so they are still available for people trying to interpret the results. Link: https://lkml.kernel.org/r/20240209-kselftest-mm-cleanup-v1-2-a3c0386496b5@kernel.org Signed-off-by: Mark Brown Cc: Muhammad Usama Anjum Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/compaction_test.c | 35 +++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c index 30150929c8c5..533999b6c284 100644 --- a/tools/testing/selftests/mm/compaction_test.c +++ b/tools/testing/selftests/mm/compaction_test.c @@ -95,21 +95,22 @@ int check_compaction(unsigned long mem_free, unsigned int hugepage_size) fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK); if (fd < 0) { - ksft_test_result_fail("Failed to open /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); - return -1; + ksft_print_msg("Failed to open /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); + ret = -1; + goto out; } if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) { - ksft_test_result_fail("Failed to read from /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); + ksft_print_msg("Failed to read from /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); goto close_fd; } /* Start with the initial condition of 0 huge pages*/ if (write(fd, "0", sizeof(char)) != sizeof(char)) { - ksft_test_result_fail("Failed to write 0 to /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); + ksft_print_msg("Failed to write 0 to /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); goto close_fd; } @@ -118,16 +119,16 @@ int check_compaction(unsigned long mem_free, unsigned int hugepage_size) /* Request a large number of huge pages. The Kernel will allocate as much as it can */ if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) { - ksft_test_result_fail("Failed to write 100000 to /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); + ksft_print_msg("Failed to write 100000 to /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); goto close_fd; } lseek(fd, 0, SEEK_SET); if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) { - ksft_test_result_fail("Failed to re-read from /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); + ksft_print_msg("Failed to re-read from /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); goto close_fd; } @@ -139,24 +140,26 @@ int check_compaction(unsigned long mem_free, unsigned int hugepage_size) if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages)) != strlen(initial_nr_hugepages)) { - ksft_test_result_fail("Failed to write value to /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); + ksft_print_msg("Failed to write value to /proc/sys/vm/nr_hugepages: %s\n", + strerror(errno)); goto close_fd; } + ksft_print_msg("Number of huge pages allocated = %d\n", + atoi(nr_hugepages)); + if (compaction_index > 3) { ksft_print_msg("ERROR: Less that 1/%d of memory is available\n" "as huge pages\n", compaction_index); - ksft_test_result_fail("No of huge pages allocated = %d\n", (atoi(nr_hugepages))); goto close_fd; } - ksft_test_result_pass("Memory compaction succeeded. No of huge pages allocated = %d\n", - (atoi(nr_hugepages))); ret = 0; close_fd: close(fd); + out: + ksft_test_result(ret == 0, "check_compaction\n"); return ret; } From f576a1e80c3a97ea26b436bc060df42cf6d19026 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 9 Feb 2024 04:41:12 +0000 Subject: [PATCH 170/435] mm/zswap: optimize and cleanup the invalidation of duplicate entry We may encounter duplicate entry in the zswap_store(): 1. swap slot that freed to per-cpu swap cache, doesn't invalidate the zswap entry, then got reused. This has been fixed. 2. !exclusive load mode, swapin folio will leave its zswap entry on the tree, then swapout again. This has been removed. 3. one folio can be dirtied again after zswap_store(), so need to zswap_store() again. This should be handled correctly. So we must invalidate the old duplicate entry before inserting the new one, which actually doesn't have to be done at the beginning of zswap_store(). The good point is that we don't need to lock the tree twice in the normal store success path. And cleanup the loop as we are here. Note we still need to invalidate the old duplicate entry when store failed or zswap is disabled , otherwise the new data in swapfile could be overwrite by the old data in zswap pool when lru writeback. Link: https://lkml.kernel.org/r/20240209044112.3883835-1-chengming.zhou@linux.dev Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Acked-by: Yosry Ahmed Acked-by: Chris Li Acked-by: Nhat Pham Signed-off-by: Andrew Morton --- mm/zswap.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 96664cdee207..62fe307521c9 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1517,19 +1517,8 @@ bool zswap_store(struct folio *folio) if (folio_test_large(folio)) return false; - /* - * If this is a duplicate, it must be removed before attempting to store - * it, otherwise, if the store fails the old page won't be removed from - * the tree, and it might be written back overriding the new data. - */ - spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); - if (entry) - zswap_invalidate_entry(tree, entry); - spin_unlock(&tree->lock); - if (!zswap_enabled) - return false; + goto check_old; objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) { @@ -1609,14 +1598,12 @@ insert_entry: /* map */ spin_lock(&tree->lock); /* - * A duplicate entry should have been removed at the beginning of this - * function. Since the swap entry should be pinned, if a duplicate is - * found again here it means that something went wrong in the swap - * cache. + * The folio may have been dirtied again, invalidate the + * possibly stale entry before inserting the new entry. */ - while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { - WARN_ON(1); + if (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { zswap_invalidate_entry(tree, dupentry); + WARN_ON(zswap_rb_insert(&tree->rbroot, entry, &dupentry)); } if (entry->length) { INIT_LIST_HEAD(&entry->lru); @@ -1639,6 +1626,17 @@ freepage: reject: if (objcg) obj_cgroup_put(objcg); +check_old: + /* + * If the zswap store fails or zswap is disabled, we must invalidate the + * possibly stale entry which was previously stored at this offset. + * Otherwise, writeback could overwrite the new data in the swapfile. + */ + spin_lock(&tree->lock); + entry = zswap_rb_search(&tree->rbroot, offset); + if (entry) + zswap_invalidate_entry(tree, entry); + spin_unlock(&tree->lock); return false; shrink: From cfb837e8433179995709578ca5b4741adcd54ec7 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 12 Feb 2024 19:29:51 +0100 Subject: [PATCH 171/435] mm: document memalloc_noreclaim_save() and memalloc_pin_save() The memalloc_noreclaim_save() function currently has no documentation comment, so the implications of its usage are not obvious. Namely that it not only prevents entering reclaim (as the name suggests), but also allows using all memory reserves and thus should be only used in contexts that are allocating memory to free memory. This may lead to new improper usages being added. Thus add a documenting comment, based on the description of __GFP_MEMALLOC. While at it, also document memalloc_pin_save() so that all the memalloc_ scopes are documented. For those already documented, add missing Return: descriptions, and mark Context: description per kernel-docs style guide. In the comments describing the relevant PF_MEMALLOC flags, refer to their scope setting functions. [vbabka@suse.cz: fix issues that Mike pointed out] Link: https://lkml.kernel.org/r/20240215095827.13756-2-vbabka@suse.cz Link: https://lkml.kernel.org/r/20240212182950.32730-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Mike Rapoport (IBM) Acked-by: Michal Hocko Cc: Kent Overstreet Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/sched.h | 9 ++++--- include/linux/sched/mm.h | 55 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b9ce285d8c9c..998861865b84 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1624,15 +1624,15 @@ extern struct pid *cad_pid; #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ -#define PF_MEMALLOC 0x00000800 /* Allocating memory */ +#define PF_MEMALLOC 0x00000800 /* Allocating memory to free memory. See memalloc_noreclaim_save() */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF__HOLE__00010000 0x00010000 #define PF_KSWAPD 0x00020000 /* I am kswapd */ -#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ -#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ +#define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */ +#define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */ #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ @@ -1642,7 +1642,8 @@ extern struct pid *cad_pid; #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ -#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ +#define PF_MEMALLOC_PIN 0x10000000 /* Allocations constrained to zones which allow long term pinning. + * See memalloc_pin_save() */ #define PF__HOLE__20000000 0x20000000 #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 9a19f1b42f64..7a4066d22883 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -315,7 +315,8 @@ static inline void might_alloc(gfp_t gfp_mask) * point of view. Use memalloc_noio_restore to end the scope with flags * returned by this function. * - * This function is safe to be used from any context. + * Context: This function is safe to be used from any context. + * Return: The saved flags to be passed to memalloc_noio_restore. */ static inline unsigned int memalloc_noio_save(void) { @@ -346,7 +347,8 @@ static inline void memalloc_noio_restore(unsigned int flags) * point of view. Use memalloc_nofs_restore to end the scope with flags * returned by this function. * - * This function is safe to be used from any context. + * Context: This function is safe to be used from any context. + * Return: The saved flags to be passed to memalloc_nofs_restore. */ static inline unsigned int memalloc_nofs_save(void) { @@ -368,6 +370,29 @@ static inline void memalloc_nofs_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; } +/** + * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope. + * + * This function marks the beginning of the __GFP_MEMALLOC allocation scope. + * All further allocations will implicitly add the __GFP_MEMALLOC flag, which + * prevents entering reclaim and allows access to all memory reserves. This + * should only be used when the caller guarantees the allocation will allow more + * memory to be freed very shortly, i.e. it needs to allocate some memory in + * the process of freeing memory, and cannot reclaim due to potential recursion. + * + * Users of this scope have to be extremely careful to not deplete the reserves + * completely and implement a throttling mechanism which controls the + * consumption of the reserve based on the amount of freed memory. Usage of a + * pre-allocated pool (e.g. mempool) should be always considered before using + * this scope. + * + * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC + * + * Context: This function should not be used in an interrupt context as that one + * does not give PF_MEMALLOC access to reserves. + * See __gfp_pfmemalloc_flags(). + * Return: The saved flags to be passed to memalloc_noreclaim_restore. + */ static inline unsigned int memalloc_noreclaim_save(void) { unsigned int flags = current->flags & PF_MEMALLOC; @@ -375,11 +400,29 @@ static inline unsigned int memalloc_noreclaim_save(void) return flags; } +/** + * memalloc_noreclaim_restore - Ends the implicit __GFP_MEMALLOC scope. + * @flags: Flags to restore. + * + * Ends the implicit __GFP_MEMALLOC scope started by memalloc_noreclaim_save + * function. Always make sure that the given flags is the return value from the + * pairing memalloc_noreclaim_save call. + */ static inline void memalloc_noreclaim_restore(unsigned int flags) { current->flags = (current->flags & ~PF_MEMALLOC) | flags; } +/** + * memalloc_pin_save - Marks implicit ~__GFP_MOVABLE scope. + * + * This function marks the beginning of the ~__GFP_MOVABLE allocation scope. + * All further allocations will implicitly remove the __GFP_MOVABLE flag, which + * will constraint the allocations to zones that allow long term pinning, i.e. + * not ZONE_MOVABLE zones. + * + * Return: The saved flags to be passed to memalloc_pin_restore. + */ static inline unsigned int memalloc_pin_save(void) { unsigned int flags = current->flags & PF_MEMALLOC_PIN; @@ -388,6 +431,14 @@ static inline unsigned int memalloc_pin_save(void) return flags; } +/** + * memalloc_pin_restore - Ends the implicit ~__GFP_MOVABLE scope. + * @flags: Flags to restore. + * + * Ends the implicit ~__GFP_MOVABLE scope started by memalloc_pin_save function. + * Always make sure that the given flags is the return value from the pairing + * memalloc_pin_save call. + */ static inline void memalloc_pin_restore(unsigned int flags) { current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags; From e10aea105e9ed14b62a11844fec6aaa87c6935a3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 12 Feb 2024 12:15:52 +0100 Subject: [PATCH 172/435] kasan/test: avoid gcc warning for intentional overflow The out-of-bounds test allocates an object that is three bytes too short in order to validate the bounds checking. Starting with gcc-14, this causes a compile-time warning as gcc has grown smart enough to understand the sizeof() logic: mm/kasan/kasan_test.c: In function 'kmalloc_oob_16': mm/kasan/kasan_test.c:443:14: error: allocation of insufficient size '13' for type 'struct ' with size '16' [-Werror=alloc-size] 443 | ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL); | ^ Hide the actual computation behind a RELOC_HIDE() that ensures the compiler misses the intentional bug. Link: https://lkml.kernel.org/r/20240212111609.869266-1-arnd@kernel.org Fixes: 3f15801cdc23 ("lib: add kasan test module") Signed-off-by: Arnd Bergmann Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Arnd Bergmann Cc: Dmitry Vyukov Cc: Marco Elver Cc: Vincenzo Frascino Cc: Signed-off-by: Andrew Morton --- mm/kasan/kasan_test.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 318d9cec111a..2d8ae4fbe63b 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -440,7 +440,8 @@ static void kmalloc_oob_16(struct kunit *test) /* This test is specifically crafted for the generic mode. */ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); - ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL); + /* RELOC_HIDE to prevent gcc from warning about short alloc */ + ptr1 = RELOC_HIDE(kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL), 0); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL); From 1ce2292c148043360c49122b1310da7132f1b8f6 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 14 Feb 2024 06:05:34 +0000 Subject: [PATCH 173/435] mm/mglru: drop unused parameter Patch series "mm/mglru: code cleanup and refactoring" This provides MGLRU code cleanup and refactoring for better readability. This patch (of 5): struct scan_control *sc is currently passed into try_to_inc_max_seq() and run_aging(). This parameter is not used. Drop the unused parameter struct scan_control *sc. No functional change. Link: https://lkml.kernel.org/r/20240214060538.3524462-1-kinseyho@google.com Link: https://lkml.kernel.org/r/20240214060538.3524462-2-kinseyho@google.com Signed-off-by: Kinsey Ho Cc: Aneesh Kumar K.V Cc: Donet Tom Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 327bf904fdcd..0ff72ccd65b4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3816,7 +3816,7 @@ unlock: } static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, - struct scan_control *sc, bool can_swap, bool force_scan) + bool can_swap, bool force_scan) { bool success; struct lru_gen_mm_walk *walk; @@ -4672,7 +4672,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool return nr_to_scan; /* skip this lruvec as it's low on cold folios */ - return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0; + return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0; } static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) @@ -5332,7 +5332,7 @@ static const struct seq_operations lru_gen_seq_ops = { .show = lru_gen_seq_show, }; -static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, +static int run_aging(struct lruvec *lruvec, unsigned long seq, bool can_swap, bool force_scan) { DEFINE_MAX_SEQ(lruvec); @@ -5347,7 +5347,7 @@ static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_contr if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) return -ERANGE; - try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan); + try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan); return 0; } @@ -5415,7 +5415,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, switch (cmd) { case '+': - err = run_aging(lruvec, seq, sc, swappiness, opt); + err = run_aging(lruvec, seq, swappiness, opt); break; case '-': err = run_eviction(lruvec, seq, sc, swappiness, opt); From 51973cc9e53899e50e47c1e4c2967c3021cb6257 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 14 Feb 2024 06:05:35 +0000 Subject: [PATCH 174/435] mm/mglru: improve should_run_aging() scan_control *sc does not need to be passed into should_run_aging(), as it provides only the reclaim priority. This can be moved to get_nr_to_scan(). Refactor should_run_aging() and get_nr_to_scan() to improve code readability. No functional changes. Link: https://lkml.kernel.org/r/20240214060538.3524462-3-kinseyho@google.com Signed-off-by: Kinsey Ho Cc: Aneesh Kumar K.V Cc: Donet Tom Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 0ff72ccd65b4..6d1704e7bfe0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4584,14 +4584,13 @@ retry: } static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, - struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) + bool can_swap, unsigned long *nr_to_scan) { int gen, type, zone; unsigned long old = 0; unsigned long young = 0; unsigned long total = 0; struct lru_gen_folio *lrugen = &lruvec->lrugen; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MIN_SEQ(lruvec); /* whether this lruvec is completely out of cold folios */ @@ -4619,13 +4618,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, } } - /* try to scrape all its memory if this memcg was deleted */ - if (!mem_cgroup_online(memcg)) { - *nr_to_scan = total; - return false; - } - - *nr_to_scan = total >> sc->priority; + *nr_to_scan = total; /* * The aging tries to be lazy to reduce the overhead, while the eviction @@ -4657,6 +4650,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, */ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) { + bool success; unsigned long nr_to_scan; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); @@ -4664,14 +4658,17 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) return -1; - if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) + success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan); + + /* try to scrape all its memory if this memcg was deleted */ + if (nr_to_scan && !mem_cgroup_online(memcg)) return nr_to_scan; - /* skip the aging path at the default priority */ - if (sc->priority == DEF_PRIORITY) - return nr_to_scan; + /* try to get away with not aging at the default priority */ + if (!success || sc->priority == DEF_PRIORITY) + return nr_to_scan >> sc->priority; - /* skip this lruvec as it's low on cold folios */ + /* stop scanning this lruvec as it's low on cold folios */ return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0; } From 2d823764fafa6593a87d3192c62e19eec4883d84 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 14 Feb 2024 06:05:36 +0000 Subject: [PATCH 175/435] mm/mglru: improve reset_mm_stats() struct lruvec* is already a field of struct lru_gen_mm_walk. Remove the parameter struct lruvec* into functions that already have access to struct lru_gen_mm_walk*. Also, we do not need to handle reset histogram stats when !should_walk_mmu(). Remove the call to reset_mm_stats() in iterate_mm_list_nowalk(). Link: https://lkml.kernel.org/r/20240214060538.3524462-4-kinseyho@google.com Signed-off-by: Kinsey Ho Cc: Aneesh Kumar K.V Cc: Donet Tom Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 6d1704e7bfe0..1bec586d854e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2879,38 +2879,37 @@ static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) #endif -static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) +static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last) { int i; int hist; + struct lruvec *lruvec = walk->lruvec; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); - if (walk) { - hist = lru_hist_from_seq(walk->max_seq); + hist = lru_hist_from_seq(walk->max_seq); - for (i = 0; i < NR_MM_STATS; i++) { - WRITE_ONCE(mm_state->stats[hist][i], - mm_state->stats[hist][i] + walk->mm_stats[i]); - walk->mm_stats[i] = 0; - } + for (i = 0; i < NR_MM_STATS; i++) { + WRITE_ONCE(mm_state->stats[hist][i], + mm_state->stats[hist][i] + walk->mm_stats[i]); + walk->mm_stats[i] = 0; } if (NR_HIST_GENS > 1 && last) { - hist = lru_hist_from_seq(mm_state->seq + 1); + hist = lru_hist_from_seq(walk->max_seq + 1); for (i = 0; i < NR_MM_STATS; i++) WRITE_ONCE(mm_state->stats[hist][i], 0); } } -static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, - struct mm_struct **iter) +static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **iter) { bool first = false; bool last = false; struct mm_struct *mm = NULL; + struct lruvec *lruvec = walk->lruvec; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); @@ -2954,7 +2953,7 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, } while (!(mm = get_next_mm(walk))); done: if (*iter || last) - reset_mm_stats(lruvec, walk, last); + reset_mm_stats(walk, last); spin_unlock(&mm_list->lock); @@ -2984,7 +2983,6 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) mm_state->head = NULL; mm_state->tail = NULL; WRITE_ONCE(mm_state->seq, mm_state->seq + 1); - reset_mm_stats(lruvec, NULL, true); success = true; } @@ -3159,9 +3157,10 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, walk->nr_pages[new_gen][type][zone] += delta; } -static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) +static void reset_batch_size(struct lru_gen_mm_walk *walk) { int gen, type, zone; + struct lruvec *lruvec = walk->lruvec; struct lru_gen_folio *lrugen = &lruvec->lrugen; walk->batched = 0; @@ -3591,7 +3590,7 @@ done: return -EAGAIN; } -static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk) +static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) { static const struct mm_walk_ops mm_walk_ops = { .test_walk = should_skip_vma, @@ -3600,6 +3599,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ }; int err; + struct lruvec *lruvec = walk->lruvec; struct mem_cgroup *memcg = lruvec_memcg(lruvec); walk->next_addr = FIRST_USER_ADDRESS; @@ -3628,7 +3628,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ if (walk->batched) { spin_lock_irq(&lruvec->lru_lock); - reset_batch_size(lruvec, walk); + reset_batch_size(walk); spin_unlock_irq(&lruvec->lru_lock); } @@ -3856,9 +3856,9 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, walk->force_scan = force_scan; do { - success = iterate_mm_list(lruvec, walk, &mm); + success = iterate_mm_list(walk, &mm); if (mm) - walk_mm(lruvec, mm, walk); + walk_mm(mm, walk); } while (mm); done: if (success) { @@ -4558,8 +4558,10 @@ retry: move_folios_to_lru(lruvec, &list); walk = current->reclaim_state->mm_walk; - if (walk && walk->batched) - reset_batch_size(lruvec, walk); + if (walk && walk->batched) { + walk->lruvec = lruvec; + reset_batch_size(walk); + } item = PGSTEAL_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) From cc25bbe10a86a7fe3ec8468fb01e579e6216d7e1 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 14 Feb 2024 06:05:37 +0000 Subject: [PATCH 176/435] mm/mglru: improve struct lru_gen_mm_walk Rename max_seq to seq in struct lru_gen_mm_walk to keep consistent with struct lru_gen_mm_state. Note that seq is not always up to date with max_seq from lru_gen_folio. No functional changes. Link: https://lkml.kernel.org/r/20240214060538.3524462-5-kinseyho@google.com Signed-off-by: Kinsey Ho Cc: Aneesh Kumar K.V Cc: Donet Tom Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 6 ++--- mm/vmscan.c | 50 ++++++++++++++++++++++-------------------- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a497f189d988..633812a1d220 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -464,7 +464,7 @@ enum { #define NR_BLOOM_FILTERS 2 struct lru_gen_mm_state { - /* set to max_seq after each iteration */ + /* synced with max_seq after each iteration */ unsigned long seq; /* where the current iteration continues after */ struct list_head *head; @@ -479,8 +479,8 @@ struct lru_gen_mm_state { struct lru_gen_mm_walk { /* the lruvec under reclaim */ struct lruvec *lruvec; - /* unstable max_seq from lru_gen_folio */ - unsigned long max_seq; + /* max_seq from lru_gen_folio: can be out of date */ + unsigned long seq; /* the next address within an mm to scan */ unsigned long next_addr; /* to batch promoted pages */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 1bec586d854e..577e7f9be7c6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2888,7 +2888,7 @@ static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last) lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); - hist = lru_hist_from_seq(walk->max_seq); + hist = lru_hist_from_seq(walk->seq); for (i = 0; i < NR_MM_STATS; i++) { WRITE_ONCE(mm_state->stats[hist][i], @@ -2897,7 +2897,7 @@ static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last) } if (NR_HIST_GENS > 1 && last) { - hist = lru_hist_from_seq(walk->max_seq + 1); + hist = lru_hist_from_seq(walk->seq + 1); for (i = 0; i < NR_MM_STATS; i++) WRITE_ONCE(mm_state->stats[hist][i], 0); @@ -2926,9 +2926,9 @@ static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **ite */ spin_lock(&mm_list->lock); - VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); + VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->seq); - if (walk->max_seq <= mm_state->seq) + if (walk->seq <= mm_state->seq) goto done; if (!mm_state->head) @@ -2958,7 +2958,7 @@ done: spin_unlock(&mm_list->lock); if (mm && first) - reset_bloom_filter(mm_state, walk->max_seq + 1); + reset_bloom_filter(mm_state, walk->seq + 1); if (*iter) mmput_async(*iter); @@ -2968,7 +2968,7 @@ done: return last; } -static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) +static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long seq) { bool success = false; struct mem_cgroup *memcg = lruvec_memcg(lruvec); @@ -2977,9 +2977,9 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) spin_lock(&mm_list->lock); - VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); + VM_WARN_ON_ONCE(mm_state->seq + 1 < seq); - if (max_seq > mm_state->seq) { + if (seq > mm_state->seq) { mm_state->head = NULL; mm_state->tail = NULL; WRITE_ONCE(mm_state->seq, mm_state->seq + 1); @@ -3330,7 +3330,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); - int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); + DEFINE_MAX_SEQ(walk->lruvec); + int old_gen, new_gen = lru_gen_from_seq(max_seq); pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl); if (!pte) @@ -3397,7 +3398,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); - int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); + DEFINE_MAX_SEQ(walk->lruvec); + int old_gen, new_gen = lru_gen_from_seq(max_seq); VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -3528,7 +3530,7 @@ restart: walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); } - if (!walk->force_scan && !test_bloom_filter(mm_state, walk->max_seq, pmd + i)) + if (!walk->force_scan && !test_bloom_filter(mm_state, walk->seq, pmd + i)) continue; walk->mm_stats[MM_NONLEAF_FOUND]++; @@ -3539,7 +3541,7 @@ restart: walk->mm_stats[MM_NONLEAF_ADDED]++; /* carry over to the next generation */ - update_bloom_filter(mm_state, walk->max_seq + 1, pmd + i); + update_bloom_filter(mm_state, walk->seq + 1, pmd + i); } walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); @@ -3610,7 +3612,7 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) err = -EBUSY; /* another thread might have called inc_max_seq() */ - if (walk->max_seq != max_seq) + if (walk->seq != max_seq) break; /* folio_update_gen() requires stable folio_memcg() */ @@ -3747,7 +3749,7 @@ next: return success; } -static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, +static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, bool can_swap, bool force_scan) { bool success; @@ -3755,14 +3757,14 @@ static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, int type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; restart: - if (max_seq < READ_ONCE(lrugen->max_seq)) + if (seq < READ_ONCE(lrugen->max_seq)) return false; spin_lock_irq(&lruvec->lru_lock); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); - success = max_seq == lrugen->max_seq; + success = seq == lrugen->max_seq; if (!success) goto unlock; @@ -3815,7 +3817,7 @@ unlock: return success; } -static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, +static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, bool can_swap, bool force_scan) { bool success; @@ -3824,13 +3826,13 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, struct lru_gen_folio *lrugen = &lruvec->lrugen; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); - VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); + VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq)); if (!mm_state) - return inc_max_seq(lruvec, max_seq, can_swap, force_scan); + return inc_max_seq(lruvec, seq, can_swap, force_scan); /* see the comment in iterate_mm_list() */ - if (max_seq <= READ_ONCE(mm_state->seq)) + if (seq <= READ_ONCE(mm_state->seq)) return false; /* @@ -3840,18 +3842,18 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, * is less efficient, but it avoids bursty page faults. */ if (!should_walk_mmu()) { - success = iterate_mm_list_nowalk(lruvec, max_seq); + success = iterate_mm_list_nowalk(lruvec, seq); goto done; } walk = set_mm_walk(NULL, true); if (!walk) { - success = iterate_mm_list_nowalk(lruvec, max_seq); + success = iterate_mm_list_nowalk(lruvec, seq); goto done; } walk->lruvec = lruvec; - walk->max_seq = max_seq; + walk->seq = seq; walk->can_swap = can_swap; walk->force_scan = force_scan; @@ -3862,7 +3864,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, } while (mm); done: if (success) { - success = inc_max_seq(lruvec, max_seq, can_swap, force_scan); + success = inc_max_seq(lruvec, seq, can_swap, force_scan); WARN_ON_ONCE(!success); } From 4acef5694e01a2d7de3066e9a6005d485b4374b9 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 14 Feb 2024 06:05:38 +0000 Subject: [PATCH 177/435] mm/mglru: improve swappiness handling The reclaimable number of anon pages used to set initial reclaim priority is only based on get_swappiness(). Use can_reclaim_anon_pages() to include NUMA node demotion. Also move the swappiness handling of when !__GFP_IO in try_to_shrink_lruvec() into isolate_folios(). Link: https://lkml.kernel.org/r/20240214060538.3524462-6-kinseyho@google.com Signed-off-by: Kinsey Ho Cc: Aneesh Kumar K.V Cc: Donet Tom Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 577e7f9be7c6..198d623054c5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4289,7 +4289,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca { bool success; - /* swapping inhibited */ + /* swap constrained */ if (!(sc->gfp_mask & __GFP_IO) && (folio_test_dirty(folio) || (folio_test_anon(folio) && !folio_test_swapcache(folio)))) @@ -4458,9 +4458,12 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw DEFINE_MIN_SEQ(lruvec); /* - * Try to make the obvious choice first. When anon and file are both - * available from the same generation, interpret swappiness 1 as file - * first and 200 as anon first. + * Try to make the obvious choice first, and if anon and file are both + * available from the same generation, + * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon + * first. + * 2. If !__GFP_IO, file first since clean pagecache is more likely to + * exist than clean swapcache. */ if (!swappiness) type = LRU_GEN_FILE; @@ -4470,6 +4473,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw type = LRU_GEN_FILE; else if (swappiness == 200) type = LRU_GEN_ANON; + else if (!(sc->gfp_mask & __GFP_IO)) + type = LRU_GEN_FILE; else type = get_type_to_scan(lruvec, swappiness, &tier); @@ -4713,10 +4718,6 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) unsigned long scanned = 0; int swappiness = get_swappiness(lruvec, sc); - /* clean file folios are more likely to exist */ - if (swappiness && !(sc->gfp_mask & __GFP_IO)) - swappiness = 1; - while (true) { int delta; @@ -4879,7 +4880,6 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control { int priority; unsigned long reclaimable; - struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) return; @@ -4889,7 +4889,7 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control * where reclaimed_to_scanned_ratio = inactive / total. */ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); - if (get_swappiness(lruvec, sc)) + if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); /* round down reclaimable and round up sc->nr_to_reclaim */ From ce70cfb145ad98739c8ae23d070a37863a4c52bb Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 9 Feb 2024 11:12:21 +0530 Subject: [PATCH 178/435] mm/hugetlb: move page order check inside hugetlb_cma_reserve() All platforms could benefit from page order check against MAX_PAGE_ORDER before allocating a CMA area for gigantic hugetlb pages. Let's move this check from individual platforms to generic hugetlb. Link: https://lkml.kernel.org/r/20240209054221.1403364-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Jane Chu Reviewed-by: David Hildenbrand Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- arch/arm64/mm/hugetlbpage.c | 7 ------- arch/powerpc/mm/hugetlbpage.c | 4 +--- mm/hugetlb.c | 7 +++++++ 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 8116ac599f80..6720ec8d50e7 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -45,13 +45,6 @@ void __init arm64_hugetlb_cma_reserve(void) else order = CONT_PMD_SHIFT - PAGE_SHIFT; - /* - * HugeTLB CMA reservation is required for gigantic - * huge pages which could not be allocated via the - * page allocator. Just warn if there is any change - * breaking this assumption. - */ - WARN_ON(order <= MAX_PAGE_ORDER); hugetlb_cma_reserve(order); } #endif /* CONFIG_CMA */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 0a540b37aab6..16557d008eef 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -614,8 +614,6 @@ void __init gigantic_hugetlb_cma_reserve(void) */ order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT; - if (order) { - VM_WARN_ON(order <= MAX_PAGE_ORDER); + if (order) hugetlb_cma_reserve(order); - } } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3d651fc1dd66..c53a41d07cd3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7720,6 +7720,13 @@ void __init hugetlb_cma_reserve(int order) bool node_specific_cma_alloc = false; int nid; + /* + * HugeTLB CMA reservation is required for gigantic + * huge pages which could not be allocated via the + * page allocator. Just warn if there is any change + * breaking this assumption. + */ + VM_WARN_ON(order <= MAX_PAGE_ORDER); cma_reserve_called = true; if (!hugetlb_cma_size) From 45866e0e214f0f1927a09011560c108933435350 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 13 Feb 2024 19:54:00 +1300 Subject: [PATCH 179/435] zram: do not allocate physically contiguous strm buffers Currently zram allocates 2 physically contiguous pages per-CPU's compression stream (we may have up to 4 streams per-CPU). Since those buffers are per-CPU we allocate them from CPU hotplug path, which may have higher risks of failed allocations on devices with fragmented memory. Switch to virtually contiguous allocations - crypto comp does not seem impose requirements on compression working buffers to be physically contiguous. Link: https://lkml.kernel.org/r/20240213065400.6561-1-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Sergey Senozhatsky Cc: Jens Axboe Cc: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zcomp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 55af4efd7983..8237b08c49d8 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "zcomp.h" @@ -37,7 +38,7 @@ static void zcomp_strm_free(struct zcomp_strm *zstrm) { if (!IS_ERR_OR_NULL(zstrm->tfm)) crypto_free_comp(zstrm->tfm); - free_pages((unsigned long)zstrm->buffer, 1); + vfree(zstrm->buffer); zstrm->tfm = NULL; zstrm->buffer = NULL; } @@ -53,7 +54,7 @@ static int zcomp_strm_init(struct zcomp_strm *zstrm, struct zcomp *comp) * allocate 2 pages. 1 for compressed data, plus 1 extra for the * case when compressed size is larger than the original one */ - zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); + zstrm->buffer = vzalloc(2 * PAGE_SIZE); if (IS_ERR_OR_NULL(zstrm->tfm) || !zstrm->buffer) { zcomp_strm_free(zstrm); return -ENOMEM; From 1883e8ac96ddd73a87db7f2f8c06111148a3db6f Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 22 Jan 2024 21:01:53 +0800 Subject: [PATCH 180/435] mm: compaction: limit the suitable target page order to be less than cc->order It can not improve the fragmentation if we isolate the target free pages exceeding cc->order, especially when the cc->order is less than pageblock_order. For example, suppose the pageblock_order is MAX_ORDER (size is 4M) and cc->order is 2M THP size, we should not isolate other 2M free pages to be the migration target, which can not improve the fragmentation. Moreover this is also applicable for large folio compaction. Link: https://lkml.kernel.org/r/afcd9377351c259df7a25a388a4a0d5862b986f4.1705928395.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: Mel Gorman Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/compaction.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 12709d0bcf40..f146478b01bc 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1365,12 +1365,14 @@ static bool suitable_migration_target(struct compact_control *cc, { /* If the page is a large free page, then disallow migration */ if (PageBuddy(page)) { + int order = cc->order > 0 ? cc->order : pageblock_order; + /* * We are checking page_order without zone->lock taken. But * the only small danger is that we skip a potentially suitable * pageblock, so it's not worth to check order for valid range. */ - if (buddy_order_unsafe(page) >= pageblock_order) + if (buddy_order_unsafe(page) >= order) return false; } From 2b2178c40747dcfc54a477dd4def4318b06d30ad Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Mon, 5 Feb 2024 14:56:06 -0800 Subject: [PATCH 181/435] selftests: zswap: add zswap selftest file to zswap maintainer entry Patch series "fix and extend zswap kselftests", v3. Fix a broken zswap kselftest due to cgroup zswap writeback counter renaming, and add 2 zswap kselftests, one to cover the (z)swapin case, and another to check that no zswapping happens when the cgroup limit is 0. Also, add the zswap kselftest file to zswap maintainer entry so that get_maintainers script can find zswap maintainers. This patch (of 3): Make it easier for contributors to find the zswap maintainers when they update the zswap tests. Link: https://lkml.kernel.org/r/20240205225608.3083251-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20240205225608.3083251-2-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Yosry Ahmed Cc: Johannes Weiner Cc: Rik van Riel Cc: Roman Gushchin Cc: Shuah Khan Cc: Tejun Heo Cc: Zefan Li Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index c3c9cf33595c..1dbc57efa6d0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24412,6 +24412,7 @@ F: include/linux/zpool.h F: include/linux/zswap.h F: mm/zpool.c F: mm/zswap.c +F: tools/testing/selftests/cgroup/test_zswap.c THE REST M: Linus Torvalds From 012688f6006c01655e250c1612b544561b7c50be Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Mon, 5 Feb 2024 14:56:07 -0800 Subject: [PATCH 182/435] selftests: fix the zswap invasive shrink test The zswap no invasive shrink selftest breaks because we rename the zswap writeback counter (see [1]). Fix the test. [1]: https://patchwork.kernel.org/project/linux-kselftest/patch/20231205193307.2432803-1-nphamcs@gmail.com/ Link: https://lkml.kernel.org/r/20240205225608.3083251-3-nphamcs@gmail.com Fixes: a697dc2be925 ("selftests: cgroup: update per-memcg zswap writeback selftest") Signed-off-by: Nhat Pham Acked-by: Yosry Ahmed Cc: Johannes Weiner Cc: Rik van Riel Cc: Roman Gushchin Cc: Shuah Khan Cc: Tejun Heo Cc: Zefan Li Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 47fdaa146443..32ce975b21d1 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -52,7 +52,7 @@ static int get_zswap_stored_pages(size_t *value) static int get_cg_wb_count(const char *cg) { - return cg_read_key_long(cg, "memory.stat", "zswp_wb"); + return cg_read_key_long(cg, "memory.stat", "zswpwb"); } static long get_zswpout(const char *cgroup) From b93c28ff72d42b09d539ac68845307b90cf95e53 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Mon, 5 Feb 2024 14:56:08 -0800 Subject: [PATCH 183/435] selftests: add zswapin and no zswap tests Add a selftest to cover the zswapin code path, allocating more memory than the cgroup limit to trigger swapout/zswapout, then reading the pages back in memory several times. This is inspired by a recently encountered kernel crash on the zswapin path in our internal kernel, which went undetected because of a lack of test coverage for this path. Add a selftest to verify that when memory.zswap.max = 0, no pages can go to the zswap pool for the cgroup. [nphamcs@gmail.com: remove redundant comment, add success checks] Link: https://lkml.kernel.org/r/20240222043132.616320-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20240205225608.3083251-4-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Rik van Riel Suggested-by: Yosry Ahmed Acked-by: Yosry Ahmed Cc: Johannes Weiner Cc: Roman Gushchin Cc: Shuah Khan Cc: Tejun Heo Cc: Zefan Li Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 120 +++++++++++++++++++- 1 file changed, 118 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 32ce975b21d1..f0e488ed90d8 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -60,6 +60,27 @@ static long get_zswpout(const char *cgroup) return cg_read_key_long(cgroup, "memory.stat", "zswpout "); } +static int allocate_and_read_bytes(const char *cgroup, void *arg) +{ + size_t size = (size_t)arg; + char *mem = (char *)malloc(size); + int ret = 0; + + if (!mem) + return -1; + for (int i = 0; i < size; i += 4095) + mem[i] = 'a'; + + /* Go through the allocated memory to (z)swap in and out pages */ + for (int i = 0; i < size; i += 4095) { + if (mem[i] != 'a') + ret = -1; + } + + free(mem); + return ret; +} + static int allocate_bytes(const char *cgroup, void *arg) { size_t size = (size_t)arg; @@ -100,7 +121,6 @@ static int test_zswap_usage(const char *root) int ret = KSFT_FAIL; char *test_group; - /* Set up */ test_group = cg_name(root, "no_shrink_test"); if (!test_group) goto out; @@ -133,6 +153,101 @@ out: return ret; } +/* + * Check that when memory.zswap.max = 0, no pages can go to the zswap pool for + * the cgroup. + */ +static int test_swapin_nozswap(const char *root) +{ + int ret = KSFT_FAIL; + char *test_group; + long swap_peak, zswpout; + + test_group = cg_name(root, "no_zswap_test"); + if (!test_group) + goto out; + if (cg_create(test_group)) + goto out; + if (cg_write(test_group, "memory.max", "8M")) + goto out; + if (cg_write(test_group, "memory.zswap.max", "0")) + goto out; + + /* Allocate and read more than memory.max to trigger swapin */ + if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32))) + goto out; + + /* Verify that pages are swapped out, but no zswap happened */ + swap_peak = cg_read_long(test_group, "memory.swap.peak"); + if (swap_peak < 0) { + ksft_print_msg("failed to get cgroup's swap_peak\n"); + goto out; + } + + if (swap_peak < MB(24)) { + ksft_print_msg("at least 24MB of memory should be swapped out\n"); + goto out; + } + + zswpout = get_zswpout(test_group); + if (zswpout < 0) { + ksft_print_msg("failed to get zswpout\n"); + goto out; + } + + if (zswpout > 0) { + ksft_print_msg("zswapout > 0 when memory.zswap.max = 0\n"); + goto out; + } + + ret = KSFT_PASS; + +out: + cg_destroy(test_group); + free(test_group); + return ret; +} + +/* Simple test to verify the (z)swapin code paths */ +static int test_zswapin(const char *root) +{ + int ret = KSFT_FAIL; + char *test_group; + long zswpin; + + test_group = cg_name(root, "zswapin_test"); + if (!test_group) + goto out; + if (cg_create(test_group)) + goto out; + if (cg_write(test_group, "memory.max", "8M")) + goto out; + if (cg_write(test_group, "memory.zswap.max", "max")) + goto out; + + /* Allocate and read more than memory.max to trigger (z)swap in */ + if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32))) + goto out; + + zswpin = cg_read_key_long(test_group, "memory.stat", "zswpin "); + if (zswpin < 0) { + ksft_print_msg("failed to get zswpin\n"); + goto out; + } + + if (zswpin < MB(24) / PAGE_SIZE) { + ksft_print_msg("at least 24MB should be brought back from zswap\n"); + goto out; + } + + ret = KSFT_PASS; + +out: + cg_destroy(test_group); + free(test_group); + return ret; +} + /* * When trying to store a memcg page in zswap, if the memcg hits its memory * limit in zswap, writeback should affect only the zswapped pages of that @@ -144,7 +259,6 @@ static int test_no_invasive_cgroup_shrink(const char *root) size_t control_allocation_size = MB(10); char *control_allocation, *wb_group = NULL, *control_group = NULL; - /* Set up */ wb_group = setup_test_group_1M(root, "per_memcg_wb_test1"); if (!wb_group) return KSFT_FAIL; @@ -309,6 +423,8 @@ struct zswap_test { const char *name; } tests[] = { T(test_zswap_usage), + T(test_swapin_nozswap), + T(test_zswapin), T(test_no_kmem_bypass), T(test_no_invasive_cgroup_shrink), }; From 789753e17c4d6593932f07e40b740373123296a6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:26 +0100 Subject: [PATCH 184/435] mm/memory: factor out zapping of present pte into zap_present_pte() Patch series "mm/memory: optimize unmap/zap with PTE-mapped THP", v3. This series is based on [1]. Similar to what we did with fork(), let's implement PTE batching during unmap/zap when processing PTE-mapped THPs. We collect consecutive PTEs that map consecutive pages of the same large folio, making sure that the other PTE bits are compatible, and (a) adjust the refcount only once per batch, (b) call rmap handling functions only once per batch, (c) perform batch PTE setting/updates and (d) perform TLB entry removal once per batch. Ryan was previously working on this in the context of cont-pte for arm64, int latest iteration [2] with a focus on arm6 with cont-pte only. This series implements the optimization for all architectures, independent of such PTE bits, teaches MMU gather/TLB code to be fully aware of such large-folio-pages batches as well, and amkes use of our new rmap batching function when removing the rmap. To achieve that, we have to enlighten MMU gather / page freeing code (i.e., everything that consumes encoded_page) to process unmapping of consecutive pages that all belong to the same large folio. I'm being very careful to not degrade order-0 performance, and it looks like I managed to achieve that. While this series should -- similar to [1] -- be beneficial for adding cont-pte support on arm64[2], it's one of the requirements for maintaining a total mapcount[3] for large folios with minimal added overhead and further changes[4] that build up on top of the total mapcount. Independent of all that, this series results in a speedup during munmap() and similar unmapping (process teardown, MADV_DONTNEED on larger ranges) with PTE-mapped THP, which is the default with THPs that are smaller than a PMD (for example, 16KiB to 1024KiB mTHPs for anonymous memory[5]). On an Intel Xeon Silver 4210R CPU, munmap'ing a 1GiB VMA backed by PTE-mapped folios of the same size (stddev < 1%) results in the following runtimes for munmap() in seconds (shorter is better): Folio Size | mm-unstable | New | Change --------------------------------------------- 4KiB | 0.058110 | 0.057715 | - 1% 16KiB | 0.044198 | 0.035469 | -20% 32KiB | 0.034216 | 0.023522 | -31% 64KiB | 0.029207 | 0.018434 | -37% 128KiB | 0.026579 | 0.014026 | -47% 256KiB | 0.025130 | 0.011756 | -53% 512KiB | 0.024292 | 0.010703 | -56% 1024KiB | 0.023812 | 0.010294 | -57% 2048KiB | 0.023785 | 0.009910 | -58% [1] https://lkml.kernel.org/r/20240129124649.189745-1-david@redhat.com [2] https://lkml.kernel.org/r/20231218105100.172635-1-ryan.roberts@arm.com [3] https://lkml.kernel.org/r/20230809083256.699513-1-david@redhat.com [4] https://lkml.kernel.org/r/20231124132626.235350-1-david@redhat.com [5] https://lkml.kernel.org/r/20231207161211.2374093-1-ryan.roberts@arm.com This patch (of 10): Let's prepare for further changes by factoring out processing of present PTEs. Link: https://lkml.kernel.org/r/20240214204435.167852-1-david@redhat.com Link: https://lkml.kernel.org/r/20240214204435.167852-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: linuxppc-dev@lists.ozlabs.org Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/memory.c | 94 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 41 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index e5e5056cb53f..6712453888fa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1534,13 +1534,61 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); } +static inline void zap_present_pte(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, pte_t ptent, + unsigned long addr, struct zap_details *details, + int *rss, bool *force_flush, bool *force_break) +{ + struct mm_struct *mm = tlb->mm; + struct folio *folio = NULL; + bool delay_rmap = false; + struct page *page; + + page = vm_normal_page(vma, addr, ptent); + if (page) + folio = page_folio(page); + + if (unlikely(!should_zap_folio(details, folio))) + return; + ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + arch_check_zapped_pte(vma, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); + if (unlikely(!page)) { + ksm_might_unmap_zero_page(mm, ptent); + return; + } + + if (!folio_test_anon(folio)) { + if (pte_dirty(ptent)) { + folio_mark_dirty(folio); + if (tlb_delay_rmap(tlb)) { + delay_rmap = true; + *force_flush = true; + } + } + if (pte_young(ptent) && likely(vma_has_recency(vma))) + folio_mark_accessed(folio); + } + rss[mm_counter(folio)]--; + if (!delay_rmap) { + folio_remove_rmap_pte(folio, page, vma); + if (unlikely(page_mapcount(page) < 0)) + print_bad_pte(vma, addr, ptent, page); + } + if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) { + *force_flush = true; + *force_break = true; + } +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, struct zap_details *details) { + bool force_flush = false, force_break = false; struct mm_struct *mm = tlb->mm; - int force_flush = 0; int rss[NR_MM_COUNTERS]; spinlock_t *ptl; pte_t *start_pte; @@ -1557,7 +1605,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, arch_enter_lazy_mmu_mode(); do { pte_t ptent = ptep_get(pte); - struct folio *folio = NULL; + struct folio *folio; struct page *page; if (pte_none(ptent)) @@ -1567,45 +1615,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, break; if (pte_present(ptent)) { - unsigned int delay_rmap; - - page = vm_normal_page(vma, addr, ptent); - if (page) - folio = page_folio(page); - - if (unlikely(!should_zap_folio(details, folio))) - continue; - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); - arch_check_zapped_pte(vma, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); - zap_install_uffd_wp_if_needed(vma, addr, pte, details, - ptent); - if (unlikely(!page)) { - ksm_might_unmap_zero_page(mm, ptent); - continue; - } - - delay_rmap = 0; - if (!folio_test_anon(folio)) { - if (pte_dirty(ptent)) { - folio_mark_dirty(folio); - if (tlb_delay_rmap(tlb)) { - delay_rmap = 1; - force_flush = 1; - } - } - if (pte_young(ptent) && likely(vma_has_recency(vma))) - folio_mark_accessed(folio); - } - rss[mm_counter(folio)]--; - if (!delay_rmap) { - folio_remove_rmap_pte(folio, page, vma); - if (unlikely(page_mapcount(page) < 0)) - print_bad_pte(vma, addr, ptent, page); - } - if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) { - force_flush = 1; + zap_present_pte(tlb, vma, pte, ptent, addr, details, + rss, &force_flush, &force_break); + if (unlikely(force_break)) { addr += PAGE_SIZE; break; } From 0cf18e839f64fff9a58569cc9a596bf97310e044 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:27 +0100 Subject: [PATCH 185/435] mm/memory: handle !page case in zap_present_pte() separately We don't need uptodate accessed/dirty bits, so in theory we could replace ptep_get_and_clear_full() by an optimized ptep_clear_full() function. Let's rely on the provided pte. Further, there is no scenario where we would have to insert uffd-wp markers when zapping something that is not a normal page (i.e., zeropage). Add a sanity check to make sure this remains true. should_zap_folio() no longer has to handle NULL pointers. This change replaces 2/3 "!page/!folio" checks by a single "!page" one. Note that arch_check_zapped_pte() on x86-64 checks the HW-dirty bit to detect shadow stack entries. But for shadow stack entries, the HW dirty bit (in combination with non-writable PTEs) is set by software. So for the arch_check_zapped_pte() check, we don't have to sync against HW setting the HW dirty bit concurrently, it is always set. Link: https://lkml.kernel.org/r/20240214204435.167852-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/memory.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 6712453888fa..2b1e9abbe1e0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1499,10 +1499,6 @@ static inline bool should_zap_folio(struct zap_details *details, if (should_zap_cows(details)) return true; - /* E.g. the caller passes NULL for the case of a zero folio */ - if (!folio) - return true; - /* Otherwise we should only zap non-anon folios */ return !folio_test_anon(folio); } @@ -1540,24 +1536,28 @@ static inline void zap_present_pte(struct mmu_gather *tlb, int *rss, bool *force_flush, bool *force_break) { struct mm_struct *mm = tlb->mm; - struct folio *folio = NULL; bool delay_rmap = false; + struct folio *folio; struct page *page; page = vm_normal_page(vma, addr, ptent); - if (page) - folio = page_folio(page); + if (!page) { + /* We don't need up-to-date accessed/dirty bits. */ + ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + arch_check_zapped_pte(vma, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + VM_WARN_ON_ONCE(userfaultfd_wp(vma)); + ksm_might_unmap_zero_page(mm, ptent); + return; + } + folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) return; ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); - if (unlikely(!page)) { - ksm_might_unmap_zero_page(mm, ptent); - return; - } if (!folio_test_anon(folio)) { if (pte_dirty(ptent)) { From d11838ed63ee842fc9ef335b9f3aee3aa26f2ab5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:28 +0100 Subject: [PATCH 186/435] mm/memory: further separate anon and pagecache folio handling in zap_present_pte() We don't need up-to-date accessed-dirty information for anon folios and can simply work with the ptent we already have. Also, we know the RSS counter we want to update. We can safely move arch_check_zapped_pte() + tlb_remove_tlb_entry() + zap_install_uffd_wp_if_needed() after updating the folio and RSS. While at it, only call zap_install_uffd_wp_if_needed() if there is even any chance that pte_install_uffd_wp_if_needed() would do *something*. That is, just don't bother if uffd-wp does not apply. Link: https://lkml.kernel.org/r/20240214204435.167852-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/memory.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 2b1e9abbe1e0..0b4d76dcf232 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1554,12 +1554,9 @@ static inline void zap_present_pte(struct mmu_gather *tlb, folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) return; - ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); - arch_check_zapped_pte(vma, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); - zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); if (!folio_test_anon(folio)) { + ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); if (pte_dirty(ptent)) { folio_mark_dirty(folio); if (tlb_delay_rmap(tlb)) { @@ -1569,8 +1566,17 @@ static inline void zap_present_pte(struct mmu_gather *tlb, } if (pte_young(ptent) && likely(vma_has_recency(vma))) folio_mark_accessed(folio); + rss[mm_counter(folio)]--; + } else { + /* We don't need up-to-date accessed/dirty bits. */ + ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + rss[MM_ANONPAGES]--; } - rss[mm_counter(folio)]--; + arch_check_zapped_pte(vma, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + if (unlikely(userfaultfd_pte_wp(vma, ptent))) + zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); + if (!delay_rmap) { folio_remove_rmap_pte(folio, page, vma); if (unlikely(page_mapcount(page) < 0)) From 2b42a7e531509577bd822aece610cd6d0dbf0dd7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:29 +0100 Subject: [PATCH 187/435] mm/memory: factor out zapping folio pte into zap_present_folio_pte() Let's prepare for further changes by factoring it out into a separate function. Link: https://lkml.kernel.org/r/20240214204435.167852-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/memory.c | 53 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 0b4d76dcf232..168096f9360e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1530,30 +1530,14 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); } -static inline void zap_present_pte(struct mmu_gather *tlb, - struct vm_area_struct *vma, pte_t *pte, pte_t ptent, - unsigned long addr, struct zap_details *details, - int *rss, bool *force_flush, bool *force_break) +static inline void zap_present_folio_pte(struct mmu_gather *tlb, + struct vm_area_struct *vma, struct folio *folio, + struct page *page, pte_t *pte, pte_t ptent, unsigned long addr, + struct zap_details *details, int *rss, bool *force_flush, + bool *force_break) { struct mm_struct *mm = tlb->mm; bool delay_rmap = false; - struct folio *folio; - struct page *page; - - page = vm_normal_page(vma, addr, ptent); - if (!page) { - /* We don't need up-to-date accessed/dirty bits. */ - ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); - arch_check_zapped_pte(vma, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); - VM_WARN_ON_ONCE(userfaultfd_wp(vma)); - ksm_might_unmap_zero_page(mm, ptent); - return; - } - - folio = page_folio(page); - if (unlikely(!should_zap_folio(details, folio))) - return; if (!folio_test_anon(folio)) { ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); @@ -1588,6 +1572,33 @@ static inline void zap_present_pte(struct mmu_gather *tlb, } } +static inline void zap_present_pte(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, pte_t ptent, + unsigned long addr, struct zap_details *details, + int *rss, bool *force_flush, bool *force_break) +{ + struct mm_struct *mm = tlb->mm; + struct folio *folio; + struct page *page; + + page = vm_normal_page(vma, addr, ptent); + if (!page) { + /* We don't need up-to-date accessed/dirty bits. */ + ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + arch_check_zapped_pte(vma, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + VM_WARN_ON_ONCE(userfaultfd_wp(vma)); + ksm_might_unmap_zero_page(mm, ptent); + return; + } + + folio = page_folio(page); + if (unlikely(!should_zap_folio(details, folio))) + return; + zap_present_folio_pte(tlb, vma, folio, page, pte, ptent, addr, details, + rss, force_flush, force_break); +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, From c30d6bc8d0153630e600e8f67ba88c670d9e1b0c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:30 +0100 Subject: [PATCH 188/435] mm/mmu_gather: pass "delay_rmap" instead of encoded page to __tlb_remove_page_size() We have two bits available in the encoded page pointer to store additional information. Currently, we use one bit to request delay of the rmap removal until after a TLB flush. We want to make use of the remaining bit internally for batching of multiple pages of the same folio, specifying that the next encoded page pointer in an array is actually "nr_pages". So pass page + delay_rmap flag instead of an encoded page, to handle the encoding internally. Link: https://lkml.kernel.org/r/20240214204435.167852-6-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- arch/s390/include/asm/tlb.h | 13 ++++++------- include/asm-generic/tlb.h | 12 ++++++------ mm/mmu_gather.c | 7 ++++--- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index d1455a601adc..48df896d5b79 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -25,8 +25,7 @@ void __tlb_remove_table(void *_table); static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct encoded_page *page, - int page_size); + struct page *page, bool delay_rmap, int page_size); #define tlb_flush tlb_flush #define pte_free_tlb pte_free_tlb @@ -42,14 +41,14 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page * has already been freed, so just do free_page_and_swap_cache. * - * s390 doesn't delay rmap removal, so there is nothing encoded in - * the page pointer. + * s390 doesn't delay rmap removal. */ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct encoded_page *page, - int page_size) + struct page *page, bool delay_rmap, int page_size) { - free_page_and_swap_cache(encoded_page_ptr(page)); + VM_WARN_ON_ONCE(delay_rmap); + + free_page_and_swap_cache(page); return false; } diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 129a3a759976..2eb7b0d4f5d2 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -260,9 +260,8 @@ struct mmu_gather_batch { */ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) -extern bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct encoded_page *page, - int page_size); +extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, + bool delay_rmap, int page_size); #ifdef CONFIG_SMP /* @@ -462,13 +461,14 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - if (__tlb_remove_page_size(tlb, encode_page(page, 0), page_size)) + if (__tlb_remove_page_size(tlb, page, false, page_size)) tlb_flush_mmu(tlb); } -static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page, unsigned int flags) +static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, + struct page *page, bool delay_rmap) { - return __tlb_remove_page_size(tlb, encode_page(page, flags), PAGE_SIZE); + return __tlb_remove_page_size(tlb, page, delay_rmap, PAGE_SIZE); } /* tlb_remove_page diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 604ddf08affe..ac733d81b112 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -116,7 +116,8 @@ static void tlb_batch_list_free(struct mmu_gather *tlb) tlb->local.next = NULL; } -bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, int page_size) +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, + bool delay_rmap, int page_size) { struct mmu_gather_batch *batch; @@ -131,13 +132,13 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, i * Add the page and check if we are full. If so * force a flush. */ - batch->encoded_pages[batch->nr++] = page; + batch->encoded_pages[batch->nr++] = encode_page(page, delay_rmap); if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; } - VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page)); + VM_BUG_ON_PAGE(batch->nr > batch->max, page); return false; } From da510964c095cb5e070800ef38752c453d2aa71d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:31 +0100 Subject: [PATCH 189/435] mm/mmu_gather: define ENCODED_PAGE_FLAG_DELAY_RMAP Nowadays, encoded pages are only used in mmu_gather handling. Let's update the documentation, and define ENCODED_PAGE_BIT_DELAY_RMAP. While at it, rename ENCODE_PAGE_BITS to ENCODED_PAGE_BITS. If encoded page pointers would ever be used in other context again, we'd likely want to change the defines to reflect their context (e.g., ENCODED_PAGE_FLAG_MMU_GATHER_DELAY_RMAP). For now, let's keep it simple. This is a preparation for using the remaining spare bit to indicate that the next item in an array of encoded pages is a "nr_pages" argument and not an encoded page. Link: https://lkml.kernel.org/r/20240214204435.167852-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 17 +++++++++++------ mm/mmu_gather.c | 5 +++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8b611e13153e..1b89eec0d6df 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -210,8 +210,8 @@ struct page { * * An 'encoded_page' pointer is a pointer to a regular 'struct page', but * with the low bits of the pointer indicating extra context-dependent - * information. Not super-common, but happens in mmu_gather and mlock - * handling, and this acts as a type system check on that use. + * information. Only used in mmu_gather handling, and this acts as a type + * system check on that use. * * We only really have two guaranteed bits in general, although you could * play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE) @@ -220,21 +220,26 @@ struct page { * Use the supplied helper functions to endcode/decode the pointer and bits. */ struct encoded_page; -#define ENCODE_PAGE_BITS 3ul + +#define ENCODED_PAGE_BITS 3ul + +/* Perform rmap removal after we have flushed the TLB. */ +#define ENCODED_PAGE_BIT_DELAY_RMAP 1ul + static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags) { - BUILD_BUG_ON(flags > ENCODE_PAGE_BITS); + BUILD_BUG_ON(flags > ENCODED_PAGE_BITS); return (struct encoded_page *)(flags | (unsigned long)page); } static inline unsigned long encoded_page_flags(struct encoded_page *page) { - return ENCODE_PAGE_BITS & (unsigned long)page; + return ENCODED_PAGE_BITS & (unsigned long)page; } static inline struct page *encoded_page_ptr(struct encoded_page *page) { - return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page); + return (struct page *)(~ENCODED_PAGE_BITS & (unsigned long)page); } /* diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index ac733d81b112..6540c99c6758 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -53,7 +53,7 @@ static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_ for (int i = 0; i < batch->nr; i++) { struct encoded_page *enc = batch->encoded_pages[i]; - if (encoded_page_flags(enc)) { + if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) { struct page *page = encoded_page_ptr(enc); folio_remove_rmap_pte(page_folio(page), page, vma); } @@ -119,6 +119,7 @@ static void tlb_batch_list_free(struct mmu_gather *tlb) bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size) { + int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0; struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); @@ -132,7 +133,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, * Add the page and check if we are full. If so * force a flush. */ - batch->encoded_pages[batch->nr++] = encode_page(page, delay_rmap); + batch->encoded_pages[batch->nr++] = encode_page(page, flags); if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return true; From 4d5bf0b6183f79ea361dd506365d2a471270735c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:32 +0100 Subject: [PATCH 190/435] mm/mmu_gather: add tlb_remove_tlb_entries() Let's add a helper that lets us batch-process multiple consecutive PTEs. Note that the loop will get optimized out on all architectures except on powerpc. We have to add an early define of __tlb_remove_tlb_entry() on ppc to make the compiler happy (and avoid making tlb_remove_tlb_entries() a macro). [arnd@kernel.org: change __tlb_remove_tlb_entry() to an inline function] Link: https://lkml.kernel.org/r/20240221154549.2026073-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20240214204435.167852-8-david@redhat.com Signed-off-by: David Hildenbrand Signed-off-by: Arnd Bergmann Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/tlb.h | 2 ++ include/asm-generic/tlb.h | 24 +++++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index b3de6102a907..1ca7d4c4b90d 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -19,6 +19,8 @@ #include +static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, + unsigned long address); #define __tlb_remove_tlb_entry __tlb_remove_tlb_entry #define tlb_flush tlb_flush diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 2eb7b0d4f5d2..127a8230a40a 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -592,7 +592,9 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb, } #ifndef __tlb_remove_tlb_entry -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) +static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address) +{ +} #endif /** @@ -608,6 +610,26 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb, __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) +/** + * tlb_remove_tlb_entries - remember unmapping of multiple consecutive ptes for + * later tlb invalidation. + * + * Similar to tlb_remove_tlb_entry(), but remember unmapping of multiple + * consecutive ptes instead of only a single one. + */ +static inline void tlb_remove_tlb_entries(struct mmu_gather *tlb, + pte_t *ptep, unsigned int nr, unsigned long address) +{ + tlb_flush_pte_range(tlb, address, PAGE_SIZE * nr); + for (;;) { + __tlb_remove_tlb_entry(tlb, ptep, address); + if (--nr == 0) + break; + ptep++; + address += PAGE_SIZE; + } +} + #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ do { \ unsigned long _sz = huge_page_size(h); \ From d7f861b9c43aadbe384ab1382d2e76750bedc91e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:33 +0100 Subject: [PATCH 191/435] mm/mmu_gather: add __tlb_remove_folio_pages() Add __tlb_remove_folio_pages(), which will remove multiple consecutive pages that belong to the same large folio, instead of only a single page. We'll be using this function when optimizing unmapping/zapping of large folios that are mapped by PTEs. We're using the remaining spare bit in an encoded_page to indicate that the next enoced page in an array contains actually shifted "nr_pages". Teach swap/freeing code about putting multiple folio references, and delayed rmap handling to remove page ranges of a folio. This extension allows for still gathering almost as many small folios as we used to (-1, because we have to prepare for a possibly bigger next entry), but still allows for gathering consecutive pages that belong to the same large folio. Note that we don't pass the folio pointer, because it is not required for now. Further, we don't support page_size != PAGE_SIZE, it won't be required for simple PTE batching. We have to provide a separate s390 implementation, but it's fairly straight forward. Another, more invasive and likely more expensive, approach would be to use folio+range or a PFN range instead of page+nr_pages. But, we should do that consistently for the whole mmu_gather. For now, let's keep it simple and add "nr_pages" only. Note that it is now possible to gather significantly more pages: In the past, we were able to gather ~10000 pages, now we can also gather ~5000 folio fragments that span multiple pages. A folio fragment on x86-64 can span up to 512 pages (2 MiB THP) and on arm64 with 64k in theory 8192 pages (512 MiB THP). Gathering more memory is not considered something we should worry about, especially because these are already corner cases. While we can gather more total memory, we won't free more folio fragments. As long as page freeing time primarily only depends on the number of involved folios, there is no effective change for !preempt configurations. However, we'll adjust tlb_batch_pages_flush() separately to handle corner cases where page freeing time grows proportionally with the actual memory size. Link: https://lkml.kernel.org/r/20240214204435.167852-9-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- arch/s390/include/asm/tlb.h | 17 +++++++++++ include/asm-generic/tlb.h | 8 +++++ include/linux/mm_types.h | 20 ++++++++++++ mm/mmu_gather.c | 61 +++++++++++++++++++++++++++++++------ mm/swap.c | 12 ++++++-- mm/swap_state.c | 15 +++++++-- 6 files changed, 119 insertions(+), 14 deletions(-) diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 48df896d5b79..e95b2c8081eb 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -26,6 +26,8 @@ void __tlb_remove_table(void *_table); static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size); +static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb, + struct page *page, unsigned int nr_pages, bool delay_rmap); #define tlb_flush tlb_flush #define pte_free_tlb pte_free_tlb @@ -52,6 +54,21 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return false; } +static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb, + struct page *page, unsigned int nr_pages, bool delay_rmap) +{ + struct encoded_page *encoded_pages[] = { + encode_page(page, ENCODED_PAGE_BIT_NR_PAGES_NEXT), + encode_nr_pages(nr_pages), + }; + + VM_WARN_ON_ONCE(delay_rmap); + VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1)); + + free_pages_and_swap_cache(encoded_pages, ARRAY_SIZE(encoded_pages)); + return false; +} + static inline void tlb_flush(struct mmu_gather *tlb) { __tlb_flush_mm_lazy(tlb->mm); diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 127a8230a40a..709830274b75 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -69,6 +69,7 @@ * * - tlb_remove_page() / __tlb_remove_page() * - tlb_remove_page_size() / __tlb_remove_page_size() + * - __tlb_remove_folio_pages() * * __tlb_remove_page_size() is the basic primitive that queues a page for * freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a @@ -78,6 +79,11 @@ * tlb_remove_page() and tlb_remove_page_size() imply the call to * tlb_flush_mmu() when required and has no return value. * + * __tlb_remove_folio_pages() is similar to __tlb_remove_page(), however, + * instead of removing a single page, remove the given number of consecutive + * pages that are all part of the same (large) folio: just like calling + * __tlb_remove_page() on each page individually. + * * - tlb_change_page_size() * * call before __tlb_remove_page*() to set the current page-size; implies a @@ -262,6 +268,8 @@ struct mmu_gather_batch { extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size); +bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, + unsigned int nr_pages, bool delay_rmap); #ifdef CONFIG_SMP /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1b89eec0d6df..a7223ba3ea1e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -226,6 +226,15 @@ struct encoded_page; /* Perform rmap removal after we have flushed the TLB. */ #define ENCODED_PAGE_BIT_DELAY_RMAP 1ul +/* + * The next item in an encoded_page array is the "nr_pages" argument, specifying + * the number of consecutive pages starting from this page, that all belong to + * the same folio. For example, "nr_pages" corresponds to the number of folio + * references that must be dropped. If this bit is not set, "nr_pages" is + * implicitly 1. + */ +#define ENCODED_PAGE_BIT_NR_PAGES_NEXT 2ul + static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags) { BUILD_BUG_ON(flags > ENCODED_PAGE_BITS); @@ -242,6 +251,17 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) return (struct page *)(~ENCODED_PAGE_BITS & (unsigned long)page); } +static __always_inline struct encoded_page *encode_nr_pages(unsigned long nr) +{ + VM_WARN_ON_ONCE((nr << 2) >> 2 != nr); + return (struct encoded_page *)(nr << 2); +} + +static __always_inline unsigned long encoded_nr_pages(struct encoded_page *page) +{ + return ((unsigned long)page) >> 2; +} + /* * A swap entry has to fit into a "unsigned long", as the entry is hidden * in the "index" field of the swapper address space. diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 6540c99c6758..d175c0f1e2c8 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -50,12 +50,21 @@ static bool tlb_next_batch(struct mmu_gather *tlb) #ifdef CONFIG_SMP static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma) { + struct encoded_page **pages = batch->encoded_pages; + for (int i = 0; i < batch->nr; i++) { - struct encoded_page *enc = batch->encoded_pages[i]; + struct encoded_page *enc = pages[i]; if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) { struct page *page = encoded_page_ptr(enc); - folio_remove_rmap_pte(page_folio(page), page, vma); + unsigned int nr_pages = 1; + + if (unlikely(encoded_page_flags(enc) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + nr_pages = encoded_nr_pages(pages[++i]); + + folio_remove_rmap_ptes(page_folio(page), page, nr_pages, + vma); } } } @@ -89,18 +98,26 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb) for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { struct encoded_page **pages = batch->encoded_pages; - do { + while (batch->nr) { /* * limit free batch count when PAGE_SIZE > 4K */ unsigned int nr = min(512U, batch->nr); + /* + * Make sure we cover page + nr_pages, and don't leave + * nr_pages behind when capping the number of entries. + */ + if (unlikely(encoded_page_flags(pages[nr - 1]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + nr++; + free_pages_and_swap_cache(pages, nr); pages += nr; batch->nr -= nr; cond_resched(); - } while (batch->nr); + } } tlb->active = &tlb->local; } @@ -116,8 +133,9 @@ static void tlb_batch_list_free(struct mmu_gather *tlb) tlb->local.next = NULL; } -bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, - bool delay_rmap, int page_size) +static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb, + struct page *page, unsigned int nr_pages, bool delay_rmap, + int page_size) { int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0; struct mmu_gather_batch *batch; @@ -126,6 +144,8 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, #ifdef CONFIG_MMU_GATHER_PAGE_SIZE VM_WARN_ON(tlb->page_size != page_size); + VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE); + VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1)); #endif batch = tlb->active; @@ -133,17 +153,40 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, * Add the page and check if we are full. If so * force a flush. */ - batch->encoded_pages[batch->nr++] = encode_page(page, flags); - if (batch->nr == batch->max) { + if (likely(nr_pages == 1)) { + batch->encoded_pages[batch->nr++] = encode_page(page, flags); + } else { + flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT; + batch->encoded_pages[batch->nr++] = encode_page(page, flags); + batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages); + } + /* + * Make sure that we can always add another "page" + "nr_pages", + * requiring two entries instead of only a single one. + */ + if (batch->nr >= batch->max - 1) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; } - VM_BUG_ON_PAGE(batch->nr > batch->max, page); + VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page); return false; } +bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, + unsigned int nr_pages, bool delay_rmap) +{ + return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap, + PAGE_SIZE); +} + +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, + bool delay_rmap, int page_size) +{ + return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size); +} + #endif /* MMU_GATHER_NO_GATHER */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE diff --git a/mm/swap.c b/mm/swap.c index cd8f0150ba3a..e5380d732c0d 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -967,11 +967,17 @@ void release_pages(release_pages_arg arg, int nr) unsigned int lock_batch; for (i = 0; i < nr; i++) { + unsigned int nr_refs = 1; struct folio *folio; /* Turn any of the argument types into a folio */ folio = page_folio(encoded_page_ptr(encoded[i])); + /* Is our next entry actually "nr_pages" -> "nr_refs" ? */ + if (unlikely(encoded_page_flags(encoded[i]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + nr_refs = encoded_nr_pages(encoded[++i]); + /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the @@ -990,14 +996,14 @@ void release_pages(release_pages_arg arg, int nr) unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - if (put_devmap_managed_page(&folio->page)) + if (put_devmap_managed_page_refs(&folio->page, nr_refs)) continue; - if (folio_put_testzero(folio)) + if (folio_ref_sub_and_test(folio, nr_refs)) free_zone_device_page(&folio->page); continue; } - if (!folio_put_testzero(folio)) + if (!folio_ref_sub_and_test(folio, nr_refs)) continue; if (folio_test_large(folio)) { diff --git a/mm/swap_state.c b/mm/swap_state.c index 7255c01a1e4e..2f540748f7c0 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -311,8 +311,19 @@ void free_page_and_swap_cache(struct page *page) void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { lru_add_drain(); - for (int i = 0; i < nr; i++) - free_swap_cache(encoded_page_ptr(pages[i])); + for (int i = 0; i < nr; i++) { + struct page *page = encoded_page_ptr(pages[i]); + + /* + * Skip over the "nr_pages" entry. It's sufficient to call + * free_swap_cache() only once per folio. + */ + if (unlikely(encoded_page_flags(pages[i]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + i++; + + free_swap_cache(page); + } release_pages(pages, nr); } From e61abd4490684de379b4a2ef1be2dbde39ac1ced Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:34 +0100 Subject: [PATCH 192/435] mm/mmu_gather: improve cond_resched() handling with large folios and expensive page freeing In tlb_batch_pages_flush(), we can end up freeing up to 512 pages or now up to 256 folio fragments that span more than one page, before we conditionally reschedule. It's a pain that we have to handle cond_resched() in tlb_batch_pages_flush() manually and cannot simply handle it in release_pages() -- release_pages() can be called from atomic context. Well, in a perfect world we wouldn't have to make our code more complicated at all. With page poisoning and init_on_free, we might now run into soft lockups when we free a lot of rather large folio fragments, because page freeing time then depends on the actual memory size we are freeing instead of on the number of folios that are involved. In the absolute (unlikely) worst case, on arm64 with 64k we will be able to free up to 256 folio fragments that each span 512 MiB: zeroing out 128 GiB does sound like it might take a while. But instead of ignoring this unlikely case, let's just handle it. So, let's teach tlb_batch_pages_flush() that there are some configurations where page freeing is horribly slow, and let's reschedule more frequently -- similarly like we did for now before we had large folio fragments in there. Avoid yet another loop over all encoded pages in the common case by handling that separately. Note that with page poisoning/zeroing, we might now end up freeing only a single folio fragment at a time that might exceed the old 512 pages limit: but if we cannot even free a single MAX_ORDER page on a system without running into soft lockups, something else is already completely bogus. Freeing a PMD-mapped THP would similarly cause trouble. In theory, we might even free 511 order-0 pages + a single MAX_ORDER page, effectively having to zero out 8703 pages on arm64 with 64k, translating to ~544 MiB of memory: however, if 512 MiB doesn't result in soft lockups, 544 MiB is unlikely to result in soft lockups, so we won't care about that for the time being. In the future, we might want to detect if handling cond_resched() is required at all, and just not do any of that with full preemption enabled. Link: https://lkml.kernel.org/r/20240214204435.167852-10-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/mmu_gather.c | 60 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 44 insertions(+), 16 deletions(-) diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index d175c0f1e2c8..99b3e9408aa0 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -91,18 +91,21 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) } #endif -static void tlb_batch_pages_flush(struct mmu_gather *tlb) +/* + * We might end up freeing a lot of pages. Reschedule on a regular + * basis to avoid soft lockups in configurations without full + * preemption enabled. The magic number of 512 folios seems to work. + */ +#define MAX_NR_FOLIOS_PER_FREE 512 + +static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch) { - struct mmu_gather_batch *batch; + struct encoded_page **pages = batch->encoded_pages; + unsigned int nr, nr_pages; - for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { - struct encoded_page **pages = batch->encoded_pages; - - while (batch->nr) { - /* - * limit free batch count when PAGE_SIZE > 4K - */ - unsigned int nr = min(512U, batch->nr); + while (batch->nr) { + if (!page_poisoning_enabled_static() && !want_init_on_free()) { + nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr); /* * Make sure we cover page + nr_pages, and don't leave @@ -111,14 +114,39 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb) if (unlikely(encoded_page_flags(pages[nr - 1]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) nr++; - - free_pages_and_swap_cache(pages, nr); - pages += nr; - batch->nr -= nr; - - cond_resched(); + } else { + /* + * With page poisoning and init_on_free, the time it + * takes to free memory grows proportionally with the + * actual memory size. Therefore, limit based on the + * actual memory size and not the number of involved + * folios. + */ + for (nr = 0, nr_pages = 0; + nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE; + nr++) { + if (unlikely(encoded_page_flags(pages[nr]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + nr_pages += encoded_nr_pages(pages[++nr]); + else + nr_pages++; + } } + + free_pages_and_swap_cache(pages, nr); + pages += nr; + batch->nr -= nr; + + cond_resched(); } +} + +static void tlb_batch_pages_flush(struct mmu_gather *tlb) +{ + struct mmu_gather_batch *batch; + + for (batch = &tlb->local; batch && batch->nr; batch = batch->next) + __tlb_batch_free_encoded_pages(batch); tlb->active = &tlb->local; } From 10ebac4f95e7a9951c453d6c66d9beb5a35db338 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:35 +0100 Subject: [PATCH 193/435] mm/memory: optimize unmap/zap with PTE-mapped THP Similar to how we optimized fork(), let's implement PTE batching when consecutive (present) PTEs map consecutive pages of the same large folio. Most infrastructure we need for batching (mmu gather, rmap) is already there. We only have to add get_and_clear_full_ptes() and clear_full_ptes(). Similarly, extend zap_install_uffd_wp_if_needed() to process a PTE range. We won't bother sanity-checking the mapcount of all subpages, but only check the mapcount of the first subpage we process. If there is a real problem hiding somewhere, we can trigger it simply by using small folios, or when we zap single pages of a large folio. Ideally, we had that check in rmap code (including for delayed rmap), but then we cannot print the PTE. Let's keep it simple for now. If we ever have a cheap folio_mapcount(), we might just want to check for underflows there. To keep small folios as fast as possible force inlining of a specialized variant using __always_inline with nr=1. Link: https://lkml.kernel.org/r/20240214204435.167852-11-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 70 +++++++++++++++++++++++++++++++ mm/memory.c | 92 +++++++++++++++++++++++++++++------------ 2 files changed, 136 insertions(+), 26 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index aab227e12493..49ab1f73b5c2 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -580,6 +580,76 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, } #endif +#ifndef get_and_clear_full_ptes +/** + * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of + * the same folio, collecting dirty/accessed bits. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * @full: Whether we are clearing a full mm. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the + * returned PTE. + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned int nr, int full) +{ + pte_t pte, tmp_pte; + + pte = ptep_get_and_clear_full(mm, addr, ptep, full); + while (--nr) { + ptep++; + addr += PAGE_SIZE; + tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full); + if (pte_dirty(tmp_pte)) + pte = pte_mkdirty(pte); + if (pte_young(tmp_pte)) + pte = pte_mkyoung(pte); + } + return pte; +} +#endif + +#ifndef clear_full_ptes +/** + * clear_full_ptes - Clear present PTEs that map consecutive pages of the same + * folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * @full: Whether we are clearing a full mm. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_get_and_clear_full(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + for (;;) { + ptep_get_and_clear_full(mm, addr, ptep, full); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} +#endif /* * If two threads concurrently fault at the same page, the thread that diff --git a/mm/memory.c b/mm/memory.c index 168096f9360e..465ada39c2b7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1517,7 +1517,7 @@ static inline bool zap_drop_file_uffd_wp(struct zap_details *details) */ static inline void zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, - unsigned long addr, pte_t *pte, + unsigned long addr, pte_t *pte, int nr, struct zap_details *details, pte_t pteval) { /* Zap on anonymous always means dropping everything */ @@ -1527,20 +1527,27 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, if (zap_drop_file_uffd_wp(details)) return; - pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); + for (;;) { + /* the PFN in the PTE is irrelevant. */ + pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); + if (--nr == 0) + break; + pte++; + addr += PAGE_SIZE; + } } -static inline void zap_present_folio_pte(struct mmu_gather *tlb, +static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio, - struct page *page, pte_t *pte, pte_t ptent, unsigned long addr, - struct zap_details *details, int *rss, bool *force_flush, - bool *force_break) + struct page *page, pte_t *pte, pte_t ptent, unsigned int nr, + unsigned long addr, struct zap_details *details, int *rss, + bool *force_flush, bool *force_break) { struct mm_struct *mm = tlb->mm; bool delay_rmap = false; if (!folio_test_anon(folio)) { - ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); if (pte_dirty(ptent)) { folio_mark_dirty(folio); if (tlb_delay_rmap(tlb)) { @@ -1550,36 +1557,49 @@ static inline void zap_present_folio_pte(struct mmu_gather *tlb, } if (pte_young(ptent) && likely(vma_has_recency(vma))) folio_mark_accessed(folio); - rss[mm_counter(folio)]--; + rss[mm_counter(folio)] -= nr; } else { /* We don't need up-to-date accessed/dirty bits. */ - ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); - rss[MM_ANONPAGES]--; + clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); + rss[MM_ANONPAGES] -= nr; } + /* Checking a single PTE in a batch is sufficient. */ arch_check_zapped_pte(vma, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); + tlb_remove_tlb_entries(tlb, pte, nr, addr); if (unlikely(userfaultfd_pte_wp(vma, ptent))) - zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); + zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, + ptent); if (!delay_rmap) { - folio_remove_rmap_pte(folio, page, vma); + folio_remove_rmap_ptes(folio, page, nr, vma); + + /* Only sanity-check the first page in a batch. */ if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); } - if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) { + if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) { *force_flush = true; *force_break = true; } } -static inline void zap_present_pte(struct mmu_gather *tlb, +/* + * Zap or skip at least one present PTE, trying to batch-process subsequent + * PTEs that map consecutive pages of the same folio. + * + * Returns the number of processed (skipped or zapped) PTEs (at least 1). + */ +static inline int zap_present_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, - unsigned long addr, struct zap_details *details, - int *rss, bool *force_flush, bool *force_break) + unsigned int max_nr, unsigned long addr, + struct zap_details *details, int *rss, bool *force_flush, + bool *force_break) { + const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; struct folio *folio; struct page *page; + int nr; page = vm_normal_page(vma, addr, ptent); if (!page) { @@ -1589,14 +1609,29 @@ static inline void zap_present_pte(struct mmu_gather *tlb, tlb_remove_tlb_entry(tlb, pte, addr); VM_WARN_ON_ONCE(userfaultfd_wp(vma)); ksm_might_unmap_zero_page(mm, ptent); - return; + return 1; } folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) - return; - zap_present_folio_pte(tlb, vma, folio, page, pte, ptent, addr, details, - rss, force_flush, force_break); + return 1; + + /* + * Make sure that the common "small folio" case is as fast as possible + * by keeping the batching logic separate. + */ + if (unlikely(folio_test_large(folio) && max_nr != 1)) { + nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, + NULL); + + zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, + addr, details, rss, force_flush, + force_break); + return nr; + } + zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr, + details, rss, force_flush, force_break); + return 1; } static unsigned long zap_pte_range(struct mmu_gather *tlb, @@ -1611,6 +1646,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t *start_pte; pte_t *pte; swp_entry_t entry; + int nr; tlb_change_page_size(tlb, PAGE_SIZE); init_rss_vec(rss); @@ -1624,7 +1660,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t ptent = ptep_get(pte); struct folio *folio; struct page *page; + int max_nr; + nr = 1; if (pte_none(ptent)) continue; @@ -1632,10 +1670,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, break; if (pte_present(ptent)) { - zap_present_pte(tlb, vma, pte, ptent, addr, details, - rss, &force_flush, &force_break); + max_nr = (end - addr) / PAGE_SIZE; + nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr, + addr, details, rss, &force_flush, + &force_break); if (unlikely(force_break)) { - addr += PAGE_SIZE; + addr += nr * PAGE_SIZE; break; } continue; @@ -1689,8 +1729,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, WARN_ON_ONCE(1); } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); - zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); - } while (pte++, addr += PAGE_SIZE, addr != end); + zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent); + } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); From 6280d7317ccae19c776a3b6cf9848c964f958091 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:48 +0000 Subject: [PATCH 194/435] mm: clarify the spec for set_ptes() Patch series "Transparent Contiguous PTEs for User Mappings", v6. This is a series to opportunistically and transparently use contpte mappings (set the contiguous bit in ptes) for user memory when those mappings meet the requirements. The change benefits arm64, but there is some (very) minor refactoring for x86 to enable its integration with core-mm. It is part of a wider effort to improve performance by allocating and mapping variable-sized blocks of memory (folios). One aim is for the 4K kernel to approach the performance of the 16K kernel, but without breaking compatibility and without the associated increase in memory. Another aim is to benefit the 16K and 64K kernels by enabling 2M THP, since this is the contpte size for those kernels. We have good performance data that demonstrates both aims are being met (see below). Of course this is only one half of the change. We require the mapped physical memory to be the correct size and alignment for this to actually be useful (i.e. 64K for 4K pages, or 2M for 16K/64K pages). Fortunately folios are solving this problem for us. Filesystems that support it (XFS, AFS, EROFS, tmpfs, ...) will allocate large folios up to the PMD size today, and more filesystems are coming. And for anonymous memory, "multi-size THP" is now upstream. Patch Layout ============ In this version, I've split the patches to better show each optimization: - 1-2: mm prep: misc code and docs cleanups - 3-6: mm,arm64,x86 prep: Add pte_advance_pfn() and make pte_next_pfn() a generic wrapper around it - 7-11: arm64 prep: Refactor ptep helpers into new layer - 12: functional contpte implementation - 23-18: various optimizations on top of the contpte implementation Testing ======= I've tested this series on both Ampere Altra (bare metal) and Apple M2 (VM): - mm selftests (inc new tests written for multi-size THP); no regressions - Speedometer Java script benchmark in Chromium web browser; no issues - Kernel compilation; no issues - Various tests under high memory pressure with swap enabled; no issues Performance =========== High Level Use Cases ~~~~~~~~~~~~~~~~~~~~ First some high level use cases (kernel compilation and speedometer JavaScript benchmarks). These are running on Ampere Altra (I've seen similar improvements on Android/Pixel 6). baseline: mm-unstable (mTHP switched off) mTHP: + enable 16K, 32K, 64K mTHP sizes "always" mTHP + contpte: + this series mTHP + contpte + exefolio: + patch at [6], which series supports Kernel Compilation with -j8 (negative is faster): | kernel | real-time | kern-time | user-time | |---------------------------|-----------|-----------|-----------| | baseline | 0.0% | 0.0% | 0.0% | | mTHP | -5.0% | -39.1% | -0.7% | | mTHP + contpte | -6.0% | -41.4% | -1.5% | | mTHP + contpte + exefolio | -7.8% | -43.1% | -3.4% | Kernel Compilation with -j80 (negative is faster): | kernel | real-time | kern-time | user-time | |---------------------------|-----------|-----------|-----------| | baseline | 0.0% | 0.0% | 0.0% | | mTHP | -5.0% | -36.6% | -0.6% | | mTHP + contpte | -6.1% | -38.2% | -1.6% | | mTHP + contpte + exefolio | -7.4% | -39.2% | -3.2% | Speedometer (positive is faster): | kernel | runs_per_min | |:--------------------------|--------------| | baseline | 0.0% | | mTHP | 1.5% | | mTHP + contpte | 3.2% | | mTHP + contpte + exefolio | 4.5% | Micro Benchmarks ~~~~~~~~~~~~~~~~ The following microbenchmarks are intended to demonstrate the performance of fork() and munmap() do not regress. I'm showing results for order-0 (4K) mappings, and for order-9 (2M) PTE-mapped THP. Thanks to David for sharing his benchmarks. baseline: mm-unstable + batch zap [7] series contpte-basic: + patches 0-19; functional contpte implementation contpte-batch: + patches 20-23; implement new batched APIs contpte-inline: + patch 24; __always_inline to help compiler contpte-fold: + patch 25; fold contpte mapping when sensible Primary platform is Ampere Altra bare metal. I'm also showing results for M2 VM (on top of MacOS) for reference, although experience suggests this might not be the most reliable for performance numbers of this sort: | FORK | order-0 | order-9 | | Ampere Altra |------------------------|------------------------| | (pte-map) | mean | stdev | mean | stdev | |----------------|------------|-----------|------------|-----------| | baseline | 0.0% | 2.7% | 0.0% | 0.2% | | contpte-basic | 6.3% | 1.4% | 1948.7% | 0.2% | | contpte-batch | 7.6% | 2.0% | -1.9% | 0.4% | | contpte-inline | 3.6% | 1.5% | -1.0% | 0.2% | | contpte-fold | 4.6% | 2.1% | -1.8% | 0.2% | | MUNMAP | order-0 | order-9 | | Ampere Altra |------------------------|------------------------| | (pte-map) | mean | stdev | mean | stdev | |----------------|------------|-----------|------------|-----------| | baseline | 0.0% | 0.5% | 0.0% | 0.3% | | contpte-basic | 1.8% | 0.3% | 1104.8% | 0.1% | | contpte-batch | -0.3% | 0.4% | 2.7% | 0.1% | | contpte-inline | -0.1% | 0.6% | 0.9% | 0.1% | | contpte-fold | 0.1% | 0.6% | 0.8% | 0.1% | | FORK | order-0 | order-9 | | Apple M2 VM |------------------------|------------------------| | (pte-map) | mean | stdev | mean | stdev | |----------------|------------|-----------|------------|-----------| | baseline | 0.0% | 1.4% | 0.0% | 0.8% | | contpte-basic | 6.8% | 1.2% | 469.4% | 1.4% | | contpte-batch | -7.7% | 2.0% | -8.9% | 0.7% | | contpte-inline | -6.0% | 2.1% | -6.0% | 2.0% | | contpte-fold | 5.9% | 1.4% | -6.4% | 1.4% | | MUNMAP | order-0 | order-9 | | Apple M2 VM |------------------------|------------------------| | (pte-map) | mean | stdev | mean | stdev | |----------------|------------|-----------|------------|-----------| | baseline | 0.0% | 0.6% | 0.0% | 0.4% | | contpte-basic | 1.6% | 0.6% | 233.6% | 0.7% | | contpte-batch | 1.9% | 0.3% | -3.9% | 0.4% | | contpte-inline | 2.2% | 0.8% | -1.6% | 0.9% | | contpte-fold | 1.5% | 0.7% | -1.7% | 0.7% | Misc ~~~~ John Hubbard at Nvidia has indicated dramatic 10x performance improvements for some workloads at [8], when using 64K base page kernel. [1] https://lore.kernel.org/linux-arm-kernel/20230622144210.2623299-1-ryan.roberts@arm.com/ [2] https://lore.kernel.org/linux-arm-kernel/20231115163018.1303287-1-ryan.roberts@arm.com/ [3] https://lore.kernel.org/linux-arm-kernel/20231204105440.61448-1-ryan.roberts@arm.com/ [4] https://lore.kernel.org/lkml/20231218105100.172635-1-ryan.roberts@arm.com/ [5] https://lore.kernel.org/linux-mm/633af0a7-0823-424f-b6ef-374d99483f05@arm.com/ [6] https://lore.kernel.org/lkml/08c16f7d-f3b3-4f22-9acc-da943f647dc3@arm.com/ [7] https://lore.kernel.org/linux-mm/20240214204435.167852-1-david@redhat.com/ [8] https://lore.kernel.org/linux-mm/c507308d-bdd4-5f9e-d4ff-e96e4520be85@nvidia.com/ [9] https://gitlab.arm.com/linux-arm/linux-rr/-/tree/features/granule_perf/contpte-lkml_v6 This patch (of 18): set_ptes() spec implies that it can only be used to set a present pte because it interprets the PFN field to increment it. However, set_pte_at() has been implemented on top of set_ptes() since set_ptes() was introduced, and set_pte_at() allows setting a pte to a not-present state. So clarify the spec to state that when nr==1, new state of pte may be present or not present. When nr>1, new state of all ptes must be present. While we are at it, tighten the spec to set requirements around the initial state of ptes; when nr==1 it may be either present or not-present. But when nr>1 all ptes must initially be not-present. All set_ptes() callsites already conform to this requirement. Stating it explicitly is useful because it allows for a simplification to the upcoming arm64 contpte implementation. Link: https://lkml.kernel.org/r/20240215103205.2607016-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240215103205.2607016-2-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 49ab1f73b5c2..231370e1b80f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -229,6 +229,10 @@ static inline pte_t pte_next_pfn(pte_t pte) * @pte: Page table entry for the first page. * @nr: Number of pages to map. * + * When nr==1, initial state of pte may be present or not present, and new state + * may be present or not present. When nr>1, initial state of all ptes must be + * not present, and new state must be present. + * * May be overridden by the architecture, or the architecture can define * set_pte() and PFN_PTE_SHIFT. * From 2bdba9868a4ffcb1492db7272f34b54387910177 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:49 +0000 Subject: [PATCH 195/435] mm: thp: batch-collapse PMD with set_ptes() Refactor __split_huge_pmd_locked() so that a present PMD can be collapsed to PTEs in a single batch using set_ptes(). This should improve performance a little bit, but the real motivation is to remove the need for the arm64 backend to have to fold the contpte entries. Instead, since the ptes are set as a batch, the contpte blocks can be initially set up pre-folded (once the arm64 contpte support is added in the next few patches). This leads to noticeable performance improvement during split. Link: https://lkml.kernel.org/r/20240215103205.2607016-3-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 58 +++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 016e20bd813e..14888b15121e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2579,15 +2579,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte); - for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { - pte_t entry; - /* - * Note that NUMA hinting access restrictions are not - * transferred to avoid any possibility of altering - * permissions across VMAs. - */ - if (freeze || pmd_migration) { + + /* + * Note that NUMA hinting access restrictions are not transferred to + * avoid any possibility of altering permissions across VMAs. + */ + if (freeze || pmd_migration) { + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { + pte_t entry; swp_entry_t swp_entry; + if (write) swp_entry = make_writable_migration_entry( page_to_pfn(page + i)); @@ -2606,25 +2607,32 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_swp_mksoft_dirty(entry); if (uffd_wp) entry = pte_swp_mkuffd_wp(entry); - } else { - entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); - if (write) - entry = pte_mkwrite(entry, vma); - if (!young) - entry = pte_mkold(entry); - /* NOTE: this may set soft-dirty too on some archs */ - if (dirty) - entry = pte_mkdirty(entry); - if (soft_dirty) - entry = pte_mksoft_dirty(entry); - if (uffd_wp) - entry = pte_mkuffd_wp(entry); + + VM_WARN_ON(!pte_none(ptep_get(pte + i))); + set_pte_at(mm, addr, pte + i, entry); } - VM_BUG_ON(!pte_none(ptep_get(pte))); - set_pte_at(mm, addr, pte, entry); - pte++; + } else { + pte_t entry; + + entry = mk_pte(page, READ_ONCE(vma->vm_page_prot)); + if (write) + entry = pte_mkwrite(entry, vma); + if (!young) + entry = pte_mkold(entry); + /* NOTE: this may set soft-dirty too on some archs */ + if (dirty) + entry = pte_mkdirty(entry); + if (soft_dirty) + entry = pte_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_mkuffd_wp(entry); + + for (i = 0; i < HPAGE_PMD_NR; i++) + VM_WARN_ON(!pte_none(ptep_get(pte + i))); + + set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR); } - pte_unmap(pte - 1); + pte_unmap(pte); if (!pmd_migration) folio_remove_rmap_pmd(folio, page, vma); From 583ceaaa339960e673ac0029f323bb1c6ffc96d7 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:50 +0000 Subject: [PATCH 196/435] mm: introduce pte_advance_pfn() and use for pte_next_pfn() The goal is to be able to advance a PTE by an arbitrary number of PFNs. So introduce a new API that takes a nr param. Define the default implementation here and allow for architectures to override. pte_next_pfn() becomes a wrapper around pte_advance_pfn(). Follow up commits will convert each overriding architecture's pte_next_pfn() to pte_advance_pfn(). Link: https://lkml.kernel.org/r/20240215103205.2607016-4-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 231370e1b80f..b7ac8358f2aa 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,14 +212,17 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif - #ifndef pte_next_pfn -static inline pte_t pte_next_pfn(pte_t pte) +#ifndef pte_advance_pfn +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { - return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } #endif +#define pte_next_pfn(pte) pte_advance_pfn(pte, 1) +#endif + #ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. From c1bd2b4028ae5b4d2ada64b31c40cc44cdf00972 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:51 +0000 Subject: [PATCH 197/435] arm64/mm: convert pte_next_pfn() to pte_advance_pfn() Core-mm needs to be able to advance the pfn by an arbitrary amount, so override the new pte_advance_pfn() API to do so. Link: https://lkml.kernel.org/r/20240215103205.2607016-5-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 52d0b0a763f1..b6d3e9e0a946 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -351,10 +351,10 @@ static inline pgprot_t pte_pgprot(pte_t pte) return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); } -#define pte_next_pfn pte_next_pfn -static inline pte_t pte_next_pfn(pte_t pte) +#define pte_advance_pfn pte_advance_pfn +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { - return pfn_pte(pte_pfn(pte) + 1, pte_pgprot(pte)); + return pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte)); } static inline void set_ptes(struct mm_struct *mm, @@ -370,7 +370,7 @@ static inline void set_ptes(struct mm_struct *mm, if (--nr == 0) break; ptep++; - pte = pte_next_pfn(pte); + pte = pte_advance_pfn(pte, 1); } } #define set_ptes set_ptes From 506b586769ecef8c83fff64de227e7fa84b7be42 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:52 +0000 Subject: [PATCH 198/435] x86/mm: convert pte_next_pfn() to pte_advance_pfn() Core-mm needs to be able to advance the pfn by an arbitrary amount, so override the new pte_advance_pfn() API to do so. Link: https://lkml.kernel.org/r/20240215103205.2607016-6-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index b50b2ef63672..69ed0ea0641b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -955,13 +955,13 @@ static inline int pte_same(pte_t a, pte_t b) return a.pte == b.pte; } -static inline pte_t pte_next_pfn(pte_t pte) +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { if (__pte_needs_invert(pte_val(pte))) - return __pte(pte_val(pte) - (1UL << PFN_PTE_SHIFT)); - return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) - (nr << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } -#define pte_next_pfn pte_next_pfn +#define pte_advance_pfn pte_advance_pfn static inline int pte_present(pte_t a) { From fb23bf6bd288db3187c27b971e558a3e9f70ae96 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:53 +0000 Subject: [PATCH 199/435] mm: tidy up pte_next_pfn() definition Now that the all architecture overrides of pte_next_pfn() have been replaced with pte_advance_pfn(), we can simplify the definition of the generic pte_next_pfn() macro so that it is unconditionally defined. Link: https://lkml.kernel.org/r/20240215103205.2607016-7-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index b7ac8358f2aa..bc005d84f764 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,7 +212,6 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif -#ifndef pte_next_pfn #ifndef pte_advance_pfn static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { @@ -221,7 +220,6 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) #endif #define pte_next_pfn(pte) pte_advance_pfn(pte, 1) -#endif #ifndef set_ptes /** From 532736558e8ef2865eae1d84b52dda4422cac810 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:54 +0000 Subject: [PATCH 200/435] arm64/mm: convert READ_ONCE(*ptep) to ptep_get(ptep) There are a number of places in the arch code that read a pte by using the READ_ONCE() macro. Refactor these call sites to instead use the ptep_get() helper, which itself is a READ_ONCE(). Generated code should be the same. This will benefit us when we shortly introduce the transparent contpte support. In this case, ptep_get() will become more complex so we now have all the code abstracted through it. Link: https://lkml.kernel.org/r/20240215103205.2607016-8-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 12 +++++++++--- arch/arm64/kernel/efi.c | 2 +- arch/arm64/mm/fault.c | 4 ++-- arch/arm64/mm/hugetlbpage.c | 6 +++--- arch/arm64/mm/kasan_init.c | 2 +- arch/arm64/mm/mmu.c | 12 ++++++------ arch/arm64/mm/pageattr.c | 4 ++-- arch/arm64/mm/trans_pgd.c | 2 +- 8 files changed, 25 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b6d3e9e0a946..de034ca40bad 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -275,6 +275,12 @@ static inline void set_pte(pte_t *ptep, pte_t pte) } } +#define ptep_get ptep_get +static inline pte_t ptep_get(pte_t *ptep) +{ + return READ_ONCE(*ptep); +} + extern void __sync_icache_dcache(pte_t pteval); bool pgattr_change_is_safe(u64 old, u64 new); @@ -302,7 +308,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep, if (!IS_ENABLED(CONFIG_DEBUG_VM)) return; - old_pte = READ_ONCE(*ptep); + old_pte = ptep_get(ptep); if (!pte_valid(old_pte) || !pte_valid(pte)) return; @@ -904,7 +910,7 @@ static inline int __ptep_test_and_clear_young(pte_t *ptep) { pte_t old_pte, pte; - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); do { old_pte = pte; pte = pte_mkold(pte); @@ -986,7 +992,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres { pte_t old_pte, pte; - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); do { old_pte = pte; pte = pte_wrprotect(pte); diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 0228001347be..d0e08e93b246 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -103,7 +103,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { struct set_perm_data *spd = data; const efi_memory_desc_t *md = spd->md; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); if (md->attribute & EFI_MEMORY_RO) pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 55f6455a8284..a254761fa1bd 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -191,7 +191,7 @@ static void show_pte(unsigned long addr) if (!ptep) break; - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); pr_cont(", pte=%016llx", pte_val(pte)); pte_unmap(ptep); } while(0); @@ -214,7 +214,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, pte_t entry, int dirty) { pteval_t old_pteval, pteval; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); if (pte_same(pte, entry)) return 0; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 6720ec8d50e7..2892f925ed66 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -485,7 +485,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, size_t pgsize; pte_t pte; - if (!pte_cont(READ_ONCE(*ptep))) { + if (!pte_cont(ptep_get(ptep))) { ptep_set_wrprotect(mm, addr, ptep); return; } @@ -510,7 +510,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, size_t pgsize; int ncontig; - if (!pte_cont(READ_ONCE(*ptep))) + if (!pte_cont(ptep_get(ptep))) return ptep_clear_flush(vma, addr, ptep); ncontig = find_num_contig(mm, addr, ptep, &pgsize); @@ -543,7 +543,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. */ - if (pte_user_exec(READ_ONCE(*ptep))) + if (pte_user_exec(ptep_get(ptep))) return huge_ptep_clear_flush(vma, addr, ptep); } return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 4c7ad574b946..c2a9f4f6c7dd 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -113,7 +113,7 @@ static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, memset(__va(page_phys), KASAN_SHADOW_INIT, PAGE_SIZE); next = addr + PAGE_SIZE; set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); - } while (ptep++, addr = next, addr != end && pte_none(READ_ONCE(*ptep))); + } while (ptep++, addr = next, addr != end && pte_none(ptep_get(ptep))); } static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr, diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 3a27d887f7dd..343629a17042 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -173,7 +173,7 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, ptep = pte_set_fixmap_offset(pmdp, addr); do { - pte_t old_pte = READ_ONCE(*ptep); + pte_t old_pte = ptep_get(ptep); set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); @@ -182,7 +182,7 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, * only allow updates to the permission attributes. */ BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), - READ_ONCE(pte_val(*ptep)))); + pte_val(ptep_get(ptep)))); phys += PAGE_SIZE; } while (ptep++, addr += PAGE_SIZE, addr != end); @@ -852,7 +852,7 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); if (pte_none(pte)) continue; @@ -985,7 +985,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); /* * This is just a sanity check here which verifies that @@ -1004,7 +1004,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, */ ptep = pte_offset_kernel(pmdp, 0UL); for (i = 0; i < PTRS_PER_PTE; i++) { - if (!pte_none(READ_ONCE(ptep[i]))) + if (!pte_none(ptep_get(&ptep[i]))) return; } @@ -1473,7 +1473,7 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. */ - if (pte_user_exec(READ_ONCE(*ptep))) + if (pte_user_exec(ptep_get(ptep))) return ptep_clear_flush(vma, addr, ptep); } return ptep_get_and_clear(vma->vm_mm, addr, ptep); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 924843f1f661..73a5e8f82586 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -36,7 +36,7 @@ bool can_set_direct_map(void) static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); pte = clear_pte_bit(pte, cdata->clear_mask); pte = set_pte_bit(pte, cdata->set_mask); @@ -245,5 +245,5 @@ bool kernel_page_present(struct page *page) return true; ptep = pte_offset_kernel(pmdp, addr); - return pte_valid(READ_ONCE(*ptep)); + return pte_valid(ptep_get(ptep)); } diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 7b14df3c6477..f71ab4704cce 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -33,7 +33,7 @@ static void *trans_alloc(struct trans_pgd_info *info) static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) { - pte_t pte = READ_ONCE(*src_ptep); + pte_t pte = ptep_get(src_ptep); if (pte_valid(pte)) { /* From 659e193027910a5d3083e34b488ab459d2ef5082 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:55 +0000 Subject: [PATCH 201/435] arm64/mm: convert set_pte_at() to set_ptes(..., 1) Since set_ptes() was introduced, set_pte_at() has been implemented as a generic macro around set_ptes(..., 1). So this change should continue to generate the same code. However, making this change prepares us for the transparent contpte support. It means we can reroute set_ptes() to __set_ptes(). Since set_pte_at() is a generic macro, there will be no equivalent __set_pte_at() to reroute to. Note that a couple of calls to set_pte_at() remain in the arch code. This is intentional, since those call sites are acting on behalf of core-mm and should continue to call into the public set_ptes() rather than the arch-private __set_ptes(). Link: https://lkml.kernel.org/r/20240215103205.2607016-9-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/arm64/kernel/mte.c | 2 +- arch/arm64/kvm/guest.c | 2 +- arch/arm64/mm/fault.c | 2 +- arch/arm64/mm/hugetlbpage.c | 10 +++++----- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index de034ca40bad..9a2df85eb493 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1084,7 +1084,7 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) #endif /* CONFIG_ARM64_MTE */ /* - * On AArch64, the cache coherency is handled via the set_pte_at() function. + * On AArch64, the cache coherency is handled via the set_ptes() function. */ static inline void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index a41ef3213e1e..59bfe2e96f8f 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -67,7 +67,7 @@ int memcmp_pages(struct page *page1, struct page *page2) /* * If the page content is identical but at least one of the pages is * tagged, return non-zero to avoid KSM merging. If only one of the - * pages is tagged, set_pte_at() may zero or change the tags of the + * pages is tagged, set_ptes() may zero or change the tags of the * other page via mte_sync_tags(). */ if (page_mte_tagged(page1) || page_mte_tagged(page2)) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index aaf1d4939739..6e0df623c8e9 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1072,7 +1072,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, } else { /* * Only locking to serialise with a concurrent - * set_pte_at() in the VMM but still overriding the + * set_ptes() in the VMM but still overriding the * tags, hence ignoring the return value. */ try_page_mte_tagging(page); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index a254761fa1bd..3235e23309ec 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -205,7 +205,7 @@ static void show_pte(unsigned long addr) * * It needs to cope with hardware update of the accessed/dirty state by other * agents in the system and can safely skip the __sync_icache_dcache() call as, - * like set_pte_at(), the PTE is never changed from no-exec to exec here. + * like set_ptes(), the PTE is never changed from no-exec to exec here. * * Returns whether or not the PTE actually changed. */ diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 2892f925ed66..27f6160890d1 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -247,12 +247,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, if (!pte_present(pte)) { for (i = 0; i < ncontig; i++, ptep++, addr += pgsize) - set_pte_at(mm, addr, ptep, pte); + set_ptes(mm, addr, ptep, pte, 1); return; } if (!pte_cont(pte)) { - set_pte_at(mm, addr, ptep, pte); + set_ptes(mm, addr, ptep, pte, 1); return; } @@ -263,7 +263,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, clear_flush(mm, addr, ptep, pgsize, ncontig); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, @@ -471,7 +471,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, hugeprot = pte_pgprot(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); return 1; } @@ -500,7 +500,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, pfn = pte_pfn(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, From cbb0294fdd72a5f63ec59fad5c0a98d63bd572fc Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:56 +0000 Subject: [PATCH 202/435] arm64/mm: convert ptep_clear() to ptep_get_and_clear() ptep_clear() is a generic wrapper around the arch-implemented ptep_get_and_clear(). We are about to convert ptep_get_and_clear() into a public version and private version (__ptep_get_and_clear()) to support the transparent contpte work. We won't have a private version of ptep_clear() so let's convert it to directly call ptep_get_and_clear(). Link: https://lkml.kernel.org/r/20240215103205.2607016-10-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 27f6160890d1..48e8b429879d 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -229,7 +229,7 @@ static void clear_flush(struct mm_struct *mm, unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - ptep_clear(mm, addr, ptep); + ptep_get_and_clear(mm, addr, ptep); flush_tlb_range(&vma, saddr, addr); } From 5a00bfd6a52cf31e93d5f1b734087deb32a3cffa Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:57 +0000 Subject: [PATCH 203/435] arm64/mm: new ptep layer to manage contig bit Create a new layer for the in-table PTE manipulation APIs. For now, The existing API is prefixed with double underscore to become the arch-private API and the public API is just a simple wrapper that calls the private API. The public API implementation will subsequently be used to transparently manipulate the contiguous bit where appropriate. But since there are already some contig-aware users (e.g. hugetlb, kernel mapper), we must first ensure those users use the private API directly so that the future contig-bit manipulations in the public API do not interfere with those existing uses. The following APIs are treated this way: - ptep_get - set_pte - set_ptes - pte_clear - ptep_get_and_clear - ptep_test_and_clear_young - ptep_clear_flush_young - ptep_set_wrprotect - ptep_set_access_flags Link: https://lkml.kernel.org/r/20240215103205.2607016-11-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 83 +++++++++++++++++--------------- arch/arm64/kernel/efi.c | 4 +- arch/arm64/kernel/mte.c | 2 +- arch/arm64/kvm/guest.c | 2 +- arch/arm64/mm/fault.c | 12 ++--- arch/arm64/mm/fixmap.c | 4 +- arch/arm64/mm/hugetlbpage.c | 40 +++++++-------- arch/arm64/mm/kasan_init.c | 6 +-- arch/arm64/mm/mmu.c | 14 +++--- arch/arm64/mm/pageattr.c | 6 +-- arch/arm64/mm/trans_pgd.c | 6 +-- 11 files changed, 93 insertions(+), 86 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 9a2df85eb493..7336d40a893a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -93,7 +93,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) __pte(__phys_to_pte_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define pte_none(pte) (!pte_val(pte)) -#define pte_clear(mm,addr,ptep) set_pte(ptep, __pte(0)) +#define __pte_clear(mm, addr, ptep) \ + __set_pte(ptep, __pte(0)) #define pte_page(pte) (pfn_to_page(pte_pfn(pte))) /* @@ -137,7 +138,7 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) * so that we don't erroneously return false for pages that have been * remapped as PROT_NONE but are yet to be flushed from the TLB. * Note that we can't make any assumptions based on the state of the access - * flag, since ptep_clear_flush_young() elides a DSB when invalidating the + * flag, since __ptep_clear_flush_young() elides a DSB when invalidating the * TLB. */ #define pte_accessible(mm, pte) \ @@ -261,7 +262,7 @@ static inline pte_t pte_mkdevmap(pte_t pte) return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL)); } -static inline void set_pte(pte_t *ptep, pte_t pte) +static inline void __set_pte(pte_t *ptep, pte_t pte) { WRITE_ONCE(*ptep, pte); @@ -275,8 +276,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte) } } -#define ptep_get ptep_get -static inline pte_t ptep_get(pte_t *ptep) +static inline pte_t __ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } @@ -308,7 +308,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep, if (!IS_ENABLED(CONFIG_DEBUG_VM)) return; - old_pte = ptep_get(ptep); + old_pte = __ptep_get(ptep); if (!pte_valid(old_pte) || !pte_valid(pte)) return; @@ -317,7 +317,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep, /* * Check for potential race with hardware updates of the pte - * (ptep_set_access_flags safely changes valid ptes without going + * (__ptep_set_access_flags safely changes valid ptes without going * through an invalid entry). */ VM_WARN_ONCE(!pte_young(pte), @@ -363,23 +363,22 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) return pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte)); } -static inline void set_ptes(struct mm_struct *mm, - unsigned long __always_unused addr, - pte_t *ptep, pte_t pte, unsigned int nr) +static inline void __set_ptes(struct mm_struct *mm, + unsigned long __always_unused addr, + pte_t *ptep, pte_t pte, unsigned int nr) { page_table_check_ptes_set(mm, ptep, pte, nr); __sync_cache_and_tags(pte, nr); for (;;) { __check_safe_pte_update(mm, ptep, pte); - set_pte(ptep, pte); + __set_pte(ptep, pte); if (--nr == 0) break; ptep++; pte = pte_advance_pfn(pte, 1); } } -#define set_ptes set_ptes /* * Huge pte definitions. @@ -546,7 +545,7 @@ static inline void __set_pte_at(struct mm_struct *mm, { __sync_cache_and_tags(pte, nr); __check_safe_pte_update(mm, ptep, pte); - set_pte(ptep, pte); + __set_pte(ptep, pte); } static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, @@ -860,8 +859,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) return pte_pmd(pte_modify(pmd_pte(pmd), newprot)); } -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -extern int ptep_set_access_flags(struct vm_area_struct *vma, +extern int __ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); @@ -871,7 +869,8 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { - return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty); + return __ptep_set_access_flags(vma, address, (pte_t *)pmdp, + pmd_pte(entry), dirty); } static inline int pud_devmap(pud_t pud) @@ -905,12 +904,13 @@ static inline bool pud_user_accessible_page(pud_t pud) /* * Atomic pte/pmd modifications. */ -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int __ptep_test_and_clear_young(pte_t *ptep) +static inline int __ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) { pte_t old_pte, pte; - pte = ptep_get(ptep); + pte = __ptep_get(ptep); do { old_pte = pte; pte = pte_mkold(pte); @@ -921,18 +921,10 @@ static inline int __ptep_test_and_clear_young(pte_t *ptep) return pte_young(pte); } -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pte_t *ptep) -{ - return __ptep_test_and_clear_young(ptep); -} - -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -static inline int ptep_clear_flush_young(struct vm_area_struct *vma, +static inline int __ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - int young = ptep_test_and_clear_young(vma, address, ptep); + int young = __ptep_test_and_clear_young(vma, address, ptep); if (young) { /* @@ -955,12 +947,11 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - return ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); + return __ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, +static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0)); @@ -984,15 +975,15 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * ptep_set_wrprotect - mark read-only while trasferring potential hardware + * __ptep_set_wrprotect - mark read-only while trasferring potential hardware * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. */ -#define __HAVE_ARCH_PTEP_SET_WRPROTECT -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) +static inline void __ptep_set_wrprotect(struct mm_struct *mm, + unsigned long address, pte_t *ptep) { pte_t old_pte, pte; - pte = ptep_get(ptep); + pte = __ptep_get(ptep); do { old_pte = pte; pte = pte_wrprotect(pte); @@ -1006,7 +997,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { - ptep_set_wrprotect(mm, address, (pte_t *)pmdp); + __ptep_set_wrprotect(mm, address, (pte_t *)pmdp); } #define pmdp_establish pmdp_establish @@ -1084,7 +1075,7 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) #endif /* CONFIG_ARM64_MTE */ /* - * On AArch64, the cache coherency is handled via the set_ptes() function. + * On AArch64, the cache coherency is handled via the __set_ptes() function. */ static inline void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, @@ -1136,6 +1127,22 @@ extern pte_t ptep_modify_prot_start(struct vm_area_struct *vma, extern void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t new_pte); + +#define ptep_get __ptep_get +#define set_pte __set_pte +#define set_ptes __set_ptes +#define pte_clear __pte_clear +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +#define ptep_get_and_clear __ptep_get_and_clear +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +#define ptep_test_and_clear_young __ptep_test_and_clear_young +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +#define ptep_clear_flush_young __ptep_clear_flush_young +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +#define ptep_set_wrprotect __ptep_set_wrprotect +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +#define ptep_set_access_flags __ptep_set_access_flags + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_PGTABLE_H */ diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index d0e08e93b246..9afcc690fe73 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -103,7 +103,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { struct set_perm_data *spd = data; const efi_memory_desc_t *md = spd->md; - pte_t pte = ptep_get(ptep); + pte_t pte = __ptep_get(ptep); if (md->attribute & EFI_MEMORY_RO) pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); @@ -111,7 +111,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) pte = set_pte_bit(pte, __pgprot(PTE_PXN)); else if (system_supports_bti_kernel() && spd->has_bti) pte = set_pte_bit(pte, __pgprot(PTE_GP)); - set_pte(ptep, pte); + __set_pte(ptep, pte); return 0; } diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 59bfe2e96f8f..dcdcccd40891 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -67,7 +67,7 @@ int memcmp_pages(struct page *page1, struct page *page2) /* * If the page content is identical but at least one of the pages is * tagged, return non-zero to avoid KSM merging. If only one of the - * pages is tagged, set_ptes() may zero or change the tags of the + * pages is tagged, __set_ptes() may zero or change the tags of the * other page via mte_sync_tags(). */ if (page_mte_tagged(page1) || page_mte_tagged(page2)) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 6e0df623c8e9..629145fd3161 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1072,7 +1072,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, } else { /* * Only locking to serialise with a concurrent - * set_ptes() in the VMM but still overriding the + * __set_ptes() in the VMM but still overriding the * tags, hence ignoring the return value. */ try_page_mte_tagging(page); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 3235e23309ec..9a1c66183d16 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -191,7 +191,7 @@ static void show_pte(unsigned long addr) if (!ptep) break; - pte = ptep_get(ptep); + pte = __ptep_get(ptep); pr_cont(", pte=%016llx", pte_val(pte)); pte_unmap(ptep); } while(0); @@ -205,16 +205,16 @@ static void show_pte(unsigned long addr) * * It needs to cope with hardware update of the accessed/dirty state by other * agents in the system and can safely skip the __sync_icache_dcache() call as, - * like set_ptes(), the PTE is never changed from no-exec to exec here. + * like __set_ptes(), the PTE is never changed from no-exec to exec here. * * Returns whether or not the PTE actually changed. */ -int ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, - pte_t entry, int dirty) +int __ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) { pteval_t old_pteval, pteval; - pte_t pte = ptep_get(ptep); + pte_t pte = __ptep_get(ptep); if (pte_same(pte, entry)) return 0; diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c index c0a3301203bd..bfc02568805a 100644 --- a/arch/arm64/mm/fixmap.c +++ b/arch/arm64/mm/fixmap.c @@ -121,9 +121,9 @@ void __set_fixmap(enum fixed_addresses idx, ptep = fixmap_pte(addr); if (pgprot_val(flags)) { - set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); + __set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); } else { - pte_clear(&init_mm, addr, ptep); + __pte_clear(&init_mm, addr, ptep); flush_tlb_kernel_range(addr, addr+PAGE_SIZE); } } diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 48e8b429879d..0f0e10bb0a95 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -145,14 +145,14 @@ pte_t huge_ptep_get(pte_t *ptep) { int ncontig, i; size_t pgsize; - pte_t orig_pte = ptep_get(ptep); + pte_t orig_pte = __ptep_get(ptep); if (!pte_present(orig_pte) || !pte_cont(orig_pte)) return orig_pte; ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize); for (i = 0; i < ncontig; i++, ptep++) { - pte_t pte = ptep_get(ptep); + pte_t pte = __ptep_get(ptep); if (pte_dirty(pte)) orig_pte = pte_mkdirty(orig_pte); @@ -177,11 +177,11 @@ static pte_t get_clear_contig(struct mm_struct *mm, unsigned long pgsize, unsigned long ncontig) { - pte_t orig_pte = ptep_get(ptep); + pte_t orig_pte = __ptep_get(ptep); unsigned long i; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) { - pte_t pte = ptep_get_and_clear(mm, addr, ptep); + pte_t pte = __ptep_get_and_clear(mm, addr, ptep); /* * If HW_AFDBM is enabled, then the HW could turn on @@ -229,7 +229,7 @@ static void clear_flush(struct mm_struct *mm, unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - ptep_get_and_clear(mm, addr, ptep); + __ptep_get_and_clear(mm, addr, ptep); flush_tlb_range(&vma, saddr, addr); } @@ -247,12 +247,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, if (!pte_present(pte)) { for (i = 0; i < ncontig; i++, ptep++, addr += pgsize) - set_ptes(mm, addr, ptep, pte, 1); + __set_ptes(mm, addr, ptep, pte, 1); return; } if (!pte_cont(pte)) { - set_ptes(mm, addr, ptep, pte, 1); + __set_ptes(mm, addr, ptep, pte, 1); return; } @@ -263,7 +263,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, clear_flush(mm, addr, ptep, pgsize, ncontig); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); + __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, @@ -393,7 +393,7 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr, ncontig = num_contig_ptes(sz, &pgsize); for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - pte_clear(mm, addr, ptep); + __pte_clear(mm, addr, ptep); } pte_t huge_ptep_get_and_clear(struct mm_struct *mm, @@ -401,10 +401,10 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, { int ncontig; size_t pgsize; - pte_t orig_pte = ptep_get(ptep); + pte_t orig_pte = __ptep_get(ptep); if (!pte_cont(orig_pte)) - return ptep_get_and_clear(mm, addr, ptep); + return __ptep_get_and_clear(mm, addr, ptep); ncontig = find_num_contig(mm, addr, ptep, &pgsize); @@ -424,11 +424,11 @@ static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig) { int i; - if (pte_write(pte) != pte_write(ptep_get(ptep))) + if (pte_write(pte) != pte_write(__ptep_get(ptep))) return 1; for (i = 0; i < ncontig; i++) { - pte_t orig_pte = ptep_get(ptep + i); + pte_t orig_pte = __ptep_get(ptep + i); if (pte_dirty(pte) != pte_dirty(orig_pte)) return 1; @@ -452,7 +452,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, pte_t orig_pte; if (!pte_cont(pte)) - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); + return __ptep_set_access_flags(vma, addr, ptep, pte, dirty); ncontig = find_num_contig(mm, addr, ptep, &pgsize); dpfn = pgsize >> PAGE_SHIFT; @@ -471,7 +471,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, hugeprot = pte_pgprot(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); + __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); return 1; } @@ -485,8 +485,8 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, size_t pgsize; pte_t pte; - if (!pte_cont(ptep_get(ptep))) { - ptep_set_wrprotect(mm, addr, ptep); + if (!pte_cont(__ptep_get(ptep))) { + __ptep_set_wrprotect(mm, addr, ptep); return; } @@ -500,7 +500,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, pfn = pte_pfn(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); + __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, @@ -510,7 +510,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, size_t pgsize; int ncontig; - if (!pte_cont(ptep_get(ptep))) + if (!pte_cont(__ptep_get(ptep))) return ptep_clear_flush(vma, addr, ptep); ncontig = find_num_contig(mm, addr, ptep, &pgsize); @@ -543,7 +543,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. */ - if (pte_user_exec(ptep_get(ptep))) + if (pte_user_exec(__ptep_get(ptep))) return huge_ptep_clear_flush(vma, addr, ptep); } return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index c2a9f4f6c7dd..9ee16cfce587 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -112,8 +112,8 @@ static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, if (!early) memset(__va(page_phys), KASAN_SHADOW_INIT, PAGE_SIZE); next = addr + PAGE_SIZE; - set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); - } while (ptep++, addr = next, addr != end && pte_none(ptep_get(ptep))); + __set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); + } while (ptep++, addr = next, addr != end && pte_none(__ptep_get(ptep))); } static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr, @@ -271,7 +271,7 @@ static void __init kasan_init_shadow(void) * so we should make sure that it maps the zero page read-only. */ for (i = 0; i < PTRS_PER_PTE; i++) - set_pte(&kasan_early_shadow_pte[i], + __set_pte(&kasan_early_shadow_pte[i], pfn_pte(sym_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO)); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 343629a17042..6208c7541f87 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -173,16 +173,16 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, ptep = pte_set_fixmap_offset(pmdp, addr); do { - pte_t old_pte = ptep_get(ptep); + pte_t old_pte = __ptep_get(ptep); - set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); + __set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); /* * After the PTE entry has been populated once, we * only allow updates to the permission attributes. */ BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), - pte_val(ptep_get(ptep)))); + pte_val(__ptep_get(ptep)))); phys += PAGE_SIZE; } while (ptep++, addr += PAGE_SIZE, addr != end); @@ -852,12 +852,12 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = ptep_get(ptep); + pte = __ptep_get(ptep); if (pte_none(pte)) continue; WARN_ON(!pte_present(pte)); - pte_clear(&init_mm, addr, ptep); + __pte_clear(&init_mm, addr, ptep); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); if (free_mapped) free_hotplug_page_range(pte_page(pte), @@ -985,7 +985,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = ptep_get(ptep); + pte = __ptep_get(ptep); /* * This is just a sanity check here which verifies that @@ -1004,7 +1004,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, */ ptep = pte_offset_kernel(pmdp, 0UL); for (i = 0; i < PTRS_PER_PTE; i++) { - if (!pte_none(ptep_get(&ptep[i]))) + if (!pte_none(__ptep_get(&ptep[i]))) return; } diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 73a5e8f82586..0c4e3ecf989d 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -36,12 +36,12 @@ bool can_set_direct_map(void) static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; - pte_t pte = ptep_get(ptep); + pte_t pte = __ptep_get(ptep); pte = clear_pte_bit(pte, cdata->clear_mask); pte = set_pte_bit(pte, cdata->set_mask); - set_pte(ptep, pte); + __set_pte(ptep, pte); return 0; } @@ -245,5 +245,5 @@ bool kernel_page_present(struct page *page) return true; ptep = pte_offset_kernel(pmdp, addr); - return pte_valid(ptep_get(ptep)); + return pte_valid(__ptep_get(ptep)); } diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index f71ab4704cce..5139a28130c0 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -33,7 +33,7 @@ static void *trans_alloc(struct trans_pgd_info *info) static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) { - pte_t pte = ptep_get(src_ptep); + pte_t pte = __ptep_get(src_ptep); if (pte_valid(pte)) { /* @@ -41,7 +41,7 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) * read only (code, rodata). Clear the RDONLY bit from * the temporary mappings we use during restore. */ - set_pte(dst_ptep, pte_mkwrite_novma(pte)); + __set_pte(dst_ptep, pte_mkwrite_novma(pte)); } else if ((debug_pagealloc_enabled() || is_kfence_address((void *)addr)) && !pte_none(pte)) { /* @@ -55,7 +55,7 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) */ BUG_ON(!pfn_valid(pte_pfn(pte))); - set_pte(dst_ptep, pte_mkpresent(pte_mkwrite_novma(pte))); + __set_pte(dst_ptep, pte_mkpresent(pte_mkwrite_novma(pte))); } } From d9d8dc2bd3fb2689309f704fe85e6dde2b1bd73a Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:58 +0000 Subject: [PATCH 204/435] arm64/mm: dplit __flush_tlb_range() to elide trailing DSB Split __flush_tlb_range() into __flush_tlb_range_nosync() + __flush_tlb_range(), in the same way as the existing flush_tlb_page() arrangement. This allows calling __flush_tlb_range_nosync() to elide the trailing DSB. Forthcoming "contpte" code will take advantage of this when clearing the young bit from a contiguous range of ptes. Ordering between dsb and mmu_notifier_arch_invalidate_secondary_tlbs() has changed, but now aligns with the ordering of __flush_tlb_page(). It has been discussed that __flush_tlb_page() may be wrong though. Regardless, both will be resolved separately if needed. Link: https://lkml.kernel.org/r/20240215103205.2607016-12-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: David Hildenbrand Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlbflush.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 1deb5d789c2e..3b0e8248e1a4 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -422,7 +422,7 @@ do { \ #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \ __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled()); -static inline void __flush_tlb_range(struct vm_area_struct *vma, +static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long stride, bool last_level, int tlb_level) @@ -456,10 +456,19 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, __flush_tlb_range_op(vae1is, start, pages, stride, asid, tlb_level, true, lpa2_is_enabled()); - dsb(ish); mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } +static inline void __flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long stride, bool last_level, + int tlb_level) +{ + __flush_tlb_range_nosync(vma, start, end, stride, + last_level, tlb_level); + dsb(ish); +} + static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { From 4602e5757bcceb231c3a13c36c373ad4a750eddb Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:59 +0000 Subject: [PATCH 205/435] arm64/mm: wire up PTE_CONT for user mappings With the ptep API sufficiently refactored, we can now introduce a new "contpte" API layer, which transparently manages the PTE_CONT bit for user mappings. In this initial implementation, only suitable batches of PTEs, set via set_ptes(), are mapped with the PTE_CONT bit. Any subsequent modification of individual PTEs will cause an "unfold" operation to repaint the contpte block as individual PTEs before performing the requested operation. While, a modification of a single PTE could cause the block of PTEs to which it belongs to become eligible for "folding" into a contpte entry, "folding" is not performed in this initial implementation due to the costs of checking the requirements are met. Due to this, contpte mappings will degrade back to normal pte mappings over time if/when protections are changed. This will be solved in a future patch. Since a contpte block only has a single access and dirty bit, the semantic here changes slightly; when getting a pte (e.g. ptep_get()) that is part of a contpte mapping, the access and dirty information are pulled from the block (so all ptes in the block return the same access/dirty info). When changing the access/dirty info on a pte (e.g. ptep_set_access_flags()) that is part of a contpte mapping, this change will affect the whole contpte block. This is works fine in practice since we guarantee that only a single folio is mapped by a contpte block, and the core-mm tracks access/dirty information per folio. In order for the public functions, which used to be pure inline, to continue to be callable by modules, export all the contpte_* symbols that are now called by those public inline functions. The feature is enabled/disabled with the ARM64_CONTPTE Kconfig parameter at build time. It defaults to enabled as long as its dependency, TRANSPARENT_HUGEPAGE is also enabled. The core-mm depends upon TRANSPARENT_HUGEPAGE to be able to allocate large folios, so if its not enabled, then there is no chance of meeting the physical contiguity requirement for contpte mappings. Link: https://lkml.kernel.org/r/20240215103205.2607016-13-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Ard Biesheuvel Tested-by: John Hubbard Acked-by: Mark Rutland Reviewed-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 9 + arch/arm64/include/asm/pgtable.h | 167 ++++++++++++++++++ arch/arm64/mm/Makefile | 1 + arch/arm64/mm/contpte.c | 285 +++++++++++++++++++++++++++++++ include/linux/efi.h | 5 + 5 files changed, 467 insertions(+) create mode 100644 arch/arm64/mm/contpte.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index aa7c1d435139..386566138620 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2229,6 +2229,15 @@ config UNWIND_PATCH_PAC_INTO_SCS select UNWIND_TABLES select DYNAMIC_SCS +config ARM64_CONTPTE + bool "Contiguous PTE mappings for user memory" if EXPERT + depends on TRANSPARENT_HUGEPAGE + default y + help + When enabled, user mappings are configured using the PTE contiguous + bit, for any mappings that meet the size and alignment requirements. + This reduces TLB pressure and improves performance. + endmenu # "Kernel Features" menu "Boot options" diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 7336d40a893a..831099cfc96b 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -133,6 +133,10 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) */ #define pte_valid_not_user(pte) \ ((pte_val(pte) & (PTE_VALID | PTE_USER | PTE_UXN)) == (PTE_VALID | PTE_UXN)) +/* + * Returns true if the pte is valid and has the contiguous bit set. + */ +#define pte_valid_cont(pte) (pte_valid(pte) && pte_cont(pte)) /* * Could the pte be present in the TLB? We must check mm_tlb_flush_pending * so that we don't erroneously return false for pages that have been @@ -1128,6 +1132,167 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t new_pte); +#ifdef CONFIG_ARM64_CONTPTE + +/* + * The contpte APIs are used to transparently manage the contiguous bit in ptes + * where it is possible and makes sense to do so. The PTE_CONT bit is considered + * a private implementation detail of the public ptep API (see below). + */ +extern void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte); +extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep); +extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr); +extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); +extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); +extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty); + +static inline void contpte_try_unfold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + if (unlikely(pte_valid_cont(pte))) + __contpte_try_unfold(mm, addr, ptep, pte); +} + +/* + * The below functions constitute the public API that arm64 presents to the + * core-mm to manipulate PTE entries within their page tables (or at least this + * is the subset of the API that arm64 needs to implement). These public + * versions will automatically and transparently apply the contiguous bit where + * it makes sense to do so. Therefore any users that are contig-aware (e.g. + * hugetlb, kernel mapper) should NOT use these APIs, but instead use the + * private versions, which are prefixed with double underscore. All of these + * APIs except for ptep_get_lockless() are expected to be called with the PTL + * held. Although the contiguous bit is considered private to the + * implementation, it is deliberately allowed to leak through the getters (e.g. + * ptep_get()), back to core code. This is required so that pte_leaf_size() can + * provide an accurate size for perf_get_pgtable_size(). But this leakage means + * its possible a pte will be passed to a setter with the contiguous bit set, so + * we explicitly clear the contiguous bit in those cases to prevent accidentally + * setting it in the pgtable. + */ + +#define ptep_get ptep_get +static inline pte_t ptep_get(pte_t *ptep) +{ + pte_t pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(pte))) + return pte; + + return contpte_ptep_get(ptep, pte); +} + +#define ptep_get_lockless ptep_get_lockless +static inline pte_t ptep_get_lockless(pte_t *ptep) +{ + pte_t pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(pte))) + return pte; + + return contpte_ptep_get_lockless(ptep); +} + +static inline void set_pte(pte_t *ptep, pte_t pte) +{ + /* + * We don't have the mm or vaddr so cannot unfold contig entries (since + * it requires tlb maintenance). set_pte() is not used in core code, so + * this should never even be called. Regardless do our best to service + * any call and emit a warning if there is any attempt to set a pte on + * top of an existing contig range. + */ + pte_t orig_pte = __ptep_get(ptep); + + WARN_ON_ONCE(pte_valid_cont(orig_pte)); + __set_pte(ptep, pte_mknoncont(pte)); +} + +#define set_ptes set_ptes +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + pte = pte_mknoncont(pte); + + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __set_ptes(mm, addr, ptep, pte, 1); + } else { + contpte_set_ptes(mm, addr, ptep, pte, nr); + } +} + +static inline void pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __pte_clear(mm, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + return __ptep_get_and_clear(mm, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t orig_pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_test_and_clear_young(vma, addr, ptep); + + return contpte_ptep_test_and_clear_young(vma, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +static inline int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t orig_pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_clear_flush_young(vma, addr, ptep); + + return contpte_ptep_clear_flush_young(vma, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +static inline void ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __ptep_set_wrprotect(mm, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +static inline int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty) +{ + pte_t orig_pte = __ptep_get(ptep); + + entry = pte_mknoncont(entry); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_set_access_flags(vma, addr, ptep, entry, dirty); + + return contpte_ptep_set_access_flags(vma, addr, ptep, entry, dirty); +} + +#else /* CONFIG_ARM64_CONTPTE */ + #define ptep_get __ptep_get #define set_pte __set_pte #define set_ptes __set_ptes @@ -1143,6 +1308,8 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma, #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS #define ptep_set_access_flags __ptep_set_access_flags +#endif /* CONFIG_ARM64_CONTPTE */ + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_PGTABLE_H */ diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index dbd1bc95967d..60454256945b 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -3,6 +3,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ cache.o copypage.o flush.o \ ioremap.o mmap.o pgd.o mmu.o \ context.o proc.o pageattr.o fixmap.o +obj-$(CONFIG_ARM64_CONTPTE) += contpte.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c new file mode 100644 index 000000000000..6d7f40667fa2 --- /dev/null +++ b/arch/arm64/mm/contpte.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include +#include +#include +#include + +static inline bool mm_is_user(struct mm_struct *mm) +{ + /* + * Don't attempt to apply the contig bit to kernel mappings, because + * dynamically adding/removing the contig bit can cause page faults. + * These racing faults are ok for user space, since they get serialized + * on the PTL. But kernel mappings can't tolerate faults. + */ + if (unlikely(mm_is_efi(mm))) + return false; + return mm != &init_mm; +} + +static inline pte_t *contpte_align_down(pte_t *ptep) +{ + return PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); +} + +static void contpte_convert(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); + unsigned long start_addr; + pte_t *start_ptep; + int i; + + start_ptep = ptep = contpte_align_down(ptep); + start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + pte = pfn_pte(ALIGN_DOWN(pte_pfn(pte), CONT_PTES), pte_pgprot(pte)); + + for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) { + pte_t ptent = __ptep_get_and_clear(mm, addr, ptep); + + if (pte_dirty(ptent)) + pte = pte_mkdirty(pte); + + if (pte_young(ptent)) + pte = pte_mkyoung(pte); + } + + __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3); + + __set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES); +} + +void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + /* + * We have already checked that the ptes are contiguous in + * contpte_try_unfold(), so just check that the mm is user space. + */ + if (!mm_is_user(mm)) + return; + + pte = pte_mknoncont(pte); + contpte_convert(mm, addr, ptep, pte); +} +EXPORT_SYMBOL(__contpte_try_unfold); + +pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte) +{ + /* + * Gather access/dirty bits, which may be populated in any of the ptes + * of the contig range. We are guaranteed to be holding the PTL, so any + * contiguous range cannot be unfolded or otherwise modified under our + * feet. + */ + + pte_t pte; + int i; + + ptep = contpte_align_down(ptep); + + for (i = 0; i < CONT_PTES; i++, ptep++) { + pte = __ptep_get(ptep); + + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + + if (pte_young(pte)) + orig_pte = pte_mkyoung(orig_pte); + } + + return orig_pte; +} +EXPORT_SYMBOL(contpte_ptep_get); + +pte_t contpte_ptep_get_lockless(pte_t *orig_ptep) +{ + /* + * Gather access/dirty bits, which may be populated in any of the ptes + * of the contig range. We may not be holding the PTL, so any contiguous + * range may be unfolded/modified/refolded under our feet. Therefore we + * ensure we read a _consistent_ contpte range by checking that all ptes + * in the range are valid and have CONT_PTE set, that all pfns are + * contiguous and that all pgprots are the same (ignoring access/dirty). + * If we find a pte that is not consistent, then we must be racing with + * an update so start again. If the target pte does not have CONT_PTE + * set then that is considered consistent on its own because it is not + * part of a contpte range. + */ + + pgprot_t orig_prot; + unsigned long pfn; + pte_t orig_pte; + pgprot_t prot; + pte_t *ptep; + pte_t pte; + int i; + +retry: + orig_pte = __ptep_get(orig_ptep); + + if (!pte_valid_cont(orig_pte)) + return orig_pte; + + orig_prot = pte_pgprot(pte_mkold(pte_mkclean(orig_pte))); + ptep = contpte_align_down(orig_ptep); + pfn = pte_pfn(orig_pte) - (orig_ptep - ptep); + + for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) { + pte = __ptep_get(ptep); + prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); + + if (!pte_valid_cont(pte) || + pte_pfn(pte) != pfn || + pgprot_val(prot) != pgprot_val(orig_prot)) + goto retry; + + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + + if (pte_young(pte)) + orig_pte = pte_mkyoung(orig_pte); + } + + return orig_pte; +} +EXPORT_SYMBOL(contpte_ptep_get_lockless); + +void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + unsigned long next; + unsigned long end; + unsigned long pfn; + pgprot_t prot; + + /* + * The set_ptes() spec guarantees that when nr > 1, the initial state of + * all ptes is not-present. Therefore we never need to unfold or + * otherwise invalidate a range before we set the new ptes. + * contpte_set_ptes() should never be called for nr < 2. + */ + VM_WARN_ON(nr == 1); + + if (!mm_is_user(mm)) + return __set_ptes(mm, addr, ptep, pte, nr); + + end = addr + (nr << PAGE_SHIFT); + pfn = pte_pfn(pte); + prot = pte_pgprot(pte); + + do { + next = pte_cont_addr_end(addr, end); + nr = (next - addr) >> PAGE_SHIFT; + pte = pfn_pte(pfn, prot); + + if (((addr | next | (pfn << PAGE_SHIFT)) & ~CONT_PTE_MASK) == 0) + pte = pte_mkcont(pte); + else + pte = pte_mknoncont(pte); + + __set_ptes(mm, addr, ptep, pte, nr); + + addr = next; + ptep += nr; + pfn += nr; + + } while (addr != end); +} +EXPORT_SYMBOL(contpte_set_ptes); + +int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + /* + * ptep_clear_flush_young() technically requires us to clear the access + * flag for a _single_ pte. However, the core-mm code actually tracks + * access/dirty per folio, not per page. And since we only create a + * contig range when the range is covered by a single folio, we can get + * away with clearing young for the whole contig range here, so we avoid + * having to unfold. + */ + + int young = 0; + int i; + + ptep = contpte_align_down(ptep); + addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + + for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) + young |= __ptep_test_and_clear_young(vma, addr, ptep); + + return young; +} +EXPORT_SYMBOL(contpte_ptep_test_and_clear_young); + +int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + int young; + + young = contpte_ptep_test_and_clear_young(vma, addr, ptep); + + if (young) { + /* + * See comment in __ptep_clear_flush_young(); same rationale for + * eliding the trailing DSB applies here. + */ + addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + __flush_tlb_range_nosync(vma, addr, addr + CONT_PTE_SIZE, + PAGE_SIZE, true, 3); + } + + return young; +} +EXPORT_SYMBOL(contpte_ptep_clear_flush_young); + +int contpte_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty) +{ + unsigned long start_addr; + pte_t orig_pte; + int i; + + /* + * Gather the access/dirty bits for the contiguous range. If nothing has + * changed, its a noop. + */ + orig_pte = pte_mknoncont(ptep_get(ptep)); + if (pte_val(orig_pte) == pte_val(entry)) + return 0; + + /* + * We can fix up access/dirty bits without having to unfold the contig + * range. But if the write bit is changing, we must unfold. + */ + if (pte_write(orig_pte) == pte_write(entry)) { + /* + * For HW access management, we technically only need to update + * the flag on a single pte in the range. But for SW access + * management, we need to update all the ptes to prevent extra + * faults. Avoid per-page tlb flush in __ptep_set_access_flags() + * and instead flush the whole range at the end. + */ + ptep = contpte_align_down(ptep); + start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + + for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) + __ptep_set_access_flags(vma, addr, ptep, entry, 0); + + if (dirty) + __flush_tlb_range(vma, start_addr, addr, + PAGE_SIZE, true, 3); + } else { + __contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte); + __ptep_set_access_flags(vma, addr, ptep, entry, dirty); + } + + return 1; +} +EXPORT_SYMBOL(contpte_ptep_set_access_flags); diff --git a/include/linux/efi.h b/include/linux/efi.h index c74f47711f0b..57da15e7429c 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -692,6 +692,11 @@ extern struct efi { extern struct mm_struct efi_mm; +static inline bool mm_is_efi(struct mm_struct *mm) +{ + return IS_ENABLED(CONFIG_EFI) && mm == &efi_mm; +} + static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right) { From 311a6cf29690bb8295327bad0e76e0ad48cadcc4 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:00 +0000 Subject: [PATCH 206/435] arm64/mm: implement new wrprotect_ptes() batch API Optimize the contpte implementation to fix some of the fork performance regression introduced by the initial contpte commit. Subsequent patches will solve it entirely. During fork(), any private memory in the parent must be write-protected. Previously this was done 1 PTE at a time. But the core-mm supports batched wrprotect via the new wrprotect_ptes() API. So let's implement that API and for fully covered contpte mappings, we no longer need to unfold the contpte. This has 2 benefits: - reduced unfolding, reduces the number of tlbis that must be issued. - The memory remains contpte-mapped ("folded") in the parent, so it continues to benefit from the more efficient use of the TLB after the fork. The optimization to wrprotect a whole contpte block without unfolding is possible thanks to the tightening of the Arm ARM in respect to the definition and behaviour when 'Misprogramming the Contiguous bit'. See section D21194 at https://developer.arm.com/documentation/102105/ja-07/ Link: https://lkml.kernel.org/r/20240215103205.2607016-14-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 61 ++++++++++++++++++++++++++------ arch/arm64/mm/contpte.c | 38 ++++++++++++++++++++ 2 files changed, 89 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 831099cfc96b..8643227c318b 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -978,6 +978,20 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline void ___ptep_set_wrprotect(struct mm_struct *mm, + unsigned long address, pte_t *ptep, + pte_t pte) +{ + pte_t old_pte; + + do { + old_pte = pte; + pte = pte_wrprotect(pte); + pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), + pte_val(old_pte), pte_val(pte)); + } while (pte_val(pte) != pte_val(old_pte)); +} + /* * __ptep_set_wrprotect - mark read-only while trasferring potential hardware * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. @@ -985,15 +999,16 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, static inline void __ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pte_t old_pte, pte; + ___ptep_set_wrprotect(mm, address, ptep, __ptep_get(ptep)); +} - pte = __ptep_get(ptep); - do { - old_pte = pte; - pte = pte_wrprotect(pte); - pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), - pte_val(old_pte), pte_val(pte)); - } while (pte_val(pte) != pte_val(old_pte)); +static inline void __wrprotect_ptes(struct mm_struct *mm, unsigned long address, + pte_t *ptep, unsigned int nr) +{ + unsigned int i; + + for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++) + __ptep_set_wrprotect(mm, address, ptep); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -1149,6 +1164,8 @@ extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr); extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty); @@ -1268,12 +1285,35 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, return contpte_ptep_clear_flush_young(vma, addr, ptep); } +#define wrprotect_ptes wrprotect_ptes +static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + if (likely(nr == 1)) { + /* + * Optimization: wrprotect_ptes() can only be called for present + * ptes so we only need to check contig bit as condition for + * unfold, and we can remove the contig bit from the pte we read + * to avoid re-reading. This speeds up fork() which is sensitive + * for order-0 folios. Equivalent to contpte_try_unfold(). + */ + pte_t orig_pte = __ptep_get(ptep); + + if (unlikely(pte_cont(orig_pte))) { + __contpte_try_unfold(mm, addr, ptep, orig_pte); + orig_pte = pte_mknoncont(orig_pte); + } + ___ptep_set_wrprotect(mm, addr, ptep, orig_pte); + } else { + contpte_wrprotect_ptes(mm, addr, ptep, nr); + } +} + #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); - __ptep_set_wrprotect(mm, addr, ptep); + wrprotect_ptes(mm, addr, ptep, 1); } #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS @@ -1305,6 +1345,7 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, #define ptep_clear_flush_young __ptep_clear_flush_young #define __HAVE_ARCH_PTEP_SET_WRPROTECT #define ptep_set_wrprotect __ptep_set_wrprotect +#define wrprotect_ptes __wrprotect_ptes #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS #define ptep_set_access_flags __ptep_set_access_flags diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 6d7f40667fa2..bedb58524535 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -26,6 +26,26 @@ static inline pte_t *contpte_align_down(pte_t *ptep) return PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); } +static void contpte_try_unfold_partial(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + /* + * Unfold any partially covered contpte block at the beginning and end + * of the range. + */ + + if (ptep != contpte_align_down(ptep) || nr < CONT_PTES) + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + + if (ptep + nr != contpte_align_down(ptep + nr)) { + unsigned long last_addr = addr + PAGE_SIZE * (nr - 1); + pte_t *last_ptep = ptep + nr - 1; + + contpte_try_unfold(mm, last_addr, last_ptep, + __ptep_get(last_ptep)); + } +} + static void contpte_convert(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -238,6 +258,24 @@ int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, } EXPORT_SYMBOL(contpte_ptep_clear_flush_young); +void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + /* + * If wrprotecting an entire contig range, we can avoid unfolding. Just + * set wrprotect and wait for the later mmu_gather flush to invalidate + * the tlb. Until the flush, the page may or may not be wrprotected. + * After the flush, it is guaranteed wrprotected. If it's a partial + * range though, we must unfold, because we can't have a case where + * CONT_PTE is set but wrprotect applies to a subset of the PTEs; this + * would cause it to continue to be unpredictable after the flush. + */ + + contpte_try_unfold_partial(mm, addr, ptep, nr); + __wrprotect_ptes(mm, addr, ptep, nr); +} +EXPORT_SYMBOL(contpte_wrprotect_ptes); + int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty) From 6b1e4efb6f5499ae8f9f5cdda7502285a0edbf51 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:01 +0000 Subject: [PATCH 207/435] arm64/mm: implement new [get_and_]clear_full_ptes() batch APIs Optimize the contpte implementation to fix some of the exit/munmap/dontneed performance regression introduced by the initial contpte commit. Subsequent patches will solve it entirely. During exit(), munmap() or madvise(MADV_DONTNEED), mappings must be cleared. Previously this was done 1 PTE at a time. But the core-mm supports batched clear via the new [get_and_]clear_full_ptes() APIs. So let's implement those APIs and for fully covered contpte mappings, we no longer need to unfold the contpte. This significantly reduces unfolding operations, reducing the number of tlbis that must be issued. Link: https://lkml.kernel.org/r/20240215103205.2607016-15-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 67 ++++++++++++++++++++++++++++++++ arch/arm64/mm/contpte.c | 17 ++++++++ 2 files changed, 84 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 8643227c318b..a8f1a35e3086 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -965,6 +965,37 @@ static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, return pte; } +static inline void __clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + for (;;) { + __ptep_get_and_clear(mm, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} + +static inline pte_t __get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + pte_t pte, tmp_pte; + + pte = __ptep_get_and_clear(mm, addr, ptep); + while (--nr) { + ptep++; + addr += PAGE_SIZE; + tmp_pte = __ptep_get_and_clear(mm, addr, ptep); + if (pte_dirty(tmp_pte)) + pte = pte_mkdirty(pte); + if (pte_young(tmp_pte)) + pte = pte_mkyoung(pte); + } + return pte; +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, @@ -1160,6 +1191,11 @@ extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte); extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep); extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr); +extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full); +extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full); extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, @@ -1253,6 +1289,35 @@ static inline void pte_clear(struct mm_struct *mm, __pte_clear(mm, addr, ptep); } +#define clear_full_ptes clear_full_ptes +static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __clear_full_ptes(mm, addr, ptep, nr, full); + } else { + contpte_clear_full_ptes(mm, addr, ptep, nr, full); + } +} + +#define get_and_clear_full_ptes get_and_clear_full_ptes +static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + pte_t pte; + + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + pte = __get_and_clear_full_ptes(mm, addr, ptep, nr, full); + } else { + pte = contpte_get_and_clear_full_ptes(mm, addr, ptep, nr, full); + } + + return pte; +} + #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -1337,6 +1402,8 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, #define set_pte __set_pte #define set_ptes __set_ptes #define pte_clear __pte_clear +#define clear_full_ptes __clear_full_ptes +#define get_and_clear_full_ptes __get_and_clear_full_ptes #define __HAVE_ARCH_PTEP_GET_AND_CLEAR #define ptep_get_and_clear __ptep_get_and_clear #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index bedb58524535..50e0173dc5ee 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -212,6 +212,23 @@ void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(contpte_set_ptes); +void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + contpte_try_unfold_partial(mm, addr, ptep, nr); + __clear_full_ptes(mm, addr, ptep, nr, full); +} +EXPORT_SYMBOL(contpte_clear_full_ptes); + +pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + contpte_try_unfold_partial(mm, addr, ptep, nr); + return __get_and_clear_full_ptes(mm, addr, ptep, nr, full); +} +EXPORT_SYMBOL(contpte_get_and_clear_full_ptes); + int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { From c6ec76a2ebc5829e5826b218d2e1475ec11b333e Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:02 +0000 Subject: [PATCH 208/435] mm: add pte_batch_hint() to reduce scanning in folio_pte_batch() Some architectures (e.g. arm64) can tell from looking at a pte, if some follow-on ptes also map contiguous physical memory with the same pgprot. (for arm64, these are contpte mappings). Take advantage of this knowledge to optimize folio_pte_batch() so that it can skip these ptes when scanning to create a batch. By default, if an arch does not opt-in, folio_pte_batch() returns a compile-time 1, so the changes are optimized out and the behaviour is as before. arm64 will opt-in to providing this hint in the next patch, which will greatly reduce the cost of ptep_get() when scanning a range of contptes. Link: https://lkml.kernel.org/r/20240215103205.2607016-16-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Tested-by: John Hubbard Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 21 +++++++++++++++++++++ mm/memory.c | 19 ++++++++++++------- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index bc005d84f764..a36cf4e124b0 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,6 +212,27 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif +#ifndef pte_batch_hint +/** + * pte_batch_hint - Number of pages that can be added to batch without scanning. + * @ptep: Page table pointer for the entry. + * @pte: Page table entry. + * + * Some architectures know that a set of contiguous ptes all map the same + * contiguous memory with the same permissions. In this case, it can provide a + * hint to aid pte batching without the core code needing to scan every pte. + * + * An architecture implementation may ignore the PTE accessed state. Further, + * the dirty state must apply atomically to all the PTEs described by the hint. + * + * May be overridden by the architecture, else pte_batch_hint is always 1. + */ +static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) +{ + return 1; +} +#endif + #ifndef pte_advance_pfn static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { diff --git a/mm/memory.c b/mm/memory.c index 465ada39c2b7..642b4f2be523 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -988,16 +988,20 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, { unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); const pte_t *end_ptep = start_ptep + max_nr; - pte_t expected_pte = __pte_batch_clear_ignored(pte_next_pfn(pte), flags); - pte_t *ptep = start_ptep + 1; + pte_t expected_pte, *ptep; bool writable; + int nr; if (any_writable) *any_writable = false; VM_WARN_ON_FOLIO(!pte_present(pte), folio); - while (ptep != end_ptep) { + nr = pte_batch_hint(start_ptep, pte); + expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); + ptep = start_ptep + nr; + + while (ptep < end_ptep) { pte = ptep_get(ptep); if (any_writable) writable = !!pte_write(pte); @@ -1011,17 +1015,18 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, * corner cases the next PFN might fall into a different * folio. */ - if (pte_pfn(pte) == folio_end_pfn) + if (pte_pfn(pte) >= folio_end_pfn) break; if (any_writable) *any_writable |= writable; - expected_pte = pte_next_pfn(expected_pte); - ptep++; + nr = pte_batch_hint(ptep, pte); + expected_pte = pte_advance_pfn(expected_pte, nr); + ptep += nr; } - return ptep - start_ptep; + return min(ptep - start_ptep, max_nr); } /* From fb5451e5f72b31002760083a99fbb41771c4f1ad Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:03 +0000 Subject: [PATCH 209/435] arm64/mm: implement pte_batch_hint() When core code iterates over a range of ptes and calls ptep_get() for each of them, if the range happens to cover contpte mappings, the number of pte reads becomes amplified by a factor of the number of PTEs in a contpte block. This is because for each call to ptep_get(), the implementation must read all of the ptes in the contpte block to which it belongs to gather the access and dirty bits. This causes a hotspot for fork(), as well as operations that unmap memory such as munmap(), exit and madvise(MADV_DONTNEED). Fortunately we can fix this by implementing pte_batch_hint() which allows their iterators to skip getting the contpte tail ptes when gathering the batch of ptes to operate on. This results in the number of PTE reads returning to 1 per pte. Link: https://lkml.kernel.org/r/20240215103205.2607016-17-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Mark Rutland Reviewed-by: David Hildenbrand Tested-by: John Hubbard Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index a8f1a35e3086..d759a20d2929 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1213,6 +1213,15 @@ static inline void contpte_try_unfold(struct mm_struct *mm, unsigned long addr, __contpte_try_unfold(mm, addr, ptep, pte); } +#define pte_batch_hint pte_batch_hint +static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) +{ + if (!pte_valid_cont(pte)) + return 1; + + return CONT_PTES - (((unsigned long)ptep >> 3) & (CONT_PTES - 1)); +} + /* * The below functions constitute the public API that arm64 presents to the * core-mm to manipulate PTE entries within their page tables (or at least this From b972fc6afba002319fe23bc698ce6431ee43868c Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:04 +0000 Subject: [PATCH 210/435] arm64/mm: __always_inline to improve fork() perf As set_ptes() and wrprotect_ptes() become a bit more complex, the compiler may choose not to inline them. But this is critical for fork() performance. So mark the functions, along with contpte_try_unfold() which is called by them, as __always_inline. This is worth ~1% on the fork() microbenchmark with order-0 folios (the common case). Link: https://lkml.kernel.org/r/20240215103205.2607016-18-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d759a20d2929..8310875133ff 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1206,8 +1206,8 @@ extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty); -static inline void contpte_try_unfold(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) +static __always_inline void contpte_try_unfold(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte) { if (unlikely(pte_valid_cont(pte))) __contpte_try_unfold(mm, addr, ptep, pte); @@ -1278,7 +1278,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte) } #define set_ptes set_ptes -static inline void set_ptes(struct mm_struct *mm, unsigned long addr, +static __always_inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { pte = pte_mknoncont(pte); @@ -1360,8 +1360,8 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, } #define wrprotect_ptes wrprotect_ptes -static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, unsigned int nr) +static __always_inline void wrprotect_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned int nr) { if (likely(nr == 1)) { /* From f0c2264958e18bc7bc35b567d51b99461e4de34f Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:05 +0000 Subject: [PATCH 211/435] arm64/mm: automatically fold contpte mappings There are situations where a change to a single PTE could cause the contpte block in which it resides to become foldable (i.e. could be repainted with the contiguous bit). Such situations arise, for example, when user space temporarily changes protections, via mprotect, for individual pages, such can be the case for certain garbage collectors. We would like to detect when such a PTE change occurs. However this can be expensive due to the amount of checking required. Therefore only perform the checks when an indiviual PTE is modified via mprotect (ptep_modify_prot_commit() -> set_pte_at() -> set_ptes(nr=1)) and only when we are setting the final PTE in a contpte-aligned block. Link: https://lkml.kernel.org/r/20240215103205.2607016-19-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 26 +++++++++++++ arch/arm64/mm/contpte.c | 64 ++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 8310875133ff..401087e8a43d 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1185,6 +1185,8 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma, * where it is possible and makes sense to do so. The PTE_CONT bit is considered * a private implementation detail of the public ptep API (see below). */ +extern void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); extern void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte); @@ -1206,6 +1208,29 @@ extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty); +static __always_inline void contpte_try_fold(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte) +{ + /* + * Only bother trying if both the virtual and physical addresses are + * aligned and correspond to the last entry in a contig range. The core + * code mostly modifies ranges from low to high, so this is the likely + * the last modification in the contig range, so a good time to fold. + * We can't fold special mappings, because there is no associated folio. + */ + + const unsigned long contmask = CONT_PTES - 1; + bool valign = ((addr >> PAGE_SHIFT) & contmask) == contmask; + + if (unlikely(valign)) { + bool palign = (pte_pfn(pte) & contmask) == contmask; + + if (unlikely(palign && + pte_valid(pte) && !pte_cont(pte) && !pte_special(pte))) + __contpte_try_fold(mm, addr, ptep, pte); + } +} + static __always_inline void contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -1286,6 +1311,7 @@ static __always_inline void set_ptes(struct mm_struct *mm, unsigned long addr, if (likely(nr == 1)) { contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); __set_ptes(mm, addr, ptep, pte, 1); + contpte_try_fold(mm, addr, ptep, pte); } else { contpte_set_ptes(mm, addr, ptep, pte, nr); } diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 50e0173dc5ee..16788f07716d 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -73,6 +73,70 @@ static void contpte_convert(struct mm_struct *mm, unsigned long addr, __set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES); } +void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + /* + * We have already checked that the virtual and pysical addresses are + * correctly aligned for a contpte mapping in contpte_try_fold() so the + * remaining checks are to ensure that the contpte range is fully + * covered by a single folio, and ensure that all the ptes are valid + * with contiguous PFNs and matching prots. We ignore the state of the + * access and dirty bits for the purpose of deciding if its a contiguous + * range; the folding process will generate a single contpte entry which + * has a single access and dirty bit. Those 2 bits are the logical OR of + * their respective bits in the constituent pte entries. In order to + * ensure the contpte range is covered by a single folio, we must + * recover the folio from the pfn, but special mappings don't have a + * folio backing them. Fortunately contpte_try_fold() already checked + * that the pte is not special - we never try to fold special mappings. + * Note we can't use vm_normal_page() for this since we don't have the + * vma. + */ + + unsigned long folio_start, folio_end; + unsigned long cont_start, cont_end; + pte_t expected_pte, subpte; + struct folio *folio; + struct page *page; + unsigned long pfn; + pte_t *orig_ptep; + pgprot_t prot; + + int i; + + if (!mm_is_user(mm)) + return; + + page = pte_page(pte); + folio = page_folio(page); + folio_start = addr - (page - &folio->page) * PAGE_SIZE; + folio_end = folio_start + folio_nr_pages(folio) * PAGE_SIZE; + cont_start = ALIGN_DOWN(addr, CONT_PTE_SIZE); + cont_end = cont_start + CONT_PTE_SIZE; + + if (folio_start > cont_start || folio_end < cont_end) + return; + + pfn = ALIGN_DOWN(pte_pfn(pte), CONT_PTES); + prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); + expected_pte = pfn_pte(pfn, prot); + orig_ptep = ptep; + ptep = contpte_align_down(ptep); + + for (i = 0; i < CONT_PTES; i++) { + subpte = pte_mkold(pte_mkclean(__ptep_get(ptep))); + if (!pte_same(subpte, expected_pte)) + return; + expected_pte = pte_advance_pfn(expected_pte, 1); + ptep++; + } + + pte = pte_mkcont(pte); + contpte_convert(mm, addr, orig_ptep, pte); +} +EXPORT_SYMBOL(__contpte_try_fold); + void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { From f6932a275461e339de69df01195c50951f039153 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 8 Feb 2024 13:49:02 -0500 Subject: [PATCH 212/435] nvdimm/pmem: fix leak on dax_add_host() failure Fix a leak on dax_add_host() error, where "goto out_cleanup_dax" is done before setting pmem->dax_dev, which therefore issues the two following calls on NULL pointers: out_cleanup_dax: kill_dax(pmem->dax_dev); put_dax(pmem->dax_dev); Link: https://lkml.kernel.org/r/20240208184913.484340-1-mathieu.desnoyers@efficios.com Link: https://lkml.kernel.org/r/20240208184913.484340-2-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Reviewed-by: Dan Williams Reviewed-by: Fan Ni Reviewed-by: Dave Jiang Cc: Alasdair Kergon Cc: Mike Snitzer Cc: Mikulas Patocka Cc: Dan Williams Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Dave Chinner Cc: Linus Torvalds Signed-off-by: Andrew Morton --- drivers/nvdimm/pmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 4e8fdcb3f1c8..9fe358090720 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -566,12 +566,11 @@ static int pmem_attach_disk(struct device *dev, set_dax_nomc(dax_dev); if (is_nvdimm_sync(nd_region)) set_dax_synchronous(dax_dev); + pmem->dax_dev = dax_dev; rc = dax_add_host(dax_dev, disk); if (rc) goto out_cleanup_dax; dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); - pmem->dax_dev = dax_dev; - rc = device_add_disk(dev, disk, pmem_attribute_groups); if (rc) goto out_remove_host; From 2807c54b3809ca5e51e5a9dfa5e9ddb8993a0e6f Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:25 -0500 Subject: [PATCH 213/435] dax: add empty static inline for CONFIG_DAX=n Patch series "Introduce cpu_dcache_is_aliasing() to fix DAX regression", v6. This commit introduced in v4.0 prevents building FS_DAX on 32-bit ARM, even on ARMv7 which does not have virtually aliased data caches: commit d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Even though it used to work fine before. The root of the issue here is the fact that DAX was never designed to handle virtually aliasing data caches (VIVT and VIPT with aliasing data cache). It touches the pages through their linear mapping, which is not consistent with the userspace mappings with virtually aliasing data caches. This patch series introduces cpu_dcache_is_aliasing() with the new Kconfig option ARCH_HAS_CPU_CACHE_ALIASING and implements it for all architectures. The implementation of cpu_dcache_is_aliasing() is either evaluated to a constant at compile-time or a runtime check, which is what is needed on ARM. With this we can basically narrow down the list of architectures which are unsupported by DAX to those which are really affected. This patch (of 9): When building a kernel with CONFIG_DAX=n, all uses of set_dax_nocache() and set_dax_nomc() need to be either within regions of code or compile units which are explicitly not compiled, or they need to rely on compiler optimizations to eliminate calls to those undefined symbols. It appears that at least the openrisc and loongarch architectures don't end up eliminating those undefined symbols even if they are provably within code which is eliminated due to conditional branches depending on constants. Implement empty static inline functions for set_dax_nocache() and set_dax_nomc() in CONFIG_DAX=n to ensure those undefined references are removed. Link: https://lkml.kernel.org/r/20240215144633.96437-1-mathieu.desnoyers@efficios.com Link: https://lkml.kernel.org/r/20240215144633.96437-2-mathieu.desnoyers@efficios.com Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202402140037.wGfA1kqX-lkp@intel.com/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202402131351.a0FZOgEG-lkp@intel.com/ Fixes: 7ac5360cd4d0 ("dax: remove the copy_from_iter and copy_to_iter methods") Signed-off-by: Mathieu Desnoyers Cc: Christoph Hellwig Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Dave Chinner Cc: Michael Sclafani Cc: Alasdair Kergon Cc: Heiko Carstens Cc: Mike Snitzer Cc: Mikulas Patocka Signed-off-by: Andrew Morton --- include/linux/dax.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/include/linux/dax.h b/include/linux/dax.h index b463502b16e1..e3ffe7c7f01d 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -63,6 +63,8 @@ void kill_dax(struct dax_device *dax_dev); void dax_write_cache(struct dax_device *dax_dev, bool wc); bool dax_write_cache_enabled(struct dax_device *dax_dev); bool dax_synchronous(struct dax_device *dax_dev); +void set_dax_nocache(struct dax_device *dax_dev); +void set_dax_nomc(struct dax_device *dax_dev); void set_dax_synchronous(struct dax_device *dax_dev); size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); @@ -109,6 +111,12 @@ static inline bool dax_synchronous(struct dax_device *dax_dev) { return true; } +static inline void set_dax_nocache(struct dax_device *dax_dev) +{ +} +static inline void set_dax_nomc(struct dax_device *dax_dev) +{ +} static inline void set_dax_synchronous(struct dax_device *dax_dev) { } @@ -124,9 +132,6 @@ static inline size_t dax_recovery_write(struct dax_device *dax_dev, } #endif -void set_dax_nocache(struct dax_device *dax_dev); -void set_dax_nomc(struct dax_device *dax_dev); - struct writeback_control; #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); From 6d439c18d9b190ab1e0f1196bc45590f95752bf1 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:26 -0500 Subject: [PATCH 214/435] dax: alloc_dax() return ERR_PTR(-EOPNOTSUPP) for CONFIG_DAX=n Change the return value from NULL to PTR_ERR(-EOPNOTSUPP) for CONFIG_DAX=n to be consistent with the fact that CONFIG_DAX=y never returns NULL. This is done in preparation for using cpu_dcache_is_aliasing() in a following change which will properly support architectures which detect data cache aliasing at runtime. Link: https://lkml.kernel.org/r/20240215144633.96437-3-mathieu.desnoyers@efficios.com Fixes: 4e4ced93794a ("dax: Move mandatory ->zero_page_range() check in alloc_dax()") Signed-off-by: Mathieu Desnoyers Reviewed-by: Dan Williams Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Alasdair Kergon Cc: Christoph Hellwig Cc: Dave Chinner Cc: Heiko Carstens Cc: kernel test robot Cc: Michael Sclafani Cc: Mike Snitzer Cc: Mikulas Patocka Signed-off-by: Andrew Morton --- drivers/dax/super.c | 5 +++++ include/linux/dax.h | 6 +----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index f4b635526345..97ef2a9d878d 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -319,6 +319,11 @@ EXPORT_SYMBOL_GPL(dax_alive); * that any fault handlers or operations that might have seen * dax_alive(), have completed. Any operations that start after * synchronize_srcu() has run will abort upon seeing !dax_alive(). + * + * Note, because alloc_dax() returns an ERR_PTR() on error, callers + * typically store its result into a local variable in order to check + * the result. Therefore, care must be taken to populate the struct + * device dax_dev field make sure the dax_dev is not leaked. */ void kill_dax(struct dax_device *dax_dev) { diff --git a/include/linux/dax.h b/include/linux/dax.h index e3ffe7c7f01d..9d3e3327af4c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -88,11 +88,7 @@ static inline void *dax_holder(struct dax_device *dax_dev) static inline struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) { - /* - * Callers should check IS_ENABLED(CONFIG_DAX) to know if this - * NULL is an error or expected. - */ - return NULL; + return ERR_PTR(-EOPNOTSUPP); } static inline void put_dax(struct dax_device *dax_dev) { From f4d373ddd699561d81e79f17bb68064b1a9b686b Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:27 -0500 Subject: [PATCH 215/435] nvdimm/pmem: Treat alloc_dax() -EOPNOTSUPP failure as non-fatal In preparation for checking whether the architecture has data cache aliasing within alloc_dax(), modify the error handling of nvdimm/pmem pmem_attach_disk() to treat alloc_dax() -EOPNOTSUPP failure as non-fatal. [ Based on commit "nvdimm/pmem: Fix leak on dax_add_host() failure". ] Link: https://lkml.kernel.org/r/20240215144633.96437-4-mathieu.desnoyers@efficios.com Fixes: d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Signed-off-by: Mathieu Desnoyers Reviewed-by: Dan Williams Cc: Alasdair Kergon Cc: Mike Snitzer Cc: Mikulas Patocka Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Christoph Hellwig Cc: Dave Chinner Cc: Heiko Carstens Cc: kernel test robot Cc: Michael Sclafani Signed-off-by: Andrew Morton --- drivers/nvdimm/pmem.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 9fe358090720..e9898457a7bd 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -560,17 +560,19 @@ static int pmem_attach_disk(struct device *dev, dax_dev = alloc_dax(pmem, &pmem_dax_ops); if (IS_ERR(dax_dev)) { rc = PTR_ERR(dax_dev); - goto out; + if (rc != -EOPNOTSUPP) + goto out; + } else { + set_dax_nocache(dax_dev); + set_dax_nomc(dax_dev); + if (is_nvdimm_sync(nd_region)) + set_dax_synchronous(dax_dev); + pmem->dax_dev = dax_dev; + rc = dax_add_host(dax_dev, disk); + if (rc) + goto out_cleanup_dax; + dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); } - set_dax_nocache(dax_dev); - set_dax_nomc(dax_dev); - if (is_nvdimm_sync(nd_region)) - set_dax_synchronous(dax_dev); - pmem->dax_dev = dax_dev; - rc = dax_add_host(dax_dev, disk); - if (rc) - goto out_cleanup_dax; - dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); rc = device_add_disk(dev, disk, pmem_attribute_groups); if (rc) goto out_remove_host; From c29290728decdf58b48b10a58861beca3f2f87f0 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:28 -0500 Subject: [PATCH 216/435] dm: treat alloc_dax() -EOPNOTSUPP failure as non-fatal In preparation for checking whether the architecture has data cache aliasing within alloc_dax(), modify the error handling of dm alloc_dev() to treat alloc_dax() -EOPNOTSUPP failure as non-fatal. Link: https://lkml.kernel.org/r/20240215144633.96437-5-mathieu.desnoyers@efficios.com Fixes: d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Suggested-by: Dan Williams Signed-off-by: Mathieu Desnoyers Reviewed-by: Dan Williams Cc: Alasdair Kergon Cc: Mike Snitzer Cc: Mikulas Patocka Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Christoph Hellwig Cc: Dave Chinner Cc: Heiko Carstens Cc: kernel test robot Cc: Michael Sclafani Signed-off-by: Andrew Morton --- drivers/md/dm.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8dcabf84d866..10c73af93d00 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2054,6 +2054,7 @@ static void cleanup_mapped_device(struct mapped_device *md) static struct mapped_device *alloc_dev(int minor) { int r, numa_node_id = dm_get_numa_node(); + struct dax_device *dax_dev; struct mapped_device *md; void *old_md; @@ -2122,15 +2123,15 @@ static struct mapped_device *alloc_dev(int minor) md->disk->private_data = md; sprintf(md->disk->disk_name, "dm-%d", minor); - if (IS_ENABLED(CONFIG_FS_DAX)) { - md->dax_dev = alloc_dax(md, &dm_dax_ops); - if (IS_ERR(md->dax_dev)) { - md->dax_dev = NULL; + dax_dev = alloc_dax(md, &dm_dax_ops); + if (IS_ERR(dax_dev)) { + if (PTR_ERR(dax_dev) != -EOPNOTSUPP) goto bad; - } - set_dax_nocache(md->dax_dev); - set_dax_nomc(md->dax_dev); - if (dax_add_host(md->dax_dev, md->disk)) + } else { + set_dax_nocache(dax_dev); + set_dax_nomc(dax_dev); + md->dax_dev = dax_dev; + if (dax_add_host(dax_dev, md->disk)) goto bad; } From cf7fe690abbbe520034843b97423d35e0acf9bd8 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:29 -0500 Subject: [PATCH 217/435] dcssblk: handle alloc_dax() -EOPNOTSUPP failure In preparation for checking whether the architecture has data cache aliasing within alloc_dax(), modify the error handling of dcssblk dcssblk_add_store() to handle alloc_dax() -EOPNOTSUPP failures. Considering that s390 is not a data cache aliasing architecture, and considering that DCSSBLK selects DAX, a return value of -EOPNOTSUPP from alloc_dax() should make dcssblk_add_store() fail. Link: https://lkml.kernel.org/r/20240215144633.96437-6-mathieu.desnoyers@efficios.com Fixes: d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Signed-off-by: Mathieu Desnoyers Reviewed-by: Dan Williams Acked-by: Heiko Carstens Cc: Alasdair Kergon Cc: Mike Snitzer Cc: Mikulas Patocka Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Christoph Hellwig Cc: Dave Chinner Cc: kernel test robot Cc: Michael Sclafani Signed-off-by: Andrew Morton --- drivers/s390/block/dcssblk.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 4b7ecd4fd431..f363c1d51d9a 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -549,6 +549,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char int rc, i, j, num_of_segments; struct dcssblk_dev_info *dev_info; struct segment_info *seg_info, *temp; + struct dax_device *dax_dev; char *local_buf; unsigned long seg_byte_size; @@ -677,13 +678,13 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char if (rc) goto put_dev; - dev_info->dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops); - if (IS_ERR(dev_info->dax_dev)) { - rc = PTR_ERR(dev_info->dax_dev); - dev_info->dax_dev = NULL; + dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops); + if (IS_ERR(dax_dev)) { + rc = PTR_ERR(dax_dev); goto put_dev; } - set_dax_synchronous(dev_info->dax_dev); + set_dax_synchronous(dax_dev); + dev_info->dax_dev = dax_dev; rc = dax_add_host(dev_info->dax_dev, dev_info->gd); if (rc) goto out_dax; From 562ce8285b5da3f0d2087341f262c0c728d43f31 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:30 -0500 Subject: [PATCH 218/435] virtio: treat alloc_dax() -EOPNOTSUPP failure as non-fatal In preparation for checking whether the architecture has data cache aliasing within alloc_dax(), modify the error handling of virtio virtio_fs_setup_dax() to treat alloc_dax() -EOPNOTSUPP failure as non-fatal. Link: https://lkml.kernel.org/r/20240215144633.96437-7-mathieu.desnoyers@efficios.com Co-developed-by: Dan Williams Signed-off-by: Dan Williams Fixes: d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Signed-off-by: Mathieu Desnoyers Cc: Alasdair Kergon Cc: Mike Snitzer Cc: Mikulas Patocka Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Christoph Hellwig Cc: Dave Chinner Cc: Heiko Carstens Cc: kernel test robot Cc: Michael Sclafani Signed-off-by: Andrew Morton --- fs/fuse/virtio_fs.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 5f1be1da92ce..a28466c2da71 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "fuse_i.h" @@ -795,8 +796,11 @@ static void virtio_fs_cleanup_dax(void *data) put_dax(dax_dev); } +DEFINE_FREE(cleanup_dax, struct dax_dev *, if (!IS_ERR_OR_NULL(_T)) virtio_fs_cleanup_dax(_T)) + static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) { + struct dax_device *dax_dev __free(cleanup_dax) = NULL; struct virtio_shm_region cache_reg; struct dev_pagemap *pgmap; bool have_cache; @@ -804,6 +808,12 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) if (!IS_ENABLED(CONFIG_FUSE_DAX)) return 0; + dax_dev = alloc_dax(fs, &virtio_fs_dax_ops); + if (IS_ERR(dax_dev)) { + int rc = PTR_ERR(dax_dev); + return rc == -EOPNOTSUPP ? 0 : rc; + } + /* Get cache region */ have_cache = virtio_get_shm_region(vdev, &cache_reg, (u8)VIRTIO_FS_SHMCAP_ID_CACHE); @@ -849,10 +859,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n", __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len); - fs->dax_dev = alloc_dax(fs, &virtio_fs_dax_ops); - if (IS_ERR(fs->dax_dev)) - return PTR_ERR(fs->dax_dev); - + fs->dax_dev = no_free_ptr(dax_dev); return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, fs->dax_dev); } From 1df4ca0155acb37b9b1d03ab91323d70a309ff54 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:31 -0500 Subject: [PATCH 219/435] dax: check for data cache aliasing at runtime Replace the following fs/Kconfig:FS_DAX dependency: depends on !(ARM || MIPS || SPARC) By a runtime check within alloc_dax(). This runtime check returns ERR_PTR(-EOPNOTSUPP) if the @ops parameter is non-NULL (which means the kernel is using an aliased mapping) on an architecture which has data cache aliasing. Change the return value from NULL to PTR_ERR(-EOPNOTSUPP) for CONFIG_DAX=n for consistency. This is done in preparation for using cpu_dcache_is_aliasing() in a following change which will properly support architectures which detect data cache aliasing at runtime. Link: https://lkml.kernel.org/r/20240215144633.96437-8-mathieu.desnoyers@efficios.com Fixes: d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Signed-off-by: Mathieu Desnoyers Reviewed-by: Dan Williams Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Alasdair Kergon Cc: Christoph Hellwig Cc: Dave Chinner Cc: Heiko Carstens Cc: kernel test robot Cc: Michael Sclafani Cc: Mike Snitzer Cc: Mikulas Patocka Signed-off-by: Andrew Morton --- drivers/dax/super.c | 10 ++++++++++ fs/Kconfig | 1 - 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 97ef2a9d878d..7643b1a078d9 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -451,6 +451,16 @@ struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) dev_t devt; int minor; + /* + * Unavailable on architectures with virtually aliased data caches, + * except for device-dax (NULL operations pointer), which does + * not use aliased mappings from the kernel. + */ + if (ops && (IS_ENABLED(CONFIG_ARM) || + IS_ENABLED(CONFIG_MIPS) || + IS_ENABLED(CONFIG_SPARC))) + return ERR_PTR(-EOPNOTSUPP); + if (WARN_ON_ONCE(ops && !ops->zero_page_range)) return ERR_PTR(-EINVAL); diff --git a/fs/Kconfig b/fs/Kconfig index 89fdbefd1075..713c8655fa0a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -60,7 +60,6 @@ endif # BLOCK config FS_DAX bool "File system based Direct Access (DAX) support" depends on MMU - depends on !(ARM || MIPS || SPARC) depends on ZONE_DEVICE || FS_DAX_LIMITED select FS_IOMAP select DAX From 8690bbcf3b7010b31fdbf3851e1add6ae19b8624 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:32 -0500 Subject: [PATCH 220/435] Introduce cpu_dcache_is_aliasing() across all architectures Introduce a generic way to query whether the data cache is virtually aliased on all architectures. Its purpose is to ensure that subsystems which are incompatible with virtually aliased data caches (e.g. FS_DAX) can reliably query this. For data cache aliasing, there are three scenarios dependending on the architecture. Here is a breakdown based on my understanding: A) The data cache is always aliasing: * arc * csky * m68k (note: shared memory mappings are incoherent ? SHMLBA is missing there.) * sh * parisc B) The data cache aliasing is statically known or depends on querying CPU state at runtime: * arm (cache_is_vivt() || cache_is_vipt_aliasing()) * mips (cpu_has_dc_aliases) * nios2 (NIOS2_DCACHE_SIZE > PAGE_SIZE) * sparc32 (vac_cache_size > PAGE_SIZE) * sparc64 (L1DCACHE_SIZE > PAGE_SIZE) * xtensa (DCACHE_WAY_SIZE > PAGE_SIZE) C) The data cache is never aliasing: * alpha * arm64 (aarch64) * hexagon * loongarch (but with incoherent write buffers, which are disabled since commit d23b7795 ("LoongArch: Change SHMLBA from SZ_64K to PAGE_SIZE")) * microblaze * openrisc * powerpc * riscv * s390 * um * x86 Require architectures in A) and B) to select ARCH_HAS_CPU_CACHE_ALIASING and implement "cpu_dcache_is_aliasing()". Architectures in C) don't select ARCH_HAS_CPU_CACHE_ALIASING, and thus cpu_dcache_is_aliasing() simply evaluates to "false". Note that this leaves "cpu_icache_is_aliasing()" to be implemented as future work. This would be useful to gate features like XIP on architectures which have aliasing CPU dcache-icache but not CPU dcache-dcache. Use "cpu_dcache" and "cpu_cache" rather than just "dcache" and "cache" to clarify that we really mean "CPU data cache" and "CPU cache" to eliminate any possible confusion with VFS "dentry cache" and "page cache". Link: https://lore.kernel.org/lkml/20030910210416.GA24258@mail.jlokier.co.uk/ Link: https://lkml.kernel.org/r/20240215144633.96437-9-mathieu.desnoyers@efficios.com Fixes: d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Signed-off-by: Mathieu Desnoyers Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Alasdair Kergon Cc: Christoph Hellwig Cc: Dave Chinner Cc: Heiko Carstens Cc: kernel test robot Cc: Michael Sclafani Cc: Mike Snitzer Cc: Mikulas Patocka Signed-off-by: Andrew Morton --- arch/arc/Kconfig | 1 + arch/arc/include/asm/cachetype.h | 9 +++++++++ arch/arm/Kconfig | 1 + arch/arm/include/asm/cachetype.h | 2 ++ arch/csky/Kconfig | 1 + arch/csky/include/asm/cachetype.h | 9 +++++++++ arch/m68k/Kconfig | 1 + arch/m68k/include/asm/cachetype.h | 9 +++++++++ arch/mips/Kconfig | 1 + arch/mips/include/asm/cachetype.h | 9 +++++++++ arch/nios2/Kconfig | 1 + arch/nios2/include/asm/cachetype.h | 10 ++++++++++ arch/parisc/Kconfig | 1 + arch/parisc/include/asm/cachetype.h | 9 +++++++++ arch/sh/Kconfig | 1 + arch/sh/include/asm/cachetype.h | 9 +++++++++ arch/sparc/Kconfig | 1 + arch/sparc/include/asm/cachetype.h | 14 ++++++++++++++ arch/xtensa/Kconfig | 1 + arch/xtensa/include/asm/cachetype.h | 10 ++++++++++ include/linux/cacheinfo.h | 6 ++++++ mm/Kconfig | 6 ++++++ 22 files changed, 112 insertions(+) create mode 100644 arch/arc/include/asm/cachetype.h create mode 100644 arch/csky/include/asm/cachetype.h create mode 100644 arch/m68k/include/asm/cachetype.h create mode 100644 arch/mips/include/asm/cachetype.h create mode 100644 arch/nios2/include/asm/cachetype.h create mode 100644 arch/parisc/include/asm/cachetype.h create mode 100644 arch/sh/include/asm/cachetype.h create mode 100644 arch/sparc/include/asm/cachetype.h create mode 100644 arch/xtensa/include/asm/cachetype.h diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 1b0483c51cc1..7d294a3242a4 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -6,6 +6,7 @@ config ARC def_bool y select ARC_TIMERS + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DMA_PREP_COHERENT diff --git a/arch/arc/include/asm/cachetype.h b/arch/arc/include/asm/cachetype.h new file mode 100644 index 000000000000..05fc7ed59712 --- /dev/null +++ b/arch/arc/include/asm/cachetype.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_ARC_CACHETYPE_H +#define __ASM_ARC_CACHETYPE_H + +#include + +#define cpu_dcache_is_aliasing() true + +#endif diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 0af6709570d1..66a8e64b226e 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -5,6 +5,7 @@ config ARM select ARCH_32BIT_OFF_T select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE if HAVE_KRETPROBES && FRAME_POINTER && !ARM_UNWIND select ARCH_HAS_BINFMT_FLAT + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_CPU_FINALIZE_INIT if MMU select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL if MMU diff --git a/arch/arm/include/asm/cachetype.h b/arch/arm/include/asm/cachetype.h index e8c30430be33..b9dbe1d4c8fe 100644 --- a/arch/arm/include/asm/cachetype.h +++ b/arch/arm/include/asm/cachetype.h @@ -20,6 +20,8 @@ extern unsigned int cacheid; #define icache_is_vipt_aliasing() cacheid_is(CACHEID_VIPT_I_ALIASING) #define icache_is_pipt() cacheid_is(CACHEID_PIPT) +#define cpu_dcache_is_aliasing() (cache_is_vivt() || cache_is_vipt_aliasing()) + /* * __LINUX_ARM_ARCH__ is the minimum supported CPU architecture * Mask out support which will never be present on newer CPUs. diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index cf2a6fd7dff8..8a91eccf76dc 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -2,6 +2,7 @@ config CSKY def_bool y select ARCH_32BIT_OFF_T + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SYNC_DMA_FOR_CPU diff --git a/arch/csky/include/asm/cachetype.h b/arch/csky/include/asm/cachetype.h new file mode 100644 index 000000000000..98cbe3af662f --- /dev/null +++ b/arch/csky/include/asm/cachetype.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_CSKY_CACHETYPE_H +#define __ASM_CSKY_CACHETYPE_H + +#include + +#define cpu_dcache_is_aliasing() true + +#endif diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 4b3e93cac723..a9c3e3de0c6d 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -3,6 +3,7 @@ config M68K bool default y select ARCH_32BIT_OFF_T + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_BINFMT_FLAT select ARCH_HAS_CPU_FINALIZE_INIT if MMU select ARCH_HAS_CURRENT_STACK_POINTER diff --git a/arch/m68k/include/asm/cachetype.h b/arch/m68k/include/asm/cachetype.h new file mode 100644 index 000000000000..7fad5d9ab8fe --- /dev/null +++ b/arch/m68k/include/asm/cachetype.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_M68K_CACHETYPE_H +#define __ASM_M68K_CACHETYPE_H + +#include + +#define cpu_dcache_is_aliasing() true + +#endif diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 797ae590ebdb..ab1c8bd96666 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -4,6 +4,7 @@ config MIPS default y select ARCH_32BIT_OFF_T if !64BIT select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_CPU_FINALIZE_INIT select ARCH_HAS_CURRENT_STACK_POINTER if !CC_IS_CLANG || CLANG_VERSION >= 140000 select ARCH_HAS_DEBUG_VIRTUAL if !64BIT diff --git a/arch/mips/include/asm/cachetype.h b/arch/mips/include/asm/cachetype.h new file mode 100644 index 000000000000..9f4ba2fe1155 --- /dev/null +++ b/arch/mips/include/asm/cachetype.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_MIPS_CACHETYPE_H +#define __ASM_MIPS_CACHETYPE_H + +#include + +#define cpu_dcache_is_aliasing() cpu_has_dc_aliases + +#endif diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 58d9565dc2c7..6b3a14633d2f 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -2,6 +2,7 @@ config NIOS2 def_bool y select ARCH_32BIT_OFF_T + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE diff --git a/arch/nios2/include/asm/cachetype.h b/arch/nios2/include/asm/cachetype.h new file mode 100644 index 000000000000..eb9c416b8a1c --- /dev/null +++ b/arch/nios2/include/asm/cachetype.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_NIOS2_CACHETYPE_H +#define __ASM_NIOS2_CACHETYPE_H + +#include +#include + +#define cpu_dcache_is_aliasing() (NIOS2_DCACHE_SIZE > PAGE_SIZE) + +#endif diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 5c845e8d59d9..da6e97ba46a6 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -8,6 +8,7 @@ config PARISC select HAVE_FUNCTION_GRAPH_TRACER select HAVE_SYSCALL_TRACEPOINTS select ARCH_WANT_FRAME_POINTERS + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_DMA_ALLOC if PA11 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/parisc/include/asm/cachetype.h b/arch/parisc/include/asm/cachetype.h new file mode 100644 index 000000000000..e0868a1d3c47 --- /dev/null +++ b/arch/parisc/include/asm/cachetype.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_PARISC_CACHETYPE_H +#define __ASM_PARISC_CACHETYPE_H + +#include + +#define cpu_dcache_is_aliasing() true + +#endif diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 7500521b2b98..2ad3e29f0ebe 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -2,6 +2,7 @@ config SUPERH def_bool y select ARCH_32BIT_OFF_T + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM && MMU select ARCH_ENABLE_MEMORY_HOTREMOVE if SPARSEMEM && MMU select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) diff --git a/arch/sh/include/asm/cachetype.h b/arch/sh/include/asm/cachetype.h new file mode 100644 index 000000000000..a5fffe536068 --- /dev/null +++ b/arch/sh/include/asm/cachetype.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_SH_CACHETYPE_H +#define __ASM_SH_CACHETYPE_H + +#include + +#define cpu_dcache_is_aliasing() true + +#endif diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 204c43cb3d43..cbec48219d9e 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -13,6 +13,7 @@ config 64BIT config SPARC bool default y + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_MIGHT_HAVE_PC_PARPORT if SPARC64 && PCI select ARCH_MIGHT_HAVE_PC_SERIO select DMA_OPS diff --git a/arch/sparc/include/asm/cachetype.h b/arch/sparc/include/asm/cachetype.h new file mode 100644 index 000000000000..caf1c0045892 --- /dev/null +++ b/arch/sparc/include/asm/cachetype.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_SPARC_CACHETYPE_H +#define __ASM_SPARC_CACHETYPE_H + +#include + +#ifdef CONFIG_SPARC32 +extern int vac_cache_size; +#define cpu_dcache_is_aliasing() (vac_cache_size > PAGE_SIZE) +#else +#define cpu_dcache_is_aliasing() (L1DCACHE_SIZE > PAGE_SIZE) +#endif + +#endif diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 6f248d87e496..6689a8547346 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -2,6 +2,7 @@ config XTENSA def_bool y select ARCH_32BIT_OFF_T + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_BINFMT_FLAT if !MMU select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VM_PGTABLE diff --git a/arch/xtensa/include/asm/cachetype.h b/arch/xtensa/include/asm/cachetype.h new file mode 100644 index 000000000000..51bd49e2a1c5 --- /dev/null +++ b/arch/xtensa/include/asm/cachetype.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_XTENSA_CACHETYPE_H +#define __ASM_XTENSA_CACHETYPE_H + +#include +#include + +#define cpu_dcache_is_aliasing() (DCACHE_WAY_SIZE > PAGE_SIZE) + +#endif diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index d504eb4b49ab..2cb15fe4fe12 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -138,4 +138,10 @@ static inline int get_cpu_cacheinfo_id(int cpu, int level) #define use_arch_cache_info() (false) #endif +#ifndef CONFIG_ARCH_HAS_CPU_CACHE_ALIASING +#define cpu_dcache_is_aliasing() false +#else +#include +#endif + #endif /* _LINUX_CACHEINFO_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 2b267553f793..b924f4a5a3ef 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -973,6 +973,12 @@ config IDLE_PAGE_TRACKING See Documentation/admin-guide/mm/idle_page_tracking.rst for more details. +# Architectures which implement cpu_dcache_is_aliasing() to query +# whether the data caches are aliased (VIVT or VIPT with dcache +# aliasing) need to select this. +config ARCH_HAS_CPU_CACHE_ALIASING + bool + config ARCH_HAS_CACHE_LINE_SIZE bool From 902ccb86ed02eea5b81c2afb514c0a7d72f63de2 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:33 -0500 Subject: [PATCH 221/435] dax: fix incorrect list of data cache aliasing architectures commit d92576f1167c ("dax: does not work correctly with virtual aliasing caches") prevents DAX from building on architectures with virtually aliased dcache with: depends on !(ARM || MIPS || SPARC) This check is too broad (e.g. recent ARMv7 don't have virtually aliased dcaches), and also misses many other architectures with virtually aliased data cache. This is a regression introduced in the v4.0 Linux kernel where the dax mount option is removed for 32-bit ARMv7 boards which have no data cache aliasing, and therefore should work fine with FS_DAX. This was turned into the following check in alloc_dax() by a preparatory change: if (ops && (IS_ENABLED(CONFIG_ARM) || IS_ENABLED(CONFIG_MIPS) || IS_ENABLED(CONFIG_SPARC))) return NULL; Use cpu_dcache_is_aliasing() instead to figure out whether the environment has aliasing data caches. Link: https://lkml.kernel.org/r/20240215144633.96437-10-mathieu.desnoyers@efficios.com Fixes: d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Signed-off-by: Mathieu Desnoyers Reviewed-by: Dan Williams Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Alasdair Kergon Cc: Christoph Hellwig Cc: Dave Chinner Cc: Heiko Carstens Cc: kernel test robot Cc: Michael Sclafani Cc: Mike Snitzer Cc: Mikulas Patocka Signed-off-by: Andrew Morton --- drivers/dax/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 7643b1a078d9..54e528779877 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "dax-private.h" /** @@ -456,9 +457,7 @@ struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) * except for device-dax (NULL operations pointer), which does * not use aliased mappings from the kernel. */ - if (ops && (IS_ENABLED(CONFIG_ARM) || - IS_ENABLED(CONFIG_MIPS) || - IS_ENABLED(CONFIG_SPARC))) + if (ops && cpu_dcache_is_aliasing()) return ERR_PTR(-EOPNOTSUPP); if (WARN_ON_ONCE(ops && !ops->zero_page_range)) From 059ab7be09d4ff0cab4f43ffbf225a2693da0bd7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Feb 2024 20:53:05 +0000 Subject: [PATCH 222/435] rmap: replace two calls to compound_order with folio_order Removes two unnecessary conversions from folio to page. Should be no difference in behaviour. Link: https://lkml.kernel.org/r/20240215205307.674707-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/rmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 1cf2bffa48ed..3746a5531018 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2169,7 +2169,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); trace_set_migration_pte(pvmw.address, pte_val(swp_pte), - compound_order(&folio->page)); + folio_order(folio)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. @@ -2261,7 +2261,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, else set_pte_at(mm, address, pvmw.pte, swp_pte); trace_set_migration_pte(address, pte_val(swp_pte), - compound_order(&folio->page)); + folio_order(folio)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. From 952237b5a9b79acf1212b40ad1e993557e3cb0ca Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Thu, 15 Feb 2024 18:39:55 +0000 Subject: [PATCH 223/435] kasan: increase the number of bits to shift when recording extra timestamps In 5d4c6ac94694 ("kasan: record and report more information") I thought that printk only displays a maximum of 99999 seconds, but actually printk can display a larger number of seconds. So increase the number of bits to shift when recording the extra timestamp (44 bits), without affecting the precision, shift it right by 9 bits, discarding all bits that do not affect the microsecond part (nanoseconds will not be shown). Currently the maximum time that can be displayed is 9007199.254740s, because 11111111111111111111111111111111111111111111 (44 bits) << 9 = 11111111111111111111111111111111111111111111000000000 = 9007199.254740 Link: https://lkml.kernel.org/r/AM6PR03MB58481629F2F28CE007412139994D2@AM6PR03MB5848.eurprd03.prod.outlook.com Fixes: 5d4c6ac94694 ("kasan: record and report more information") Signed-off-by: Juntong Deng Acked-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/common.c | 2 +- mm/kasan/report.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 610efae91220..f2747ed30da0 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -55,7 +55,7 @@ void kasan_set_track(struct kasan_track *track, depot_stack_handle_t stack) u64 ts_nsec = local_clock(); track->cpu = cpu; - track->timestamp = ts_nsec >> 3; + track->timestamp = ts_nsec >> 9; #endif /* CONFIG_KASAN_EXTRA_INFO */ track->pid = current->pid; track->stack = stack; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 7afa4feb03e1..b48c768acc84 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -267,7 +267,7 @@ static void print_track(struct kasan_track *track, const char *prefix) u64 ts_nsec = track->timestamp; unsigned long rem_usec; - ts_nsec <<= 3; + ts_nsec <<= 9; rem_usec = do_div(ts_nsec, NSEC_PER_SEC) / 1000; pr_err("%s by task %u on cpu %d at %lu.%06lus:\n", From f91e6b41dd11daffb138e3afdb4804aefc3d4e1b Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Thu, 15 Feb 2024 10:27:53 -0800 Subject: [PATCH 224/435] userfaultfd: move userfaultfd_ctx struct to header file Patch series "per-vma locks in userfaultfd", v7. Performing userfaultfd operations (like copy/move etc.) in critical section of mmap_lock (read-mode) causes significant contention on the lock when operations requiring the lock in write-mode are taking place concurrently. We can use per-vma locks instead to significantly reduce the contention issue. Android runtime's Garbage Collector uses userfaultfd for concurrent compaction. mmap-lock contention during compaction potentially causes jittery experience for the user. During one such reproducible scenario, we observed the following improvements with this patch-set: - Wall clock time of compaction phase came down from ~3s to <500ms - Uninterruptible sleep time (across all threads in the process) was ~10ms (none in mmap_lock) during compaction, instead of >20s This patch (of 4): Move the struct to userfaultfd_k.h to be accessible from mm/userfaultfd.c. There are no other changes in the struct. This is required to prepare for using per-vma locks in userfaultfd operations. Link: https://lkml.kernel.org/r/20240215182756.3448972-1-lokeshgidra@google.com Link: https://lkml.kernel.org/r/20240215182756.3448972-2-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Liam R. Howlett Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: David Hildenbrand Cc: Jann Horn Cc: Kalesh Singh Cc: Matthew Wilcox (Oracle) Cc: Nicolas Geoffray Cc: Peter Xu Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 39 ----------------------------------- include/linux/userfaultfd_k.h | 39 +++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 05c8e8a05427..58331b83d648 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -50,45 +50,6 @@ static struct ctl_table vm_userfaultfd_table[] = { static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init; -/* - * Start with fault_pending_wqh and fault_wqh so they're more likely - * to be in the same cacheline. - * - * Locking order: - * fd_wqh.lock - * fault_pending_wqh.lock - * fault_wqh.lock - * event_wqh.lock - * - * To avoid deadlocks, IRQs must be disabled when taking any of the above locks, - * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's - * also taken in IRQ context. - */ -struct userfaultfd_ctx { - /* waitqueue head for the pending (i.e. not read) userfaults */ - wait_queue_head_t fault_pending_wqh; - /* waitqueue head for the userfaults */ - wait_queue_head_t fault_wqh; - /* waitqueue head for the pseudo fd to wakeup poll/read */ - wait_queue_head_t fd_wqh; - /* waitqueue head for events */ - wait_queue_head_t event_wqh; - /* a refile sequence protected by fault_pending_wqh lock */ - seqcount_spinlock_t refile_seq; - /* pseudo fd refcounting */ - refcount_t refcount; - /* userfaultfd syscall flags */ - unsigned int flags; - /* features requested from the userspace */ - unsigned int features; - /* released */ - bool released; - /* memory mappings are changing because of non-cooperative event */ - atomic_t mmap_changing; - /* mm with one ore more vmas attached to this userfaultfd_ctx */ - struct mm_struct *mm; -}; - struct userfaultfd_fork_ctx { struct userfaultfd_ctx *orig; struct userfaultfd_ctx *new; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e4056547fbe6..691d928ee864 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -36,6 +36,45 @@ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) +/* + * Start with fault_pending_wqh and fault_wqh so they're more likely + * to be in the same cacheline. + * + * Locking order: + * fd_wqh.lock + * fault_pending_wqh.lock + * fault_wqh.lock + * event_wqh.lock + * + * To avoid deadlocks, IRQs must be disabled when taking any of the above locks, + * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's + * also taken in IRQ context. + */ +struct userfaultfd_ctx { + /* waitqueue head for the pending (i.e. not read) userfaults */ + wait_queue_head_t fault_pending_wqh; + /* waitqueue head for the userfaults */ + wait_queue_head_t fault_wqh; + /* waitqueue head for the pseudo fd to wakeup poll/read */ + wait_queue_head_t fd_wqh; + /* waitqueue head for events */ + wait_queue_head_t event_wqh; + /* a refile sequence protected by fault_pending_wqh lock */ + seqcount_spinlock_t refile_seq; + /* pseudo fd refcounting */ + refcount_t refcount; + /* userfaultfd syscall flags */ + unsigned int flags; + /* features requested from the userspace */ + unsigned int features; + /* released */ + bool released; + /* memory mappings are changing because of non-cooperative event */ + atomic_t mmap_changing; + /* mm with one ore more vmas attached to this userfaultfd_ctx */ + struct mm_struct *mm; +}; + extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); /* A combined operation mode + behavior flags. */ From 5e4c24a57b0c126686534b5b159a406c5dd02400 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Thu, 15 Feb 2024 10:27:54 -0800 Subject: [PATCH 225/435] userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx Increments and loads to mmap_changing are always in mmap_lock critical section. This ensures that if userspace requests event notification for non-cooperative operations (e.g. mremap), userfaultfd operations don't occur concurrently. This can be achieved by using a separate read-write semaphore in userfaultfd_ctx such that increments are done in write-mode and loads in read-mode, thereby eliminating the dependency on mmap_lock for this purpose. This is a preparatory step before we replace mmap_lock usage with per-vma locks in fill/move ioctls. Link: https://lkml.kernel.org/r/20240215182756.3448972-3-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Liam R. Howlett Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: David Hildenbrand Cc: Jann Horn Cc: Kalesh Singh Cc: Matthew Wilcox (Oracle) Cc: Nicolas Geoffray Cc: Peter Xu Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 40 ++++++++++++---------- include/linux/userfaultfd_k.h | 31 ++++++++++-------- mm/userfaultfd.c | 62 ++++++++++++++++++++--------------- 3 files changed, 75 insertions(+), 58 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 58331b83d648..c00a021bcce4 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -685,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) ctx->flags = octx->flags; ctx->features = octx->features; ctx->released = false; + init_rwsem(&ctx->map_changing_lock); atomic_set(&ctx->mmap_changing, 0); ctx->mm = vma->vm_mm; mmgrab(ctx->mm); userfaultfd_ctx_get(octx); + down_write(&octx->map_changing_lock); atomic_inc(&octx->mmap_changing); + up_write(&octx->map_changing_lock); fctx->orig = octx; fctx->new = ctx; list_add_tail(&fctx->list, fcs); @@ -737,7 +740,9 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); + down_write(&ctx->map_changing_lock); atomic_inc(&ctx->mmap_changing); + up_write(&ctx->map_changing_lock); } else { /* Drop uffd context if remap feature not enabled */ vma_start_write(vma); @@ -783,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma, return true; userfaultfd_ctx_get(ctx); + down_write(&ctx->map_changing_lock); atomic_inc(&ctx->mmap_changing); + up_write(&ctx->map_changing_lock); mmap_read_unlock(mm); msg_init(&ewq.msg); @@ -825,7 +832,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, return -ENOMEM; userfaultfd_ctx_get(ctx); + down_write(&ctx->map_changing_lock); atomic_inc(&ctx->mmap_changing); + up_write(&ctx->map_changing_lock); unmap_ctx->ctx = ctx; unmap_ctx->start = start; unmap_ctx->end = end; @@ -1709,9 +1718,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len, &ctx->mmap_changing, - flags); + ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len, flags); mmput(ctx->mm); } else { return -ESRCH; @@ -1761,9 +1769,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, goto out; if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len, - &ctx->mmap_changing); + ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); mmput(ctx->mm); } else { return -ESRCH; @@ -1818,9 +1825,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, return -EINVAL; if (mmget_not_zero(ctx->mm)) { - ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, - uffdio_wp.range.len, mode_wp, - &ctx->mmap_changing); + ret = mwriteprotect_range(ctx, uffdio_wp.range.start, + uffdio_wp.range.len, mode_wp); mmput(ctx->mm); } else { return -ESRCH; @@ -1870,9 +1876,8 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start, - uffdio_continue.range.len, - &ctx->mmap_changing, flags); + ret = mfill_atomic_continue(ctx, uffdio_continue.range.start, + uffdio_continue.range.len, flags); mmput(ctx->mm); } else { return -ESRCH; @@ -1925,9 +1930,8 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long goto out; if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start, - uffdio_poison.range.len, - &ctx->mmap_changing, 0); + ret = mfill_atomic_poison(ctx, uffdio_poison.range.start, + uffdio_poison.range.len, 0); mmput(ctx->mm); } else { return -ESRCH; @@ -2003,13 +2007,14 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx, if (mmget_not_zero(mm)) { mmap_read_lock(mm); - /* Re-check after taking mmap_lock */ + /* Re-check after taking map_changing_lock */ + down_read(&ctx->map_changing_lock); if (likely(!atomic_read(&ctx->mmap_changing))) ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src, uffdio_move.len, uffdio_move.mode); else ret = -EAGAIN; - + up_read(&ctx->map_changing_lock); mmap_read_unlock(mm); mmput(mm); } else { @@ -2216,6 +2221,7 @@ static int new_userfaultfd(int flags) ctx->flags = flags; ctx->features = 0; ctx->released = false; + init_rwsem(&ctx->map_changing_lock); atomic_set(&ctx->mmap_changing, 0); ctx->mm = current->mm; /* prevent the mm struct to be freed */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 691d928ee864..3210c3552976 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -69,6 +69,13 @@ struct userfaultfd_ctx { unsigned int features; /* released */ bool released; + /* + * Prevents userfaultfd operations (fill/move/wp) from happening while + * some non-cooperative event(s) is taking place. Increments are done + * in write-mode. Whereas, userfaultfd operations, which includes + * reading mmap_changing, is done under read-mode. + */ + struct rw_semaphore map_changing_lock; /* memory mappings are changing because of non-cooperative event */ atomic_t mmap_changing; /* mm with one ore more vmas attached to this userfaultfd_ctx */ @@ -113,22 +120,18 @@ extern int mfill_atomic_install_pte(pmd_t *dst_pmd, unsigned long dst_addr, struct page *page, bool newly_allocated, uffd_flags_t flags); -extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start, +extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, uffd_flags_t flags); -extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, + uffd_flags_t flags); +extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, unsigned long dst_start, - unsigned long len, - atomic_t *mmap_changing); -extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len, atomic_t *mmap_changing, - uffd_flags_t flags); -extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing, - uffd_flags_t flags); -extern int mwriteprotect_range(struct mm_struct *dst_mm, - unsigned long start, unsigned long len, - bool enable_wp, atomic_t *mmap_changing); + unsigned long len); +extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start, + unsigned long len, uffd_flags_t flags); +extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, uffd_flags_t flags); +extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, bool enable_wp); extern long uffd_wp_range(struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9cc93cc1330b..74aad0831e40 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -353,11 +353,11 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) * called with mmap_lock held, it will release mmap_lock before returning. */ static __always_inline ssize_t mfill_atomic_hugetlb( + struct userfaultfd_ctx *ctx, struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, uffd_flags_t flags) { struct mm_struct *dst_mm = dst_vma->vm_mm; @@ -379,6 +379,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb( * feature is not supported. */ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); return -EINVAL; } @@ -463,6 +464,7 @@ retry: cond_resched(); if (unlikely(err == -ENOENT)) { + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); BUG_ON(!folio); @@ -473,12 +475,13 @@ retry: goto out; } mmap_read_lock(dst_mm); + down_read(&ctx->map_changing_lock); /* * If memory mappings are changing because of non-cooperative * operation (e.g. mremap) running in parallel, bail out and * request the user to retry later */ - if (mmap_changing && atomic_read(mmap_changing)) { + if (atomic_read(&ctx->mmap_changing)) { err = -EAGAIN; break; } @@ -501,6 +504,7 @@ retry: } out_unlock: + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); out: if (folio) @@ -512,11 +516,11 @@ out: } #else /* !CONFIG_HUGETLB_PAGE */ /* fail at build time if gcc attempts to use this */ -extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma, +extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx, + struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, uffd_flags_t flags); #endif /* CONFIG_HUGETLB_PAGE */ @@ -564,13 +568,13 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd, return err; } -static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, +static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, uffd_flags_t flags) { + struct mm_struct *dst_mm = ctx->mm; struct vm_area_struct *dst_vma; ssize_t err; pmd_t *dst_pmd; @@ -600,8 +604,9 @@ retry: * operation (e.g. mremap) running in parallel, bail out and * request the user to retry later */ + down_read(&ctx->map_changing_lock); err = -EAGAIN; - if (mmap_changing && atomic_read(mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out_unlock; /* @@ -633,8 +638,8 @@ retry: * If this is a HUGETLB vma, pass off to appropriate routine */ if (is_vm_hugetlb_page(dst_vma)) - return mfill_atomic_hugetlb(dst_vma, dst_start, src_start, - len, mmap_changing, flags); + return mfill_atomic_hugetlb(ctx, dst_vma, dst_start, + src_start, len, flags); if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; @@ -693,6 +698,7 @@ retry: if (unlikely(err == -ENOENT)) { void *kaddr; + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); BUG_ON(!folio); @@ -723,6 +729,7 @@ retry: } out_unlock: + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); out: if (folio) @@ -733,34 +740,33 @@ out: return copied ? copied : err; } -ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start, +ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, uffd_flags_t flags) + uffd_flags_t flags) { - return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing, + return mfill_atomic(ctx, dst_start, src_start, len, uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY)); } -ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing) +ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, + unsigned long start, + unsigned long len) { - return mfill_atomic(dst_mm, start, 0, len, mmap_changing, + return mfill_atomic(ctx, start, 0, len, uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE)); } -ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing, - uffd_flags_t flags) +ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, uffd_flags_t flags) { - return mfill_atomic(dst_mm, start, 0, len, mmap_changing, + return mfill_atomic(ctx, start, 0, len, uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); } -ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing, - uffd_flags_t flags) +ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, uffd_flags_t flags) { - return mfill_atomic(dst_mm, start, 0, len, mmap_changing, + return mfill_atomic(ctx, start, 0, len, uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON)); } @@ -793,10 +799,10 @@ long uffd_wp_range(struct vm_area_struct *dst_vma, return ret; } -int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, bool enable_wp, - atomic_t *mmap_changing) +int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, bool enable_wp) { + struct mm_struct *dst_mm = ctx->mm; unsigned long end = start + len; unsigned long _start, _end; struct vm_area_struct *dst_vma; @@ -820,8 +826,9 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, * operation (e.g. mremap) running in parallel, bail out and * request the user to retry later */ + down_read(&ctx->map_changing_lock); err = -EAGAIN; - if (mmap_changing && atomic_read(mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out_unlock; err = -ENOENT; @@ -850,6 +857,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, err = 0; } out_unlock: + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); return err; } From 32af81af2f6f4c23b1b4ff68410e91da660af102 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Thu, 15 Feb 2024 10:27:55 -0800 Subject: [PATCH 226/435] mm: add vma_assert_locked() for !CONFIG_PER_VMA_LOCK vma_assert_locked() is needed to replace mmap_assert_locked() once we start using per-vma locks in userfaultfd operations. In !CONFIG_PER_VMA_LOCK case when mm is locked, it implies that the given VMA is locked. Link: https://lkml.kernel.org/r/20240215182756.3448972-4-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Reviewed-by: Suren Baghdasaryan Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: David Hildenbrand Cc: Jann Horn Cc: Kalesh Singh Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Nicolas Geoffray Cc: Peter Xu Cc: Ryan Roberts Cc: Tim Murray Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index ac6b71cbdffb..6f4825d82965 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -781,6 +781,11 @@ static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, return NULL; } +static inline void vma_assert_locked(struct vm_area_struct *vma) +{ + mmap_assert_locked(vma->vm_mm); +} + static inline void release_fault_lock(struct vm_fault *vmf) { mmap_read_unlock(vmf->vma->vm_mm); From 867a43a34ff8a38772212045262b2c9b77807ea3 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Thu, 15 Feb 2024 10:27:56 -0800 Subject: [PATCH 227/435] userfaultfd: use per-vma locks in userfaultfd operations All userfaultfd operations, except write-protect, opportunistically use per-vma locks to lock vmas. On failure, attempt again inside mmap_lock critical section. Write-protect operation requires mmap_lock as it iterates over multiple vmas. Link: https://lkml.kernel.org/r/20240215182756.3448972-5-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Reviewed-by: Liam R. Howlett Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: David Hildenbrand Cc: Jann Horn Cc: Kalesh Singh Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Nicolas Geoffray Cc: Peter Xu Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 13 +- include/linux/userfaultfd_k.h | 5 +- mm/huge_memory.c | 5 +- mm/userfaultfd.c | 384 ++++++++++++++++++++++++++-------- 4 files changed, 301 insertions(+), 106 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index c00a021bcce4..60dcfafdc11a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -2005,17 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx, return -EINVAL; if (mmget_not_zero(mm)) { - mmap_read_lock(mm); - - /* Re-check after taking map_changing_lock */ - down_read(&ctx->map_changing_lock); - if (likely(!atomic_read(&ctx->mmap_changing))) - ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src, - uffdio_move.len, uffdio_move.mode); - else - ret = -EAGAIN; - up_read(&ctx->map_changing_lock); - mmap_read_unlock(mm); + ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src, + uffdio_move.len, uffdio_move.mode); mmput(mm); } else { return -ESRCH; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 3210c3552976..05d59f74fc88 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma, /* move_pages */ void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2); void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2); -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, - unsigned long dst_start, unsigned long src_start, - unsigned long len, __u64 flags); +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, + unsigned long src_start, unsigned long len, __u64 flags); int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 14888b15121e..28341a5067fb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2158,7 +2158,7 @@ unlock: #ifdef CONFIG_USERFAULTFD /* - * The PT lock for src_pmd and the mmap_lock for reading are held by + * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by * the caller, but it must return after releasing the page_table_lock. * Just move the page from src_pmd to dst_pmd if possible. * Return zero if succeeded in moving the page, -EAGAIN if it needs to be @@ -2181,7 +2181,8 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm src_ptl = pmd_lockptr(mm, src_pmd); lockdep_assert_held(src_ptl); - mmap_assert_locked(mm); + vma_assert_locked(src_vma); + vma_assert_locked(dst_vma); /* Sanity checks before the operation */ if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) || diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 74aad0831e40..4744d6a96f96 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -20,19 +20,11 @@ #include "internal.h" static __always_inline -struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, - unsigned long dst_start, - unsigned long len) +bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) { - /* - * Make sure that the dst range is both valid and fully within a - * single existing vma. - */ - struct vm_area_struct *dst_vma; - - dst_vma = find_vma(dst_mm, dst_start); - if (!range_in_vma(dst_vma, dst_start, dst_start + len)) - return NULL; + /* Make sure that the dst range is fully within dst_vma. */ + if (dst_end > dst_vma->vm_end) + return false; /* * Check the vma is registered in uffd, this is required to @@ -40,11 +32,122 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, * time. */ if (!dst_vma->vm_userfaultfd_ctx.ctx) - return NULL; + return false; + return true; +} + +static __always_inline +struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm, + unsigned long addr) +{ + struct vm_area_struct *vma; + + mmap_assert_locked(mm); + vma = vma_lookup(mm, addr); + if (!vma) + vma = ERR_PTR(-ENOENT); + else if (!(vma->vm_flags & VM_SHARED) && + unlikely(anon_vma_prepare(vma))) + vma = ERR_PTR(-ENOMEM); + + return vma; +} + +#ifdef CONFIG_PER_VMA_LOCK +/* + * lock_vma() - Lookup and lock vma corresponding to @address. + * @mm: mm to search vma in. + * @address: address that the vma should contain. + * + * Should be called without holding mmap_lock. vma should be unlocked after use + * with unlock_vma(). + * + * Return: A locked vma containing @address, -ENOENT if no vma is found, or + * -ENOMEM if anon_vma couldn't be allocated. + */ +static struct vm_area_struct *lock_vma(struct mm_struct *mm, + unsigned long address) +{ + struct vm_area_struct *vma; + + vma = lock_vma_under_rcu(mm, address); + if (vma) { + /* + * lock_vma_under_rcu() only checks anon_vma for private + * anonymous mappings. But we need to ensure it is assigned in + * private file-backed vmas as well. + */ + if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma)) + vma_end_read(vma); + else + return vma; + } + + mmap_read_lock(mm); + vma = find_vma_and_prepare_anon(mm, address); + if (!IS_ERR(vma)) { + /* + * We cannot use vma_start_read() as it may fail due to + * false locked (see comment in vma_start_read()). We + * can avoid that by directly locking vm_lock under + * mmap_lock, which guarantees that nobody can lock the + * vma for write (vma_start_write()) under us. + */ + down_read(&vma->vm_lock->lock); + } + + mmap_read_unlock(mm); + return vma; +} + +static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long len) +{ + struct vm_area_struct *dst_vma; + + dst_vma = lock_vma(dst_mm, dst_start); + if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len)) + return dst_vma; + + vma_end_read(dst_vma); + return ERR_PTR(-ENOENT); +} + +static void uffd_mfill_unlock(struct vm_area_struct *vma) +{ + vma_end_read(vma); +} + +#else + +static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long len) +{ + struct vm_area_struct *dst_vma; + + mmap_read_lock(dst_mm); + dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start); + if (IS_ERR(dst_vma)) + goto out_unlock; + + if (validate_dst_vma(dst_vma, dst_start + len)) + return dst_vma; + + dst_vma = ERR_PTR(-ENOENT); +out_unlock: + mmap_read_unlock(dst_mm); return dst_vma; } +static void uffd_mfill_unlock(struct vm_area_struct *vma) +{ + mmap_read_unlock(vma->vm_mm); +} +#endif + /* Check if dst_addr is outside of file's size. Must be called with ptl held. */ static bool mfill_file_over_size(struct vm_area_struct *dst_vma, unsigned long dst_addr) @@ -350,7 +453,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) #ifdef CONFIG_HUGETLB_PAGE /* * mfill_atomic processing for HUGETLB vmas. Note that this routine is - * called with mmap_lock held, it will release mmap_lock before returning. + * called with either vma-lock or mmap_lock held, it will release the lock + * before returning. */ static __always_inline ssize_t mfill_atomic_hugetlb( struct userfaultfd_ctx *ctx, @@ -361,7 +465,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb( uffd_flags_t flags) { struct mm_struct *dst_mm = dst_vma->vm_mm; - int vm_shared = dst_vma->vm_flags & VM_SHARED; ssize_t err; pte_t *dst_pte; unsigned long src_addr, dst_addr; @@ -380,7 +483,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb( */ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { up_read(&ctx->map_changing_lock); - mmap_read_unlock(dst_mm); + uffd_mfill_unlock(dst_vma); return -EINVAL; } @@ -403,24 +506,28 @@ retry: * retry, dst_vma will be set to NULL and we must lookup again. */ if (!dst_vma) { + dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); + if (IS_ERR(dst_vma)) { + err = PTR_ERR(dst_vma); + goto out; + } + err = -ENOENT; - dst_vma = find_dst_vma(dst_mm, dst_start, len); - if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) - goto out_unlock; + if (!is_vm_hugetlb_page(dst_vma)) + goto out_unlock_vma; err = -EINVAL; if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) - goto out_unlock; + goto out_unlock_vma; - vm_shared = dst_vma->vm_flags & VM_SHARED; - } - - /* - * If not shared, ensure the dst_vma has a anon_vma. - */ - err = -ENOMEM; - if (!vm_shared) { - if (unlikely(anon_vma_prepare(dst_vma))) + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + down_read(&ctx->map_changing_lock); + err = -EAGAIN; + if (atomic_read(&ctx->mmap_changing)) goto out_unlock; } @@ -465,7 +572,7 @@ retry: if (unlikely(err == -ENOENT)) { up_read(&ctx->map_changing_lock); - mmap_read_unlock(dst_mm); + uffd_mfill_unlock(dst_vma); BUG_ON(!folio); err = copy_folio_from_user(folio, @@ -474,17 +581,6 @@ retry: err = -EFAULT; goto out; } - mmap_read_lock(dst_mm); - down_read(&ctx->map_changing_lock); - /* - * If memory mappings are changing because of non-cooperative - * operation (e.g. mremap) running in parallel, bail out and - * request the user to retry later - */ - if (atomic_read(&ctx->mmap_changing)) { - err = -EAGAIN; - break; - } dst_vma = NULL; goto retry; @@ -505,7 +601,8 @@ retry: out_unlock: up_read(&ctx->map_changing_lock); - mmap_read_unlock(dst_mm); +out_unlock_vma: + uffd_mfill_unlock(dst_vma); out: if (folio) folio_put(folio); @@ -597,7 +694,15 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, copied = 0; folio = NULL; retry: - mmap_read_lock(dst_mm); + /* + * Make sure the vma is not shared, that the dst range is + * both valid and fully within a single existing vma. + */ + dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); + if (IS_ERR(dst_vma)) { + err = PTR_ERR(dst_vma); + goto out; + } /* * If memory mappings are changing because of non-cooperative @@ -609,15 +714,6 @@ retry: if (atomic_read(&ctx->mmap_changing)) goto out_unlock; - /* - * Make sure the vma is not shared, that the dst range is - * both valid and fully within a single existing vma. - */ - err = -ENOENT; - dst_vma = find_dst_vma(dst_mm, dst_start, len); - if (!dst_vma) - goto out_unlock; - err = -EINVAL; /* * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but @@ -647,16 +743,6 @@ retry: uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) goto out_unlock; - /* - * Ensure the dst_vma has a anon_vma or this page - * would get a NULL anon_vma when moved in the - * dst_vma. - */ - err = -ENOMEM; - if (!(dst_vma->vm_flags & VM_SHARED) && - unlikely(anon_vma_prepare(dst_vma))) - goto out_unlock; - while (src_addr < src_start + len) { pmd_t dst_pmdval; @@ -699,7 +785,7 @@ retry: void *kaddr; up_read(&ctx->map_changing_lock); - mmap_read_unlock(dst_mm); + uffd_mfill_unlock(dst_vma); BUG_ON(!folio); kaddr = kmap_local_folio(folio, 0); @@ -730,7 +816,7 @@ retry: out_unlock: up_read(&ctx->map_changing_lock); - mmap_read_unlock(dst_mm); + uffd_mfill_unlock(dst_vma); out: if (folio) folio_put(folio); @@ -1267,27 +1353,136 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx, if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma)) return -EINVAL; - /* - * Ensure the dst_vma has a anon_vma or this page - * would get a NULL anon_vma when moved in the - * dst_vma. - */ - if (unlikely(anon_vma_prepare(dst_vma))) - return -ENOMEM; - return 0; } +static __always_inline +int find_vmas_mm_locked(struct mm_struct *mm, + unsigned long dst_start, + unsigned long src_start, + struct vm_area_struct **dst_vmap, + struct vm_area_struct **src_vmap) +{ + struct vm_area_struct *vma; + + mmap_assert_locked(mm); + vma = find_vma_and_prepare_anon(mm, dst_start); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + *dst_vmap = vma; + /* Skip finding src_vma if src_start is in dst_vma */ + if (src_start >= vma->vm_start && src_start < vma->vm_end) + goto out_success; + + vma = vma_lookup(mm, src_start); + if (!vma) + return -ENOENT; +out_success: + *src_vmap = vma; + return 0; +} + +#ifdef CONFIG_PER_VMA_LOCK +static int uffd_move_lock(struct mm_struct *mm, + unsigned long dst_start, + unsigned long src_start, + struct vm_area_struct **dst_vmap, + struct vm_area_struct **src_vmap) +{ + struct vm_area_struct *vma; + int err; + + vma = lock_vma(mm, dst_start); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + *dst_vmap = vma; + /* + * Skip finding src_vma if src_start is in dst_vma. This also ensures + * that we don't lock the same vma twice. + */ + if (src_start >= vma->vm_start && src_start < vma->vm_end) { + *src_vmap = vma; + return 0; + } + + /* + * Using lock_vma() to get src_vma can lead to following deadlock: + * + * Thread1 Thread2 + * ------- ------- + * vma_start_read(dst_vma) + * mmap_write_lock(mm) + * vma_start_write(src_vma) + * vma_start_read(src_vma) + * mmap_read_lock(mm) + * vma_start_write(dst_vma) + */ + *src_vmap = lock_vma_under_rcu(mm, src_start); + if (likely(*src_vmap)) + return 0; + + /* Undo any locking and retry in mmap_lock critical section */ + vma_end_read(*dst_vmap); + + mmap_read_lock(mm); + err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); + if (!err) { + /* + * See comment in lock_vma() as to why not using + * vma_start_read() here. + */ + down_read(&(*dst_vmap)->vm_lock->lock); + if (*dst_vmap != *src_vmap) + down_read(&(*src_vmap)->vm_lock->lock); + } + mmap_read_unlock(mm); + return err; +} + +static void uffd_move_unlock(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) +{ + vma_end_read(src_vma); + if (src_vma != dst_vma) + vma_end_read(dst_vma); +} + +#else + +static int uffd_move_lock(struct mm_struct *mm, + unsigned long dst_start, + unsigned long src_start, + struct vm_area_struct **dst_vmap, + struct vm_area_struct **src_vmap) +{ + int err; + + mmap_read_lock(mm); + err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); + if (err) + mmap_read_unlock(mm); + return err; +} + +static void uffd_move_unlock(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) +{ + mmap_assert_locked(src_vma->vm_mm); + mmap_read_unlock(dst_vma->vm_mm); +} +#endif + /** * move_pages - move arbitrary anonymous pages of an existing vma * @ctx: pointer to the userfaultfd context - * @mm: the address space to move pages * @dst_start: start of the destination virtual memory range * @src_start: start of the source virtual memory range * @len: length of the virtual memory range * @mode: flags from uffdio_move.mode * - * Must be called with mmap_lock held for read. + * It will either use the mmap_lock in read mode or per-vma locks * * move_pages() remaps arbitrary anonymous pages atomically in zero * copy. It only works on non shared anonymous pages because those can @@ -1355,10 +1550,10 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx, * could be obtained. This is the only additional complexity added to * the rmap code to provide this anonymous page remapping functionality. */ -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, - unsigned long dst_start, unsigned long src_start, - unsigned long len, __u64 mode) +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, + unsigned long src_start, unsigned long len, __u64 mode) { + struct mm_struct *mm = ctx->mm; struct vm_area_struct *src_vma, *dst_vma; unsigned long src_addr, dst_addr; pmd_t *src_pmd, *dst_pmd; @@ -1376,28 +1571,34 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, WARN_ON_ONCE(dst_start + len <= dst_start)) goto out; + err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma); + if (err) + goto out; + + /* Re-check after taking map_changing_lock */ + err = -EAGAIN; + down_read(&ctx->map_changing_lock); + if (likely(atomic_read(&ctx->mmap_changing))) + goto out_unlock; /* * Make sure the vma is not shared, that the src and dst remap * ranges are both valid and fully within a single existing * vma. */ - src_vma = find_vma(mm, src_start); - if (!src_vma || (src_vma->vm_flags & VM_SHARED)) - goto out; - if (src_start < src_vma->vm_start || - src_start + len > src_vma->vm_end) - goto out; + err = -EINVAL; + if (src_vma->vm_flags & VM_SHARED) + goto out_unlock; + if (src_start + len > src_vma->vm_end) + goto out_unlock; - dst_vma = find_vma(mm, dst_start); - if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) - goto out; - if (dst_start < dst_vma->vm_start || - dst_start + len > dst_vma->vm_end) - goto out; + if (dst_vma->vm_flags & VM_SHARED) + goto out_unlock; + if (dst_start + len > dst_vma->vm_end) + goto out_unlock; err = validate_move_areas(ctx, src_vma, dst_vma); if (err) - goto out; + goto out_unlock; for (src_addr = src_start, dst_addr = dst_start; src_addr < src_start + len;) { @@ -1514,6 +1715,9 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, moved += step_size; } +out_unlock: + up_read(&ctx->map_changing_lock); + uffd_move_unlock(dst_vma, src_vma); out: VM_WARN_ON(moved < 0); VM_WARN_ON(err > 0); From 0a1ebc17a710011486e919983e03837d450ff2ee Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Feb 2024 16:58:38 -0800 Subject: [PATCH 228/435] Docs/mm/damon/maintainer-profile: fix reference links for mm-[un]stable tree Patch series "Docs/mm/damon: misc readability improvements". Fix trivial mistakes and improve layout of information on different documents for DAMON. This patch (of 5): A couple of sentences on maintainer-profile.rst are having reference links for mm-unstable and mm-stable trees with wrong rst markup. Fix those. Link: https://lkml.kernel.org/r/20240217005842.87348-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240217005842.87348-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/mm/damon/maintainer-profile.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/mm/damon/maintainer-profile.rst b/Documentation/mm/damon/maintainer-profile.rst index a84c14e59053..5a306e4de22e 100644 --- a/Documentation/mm/damon/maintainer-profile.rst +++ b/Documentation/mm/damon/maintainer-profile.rst @@ -21,8 +21,8 @@ be queued in mm-stable [3]_ , and finally pull-requested to the mainline by the memory management subsystem maintainer. Note again the patches for review should be made against the mm-unstable -tree[1] whenever possible. damon/next is only for preview of others' works in -progress. +tree [1]_ whenever possible. damon/next is only for preview of others' works +in progress. Submit checklist addendum ------------------------- @@ -41,8 +41,8 @@ Further doing below and putting the results will be helpful. Key cycle dates --------------- -Patches can be sent anytime. Key cycle dates of the mm-unstable[1] and -mm-stable[3] trees depend on the memory management subsystem maintainer. +Patches can be sent anytime. Key cycle dates of the mm-unstable [1]_ and +mm-stable [3]_ trees depend on the memory management subsystem maintainer. Review cadence -------------- From 5b7708e6a85574db9fd0d82caba1c38e01723d64 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Feb 2024 16:58:39 -0800 Subject: [PATCH 229/435] Docs/mm/damon: move the list of DAMOS actions to design doc DAMOS operation actions are explained nearly twice on the DAMON usage document, once for the sysfs interface, and then again for the debugfs interface. Duplication is bad. Also it would better to keep this kind of concept level details in design document and keep the usage document small and focus on only the usage. Move the list to design document and update usage document to reference it. Link: https://lkml.kernel.org/r/20240217005842.87348-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 45 ++++++-------------- Documentation/mm/damon/design.rst | 26 +++++++++-- 2 files changed, 35 insertions(+), 36 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 58c34e66b31b..0335d584956b 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -302,27 +302,8 @@ In each scheme directory, five directories (``access_pattern``, ``quotas``, The ``action`` file is for setting and getting the scheme's :ref:`action `. The keywords that can be written to and read -from the file and their meaning are as below. - -Note that support of each action depends on the running DAMON operations set -:ref:`implementation `. - - - ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED``. - Supported by ``vaddr`` and ``fvaddr`` operations set. - - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD``. - Supported by ``vaddr`` and ``fvaddr`` operations set. - - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``. - Supported by ``vaddr``, ``fvaddr`` and ``paddr`` operations set. - - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. - Supported by ``vaddr`` and ``fvaddr`` operations set. - - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. - Supported by ``vaddr`` and ``fvaddr`` operations set. - - ``lru_prio``: Prioritize the region on its LRU lists. - Supported by ``paddr`` operations set. - - ``lru_deprio``: Deprioritize the region on its LRU lists. - Supported by ``paddr`` operations set. - - ``stat``: Do nothing but count the statistics. - Supported by all operations sets. +from the file and their meaning are same to those of the list on +:ref:`design doc `. The ``apply_interval_us`` file is for setting and getting the scheme's :ref:`apply_interval ` in microseconds. @@ -763,19 +744,17 @@ Action ~~~~~~ The ```` is a predefined integer for memory management :ref:`actions -`. The supported numbers and their meanings are as -below. +`. The mapping between the ```` values and +the memory management actions is as below. For the detailed meaning of the +action and DAMON operations set supporting each action, please refer to the +list on :ref:`design doc `. - - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``. Ignored if - ``target`` is ``paddr``. - - 1: Call ``madvise()`` for the region with ``MADV_COLD``. Ignored if - ``target`` is ``paddr``. - - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``. - - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. Ignored if - ``target`` is ``paddr``. - - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. Ignored if - ``target`` is ``paddr``. - - 5: Do nothing but count the statistics + - 0: ``willneed`` + - 1: ``cold`` + - 2: ``pageout`` + - 3: ``hugepage`` + - 4: ``nohugepage`` + - 5: ``stat`` Quota ~~~~~ diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 1bb69524a62e..9f16c4e62e72 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -294,9 +294,29 @@ not mandated to support all actions of the list. Hence, the availability of specific DAMOS action depends on what operations set is selected to be used together. -Applying an action to a region is considered as changing the region's -characteristics. Hence, DAMOS resets the age of regions when an action is -applied to those. +The list of the supported actions, their meaning, and DAMON operations sets +that supports each action are as below. + + - ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``. + Supported by ``vaddr``, ``fvaddr`` and ``paddr`` operations set. + - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``lru_prio``: Prioritize the region on its LRU lists. + Supported by ``paddr`` operations set. + - ``lru_deprio``: Deprioritize the region on its LRU lists. + Supported by ``paddr`` operations set. + - ``stat``: Do nothing but count the statistics. + Supported by all operations sets. + +Applying the actions except ``stat`` to a region is considered as changing the +region's characteristics. Hence, DAMOS resets the age of regions when any such +actions are applied to those. .. _damon_design_damos_access_pattern: From 669971b406f0d4a6ffe0816ec1281cbf8f99e307 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Feb 2024 16:58:40 -0800 Subject: [PATCH 230/435] Docs/mm/damon: move DAMON operation sets list from the usage to the design document The list of DAMON operation sets and their explanation, which may better to be on design document, is written on the usage document. Move the detail to design document and make the usage document only reference the design document. [sj@kernel.org: fix a typo on a reference link] Link: https://lkml.kernel.org/r/20240221170852.55529-2-sj@kernel.org Link: https://lkml.kernel.org/r/20240217005842.87348-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 19 +++++++------------ Documentation/mm/damon/design.rst | 12 ++++++++++-- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 0335d584956b..68d00e8f0140 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -180,19 +180,14 @@ In each context directory, two files (``avail_operations`` and ``operations``) and three directories (``monitoring_attrs``, ``targets``, and ``schemes``) exist. -DAMON supports multiple types of monitoring operations, including those for -virtual address space and the physical address space. You can get the list of -available monitoring operations set on the currently running kernel by reading +DAMON supports multiple types of :ref:`monitoring operations +`, including those for virtual address +space and the physical address space. You can get the list of available +monitoring operations set on the currently running kernel by reading ``avail_operations`` file. Based on the kernel configuration, the file will -list some or all of below keywords. - - - vaddr: Monitor virtual address spaces of specific processes - - fvaddr: Monitor fixed virtual address ranges - - paddr: Monitor the physical address space of the system - -Please refer to :ref:`regions sysfs directory ` for detailed -differences between the operations sets in terms of the monitoring target -regions. +list different available operation sets. Please refer to the :ref:`design +` for the list of all available operation sets and their +brief explanations. You can set and get what type of monitoring operations DAMON will use for the context by writing one of the keywords listed in ``avail_operations`` file and diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 9f16c4e62e72..6abf976dd71f 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -31,6 +31,8 @@ DAMON subsystem is configured with three layers including interfaces for the user space, on top of the core layer. +.. _damon_design_configurable_operations_set: + Configurable Operations Set --------------------------- @@ -63,6 +65,8 @@ modules that built on top of the core layer using the API, which can be easily used by the user space end users. +.. _damon_operations_set: + Operations Set Layer ==================== @@ -71,8 +75,12 @@ The monitoring operations are defined in two parts: 1. Identification of the monitoring target address range for the address space. 2. Access check of specific address range in the target space. -DAMON currently provides the implementations of the operations for the physical -and virtual address spaces. Below two subsections describe how those work. +DAMON currently provides below three operation sets. Below two subsections +describe how those work. + + - vaddr: Monitor virtual address spaces of specific processes + - fvaddr: Monitor fixed virtual address ranges + - paddr: Monitor the physical address space of the system VMA-based Target Address Range Construction From 2d89957c93667d160de6b43771dc20946bbe9805 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Feb 2024 16:58:41 -0800 Subject: [PATCH 231/435] Docs/mm/damon: move monitoring target regions setup detail from the usage to the design document Design doc is aimed to have all concept level details, while the usage doc is focused on only how the features can be used. Some details about monitoring target regions construction is on the usage doc. Move the details about the monitoring target regions construction differences for DAMON operations set from the usage to the design doc. Link: https://lkml.kernel.org/r/20240217005842.87348-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 16 +++++----------- Documentation/mm/damon/design.rst | 12 +++++++++--- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 68d00e8f0140..ae5b986a5976 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -242,17 +242,11 @@ process to the ``pid_target`` file. targets//regions ------------------- -When ``vaddr`` monitoring operations set is being used (``vaddr`` is written to -the ``contexts//operations`` file), DAMON automatically sets and updates the -monitoring target regions so that entire memory mappings of target processes -can be covered. However, users could want to set the initial monitoring region -to specific address ranges. - -In contrast, DAMON do not automatically sets and updates the monitoring target -regions when ``fvaddr`` or ``paddr`` monitoring operations sets are being used -(``fvaddr`` or ``paddr`` have written to the ``contexts//operations``). -Therefore, users should set the monitoring target regions by themselves in the -cases. +In case of ``fvaddr`` or ``paddr`` monitoring operations sets, users are +required to set the monitoring target address ranges. In case of ``vaddr`` +operations set, it is not mandatory, but users can optionally set the initial +monitoring region to specific address ranges. Please refer to the :ref:`design +` for more details. For such cases, users can explicitly set the initial monitoring target regions as they want, by writing proper values to the files under this directory. diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 6abf976dd71f..2bd0c203dcfb 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -83,12 +83,18 @@ describe how those work. - paddr: Monitor the physical address space of the system + .. _damon_design_vaddr_target_regions_construction: + VMA-based Target Address Range Construction ------------------------------------------- -This is only for the virtual address space monitoring operations -implementation. That for the physical address space simply asks users to -manually set the monitoring target address ranges. +A mechanism of ``vaddr`` DAMON operations set that automatically initializes +and updates the monitoring target address regions so that entire memory +mappings of the target processes can be covered. + +This mechanism is only for the ``vaddr`` operations set. In cases of +``fvaddr`` and ``paddr`` operation sets, users are asked to manually set the +monitoring target address ranges. Only small parts in the super-huge virtual address space of the processes are mapped to the physical memory and accessed. Thus, tracking the unmapped From 7d8cebb9630af71f04cb27314a4effbc0f4f8648 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 16 Feb 2024 16:58:42 -0800 Subject: [PATCH 232/435] Docs/admin-guide/mm/damon/usage: fix wrong quotas diabling condition After the introduction of DAMOS quotas, DAMOS quotas is not disabled if both size and time quotas are zero but the quota goal is set. The new rule is also applied to DAMON sysfs interface, but the usage doc is not updated. Update it. Link: https://lkml.kernel.org/r/20240217005842.87348-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index ae5b986a5976..22254997723c 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -329,7 +329,8 @@ respectively. Then, DAMON tries to use only up to ``time quota`` milliseconds for applying the ``action`` to memory regions of the ``access_pattern``, and to apply the action to only up to ``bytes`` bytes of memory regions within the ``reset_interval_ms``. Setting both ``ms`` and ``bytes`` zero disables the -quota limits. +quota limits unless at least one :ref:`goal ` is +set. Under ``weights`` directory, three files (``sz_permil``, ``nr_accesses_permil``, and ``age_permil``) exist. From 3ee34eabac2abb6b1b6fcdebffe18870719ad000 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Feb 2024 22:59:01 +0100 Subject: [PATCH 233/435] lib/stackdepot: fix first entry having a 0-handle Patch series "page_owner: print stacks and their outstanding allocations", v10. page_owner is a great debug functionality tool that lets us know about all pages that have been allocated/freed and their specific stacktrace. This comes very handy when debugging memory leaks, since with some scripting we can see the outstanding allocations, which might point to a memory leak. In my experience, that is one of the most useful cases, but it can get really tedious to screen through all pages and try to reconstruct the stack <-> allocated/freed relationship, becoming most of the time a daunting and slow process when we have tons of allocation/free operations. This patchset aims to ease that by adding a new functionality into page_owner. This functionality creates a new directory called 'page_owner_stacks' under 'sys/kernel//debug' with a read-only file called 'show_stacks', which prints out all the stacks followed by their outstanding number of allocations (being that the times the stacktrace has allocated but not freed yet). This gives us a clear and a quick overview of stacks <-> allocated/free. We take advantage of the new refcount_f field that stack_record struct gained, and increment/decrement the stack refcount on every __set_page_owner() (alloc operation) and __reset_page_owner (free operation) call. Unfortunately, we cannot use the new stackdepot api STACK_DEPOT_FLAG_GET because it does not fulfill page_owner needs, meaning we would have to special case things, at which point makes more sense for page_owner to do its own {dec,inc}rementing of the stacks. E.g: Using STACK_DEPOT_FLAG_PUT, once the refcount reaches 0, such stack gets evicted, so page_owner would lose information. This patchset also creates a new file called 'set_threshold' within 'page_owner_stacks' directory, and by writing a value to it, the stacks which refcount is below such value will be filtered out. A PoC can be found below: # cat /sys/kernel/debug/page_owner_stacks/show_stacks > page_owner_full_stacks.txt # head -40 page_owner_full_stacks.txt prep_new_page+0xa9/0x120 get_page_from_freelist+0x801/0x2210 __alloc_pages+0x18b/0x350 alloc_pages_mpol+0x91/0x1f0 folio_alloc+0x14/0x50 filemap_alloc_folio+0xb2/0x100 page_cache_ra_unbounded+0x96/0x180 filemap_get_pages+0xfd/0x590 filemap_read+0xcc/0x330 blkdev_read_iter+0xb8/0x150 vfs_read+0x285/0x320 ksys_read+0xa5/0xe0 do_syscall_64+0x80/0x160 entry_SYSCALL_64_after_hwframe+0x6e/0x76 stack_count: 521 prep_new_page+0xa9/0x120 get_page_from_freelist+0x801/0x2210 __alloc_pages+0x18b/0x350 alloc_pages_mpol+0x91/0x1f0 folio_alloc+0x14/0x50 filemap_alloc_folio+0xb2/0x100 __filemap_get_folio+0x14a/0x490 ext4_write_begin+0xbd/0x4b0 [ext4] generic_perform_write+0xc1/0x1e0 ext4_buffered_write_iter+0x68/0xe0 [ext4] ext4_file_write_iter+0x70/0x740 [ext4] vfs_write+0x33d/0x420 ksys_write+0xa5/0xe0 do_syscall_64+0x80/0x160 entry_SYSCALL_64_after_hwframe+0x6e/0x76 stack_count: 4609 ... ... # echo 5000 > /sys/kernel/debug/page_owner_stacks/set_threshold # cat /sys/kernel/debug/page_owner_stacks/show_stacks > page_owner_full_stacks_5000.txt # head -40 page_owner_full_stacks_5000.txt prep_new_page+0xa9/0x120 get_page_from_freelist+0x801/0x2210 __alloc_pages+0x18b/0x350 alloc_pages_mpol+0x91/0x1f0 folio_alloc+0x14/0x50 filemap_alloc_folio+0xb2/0x100 __filemap_get_folio+0x14a/0x490 ext4_write_begin+0xbd/0x4b0 [ext4] generic_perform_write+0xc1/0x1e0 ext4_buffered_write_iter+0x68/0xe0 [ext4] ext4_file_write_iter+0x70/0x740 [ext4] vfs_write+0x33d/0x420 ksys_pwrite64+0x75/0x90 do_syscall_64+0x80/0x160 entry_SYSCALL_64_after_hwframe+0x6e/0x76 stack_count: 6781 prep_new_page+0xa9/0x120 get_page_from_freelist+0x801/0x2210 __alloc_pages+0x18b/0x350 pcpu_populate_chunk+0xec/0x350 pcpu_balance_workfn+0x2d1/0x4a0 process_scheduled_works+0x84/0x380 worker_thread+0x12a/0x2a0 kthread+0xe3/0x110 ret_from_fork+0x30/0x50 ret_from_fork_asm+0x1b/0x30 stack_count: 8641 This patch (of 7): The very first entry of stack_record gets a handle of 0, but this is wrong because stackdepot treats a 0-handle as a non-valid one. E.g: See the check in stack_depot_fetch() Fix this by adding and offset of 1. This bug has been lurking since the very beginning of stackdepot, but no one really cared as it seems. Because of that I am not adding a Fixes tag. Link: https://lkml.kernel.org/r/20240215215907.20121-1-osalvador@suse.de Link: https://lkml.kernel.org/r/20240215215907.20121-2-osalvador@suse.de Co-developed-by: Marco Elver Signed-off-by: Marco Elver Signed-off-by: Oscar Salvador Acked-by: Vlastimil Babka Acked-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Michal Hocko Signed-off-by: Andrew Morton --- lib/stackdepot.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 4a7055a63d9f..c043a4186bc5 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -45,15 +45,16 @@ #define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \ STACK_DEPOT_EXTRA_BITS) #define DEPOT_POOLS_CAP 8192 +/* The pool_index is offset by 1 so the first record does not have a 0 handle. */ #define DEPOT_MAX_POOLS \ - (((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \ - (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP) + (((1LL << (DEPOT_POOL_INDEX_BITS)) - 1 < DEPOT_POOLS_CAP) ? \ + (1LL << (DEPOT_POOL_INDEX_BITS)) - 1 : DEPOT_POOLS_CAP) /* Compact structure that stores a reference to a stack. */ union handle_parts { depot_stack_handle_t handle; struct { - u32 pool_index : DEPOT_POOL_INDEX_BITS; + u32 pool_index : DEPOT_POOL_INDEX_BITS; /* pool_index is offset by 1 */ u32 offset : DEPOT_OFFSET_BITS; u32 extra : STACK_DEPOT_EXTRA_BITS; }; @@ -372,7 +373,7 @@ static struct stack_record *depot_pop_free_pool(void **prealloc, size_t size) stack = current_pool + pool_offset; /* Pre-initialize handle once. */ - stack->handle.pool_index = pool_index; + stack->handle.pool_index = pool_index + 1; stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN; stack->handle.extra = 0; INIT_LIST_HEAD(&stack->hash_list); @@ -483,18 +484,19 @@ static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle) const int pools_num_cached = READ_ONCE(pools_num); union handle_parts parts = { .handle = handle }; void *pool; + u32 pool_index = parts.pool_index - 1; size_t offset = parts.offset << DEPOT_STACK_ALIGN; struct stack_record *stack; lockdep_assert_not_held(&pool_lock); - if (parts.pool_index > pools_num_cached) { + if (pool_index > pools_num_cached) { WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n", - parts.pool_index, pools_num_cached, handle); + pool_index, pools_num_cached, handle); return NULL; } - pool = stack_pools[parts.pool_index]; + pool = stack_pools[pool_index]; if (WARN_ON(!pool)) return NULL; From 8151c7a35d8bd8a12e93538ef7963ea209b6ab41 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Feb 2024 22:59:02 +0100 Subject: [PATCH 234/435] lib/stackdepot: move stack_record struct definition into the header In order to move the heavy lifting into page_owner code, this one needs to have access to the stack_record structure, which right now sits in lib/stackdepot.c. Move it to the stackdepot.h header so page_owner can access stack_record's struct fields. Link: https://lkml.kernel.org/r/20240215215907.20121-3-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Marco Elver Reviewed-by: Vlastimil Babka Acked-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 47 ++++++++++++++++++++++++++++++++++++++ lib/stackdepot.c | 43 ---------------------------------- 2 files changed, 47 insertions(+), 43 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index adcbb8f23600..c4b5ad57c066 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -30,6 +30,53 @@ typedef u32 depot_stack_handle_t; */ #define STACK_DEPOT_EXTRA_BITS 5 +#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8) + +#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */ +#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER)) +#define DEPOT_STACK_ALIGN 4 +#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN) +#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \ + STACK_DEPOT_EXTRA_BITS) + +#ifdef CONFIG_STACKDEPOT +/* Compact structure that stores a reference to a stack. */ +union handle_parts { + depot_stack_handle_t handle; + struct { + /* pool_index is offset by 1 */ + u32 pool_index : DEPOT_POOL_INDEX_BITS; + u32 offset : DEPOT_OFFSET_BITS; + u32 extra : STACK_DEPOT_EXTRA_BITS; + }; +}; + +struct stack_record { + struct list_head hash_list; /* Links in the hash table */ + u32 hash; /* Hash in hash table */ + u32 size; /* Number of stored frames */ + union handle_parts handle; /* Constant after initialization */ + refcount_t count; + union { + unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES]; /* Frames */ + struct { + /* + * An important invariant of the implementation is to + * only place a stack record onto the freelist iff its + * refcount is zero. Because stack records with a zero + * refcount are never considered as valid, it is safe to + * union @entries and freelist management state below. + * Conversely, as soon as an entry is off the freelist + * and its refcount becomes non-zero, the below must not + * be accessed until being placed back on the freelist. + */ + struct list_head free_list; /* Links in the freelist */ + unsigned long rcu_state; /* RCU cookie */ + }; + }; +}; +#endif + typedef u32 depot_flags_t; /* diff --git a/lib/stackdepot.c b/lib/stackdepot.c index c043a4186bc5..514b8d40ff57 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -36,55 +36,12 @@ #include #include -#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8) - -#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */ -#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER)) -#define DEPOT_STACK_ALIGN 4 -#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN) -#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \ - STACK_DEPOT_EXTRA_BITS) #define DEPOT_POOLS_CAP 8192 /* The pool_index is offset by 1 so the first record does not have a 0 handle. */ #define DEPOT_MAX_POOLS \ (((1LL << (DEPOT_POOL_INDEX_BITS)) - 1 < DEPOT_POOLS_CAP) ? \ (1LL << (DEPOT_POOL_INDEX_BITS)) - 1 : DEPOT_POOLS_CAP) -/* Compact structure that stores a reference to a stack. */ -union handle_parts { - depot_stack_handle_t handle; - struct { - u32 pool_index : DEPOT_POOL_INDEX_BITS; /* pool_index is offset by 1 */ - u32 offset : DEPOT_OFFSET_BITS; - u32 extra : STACK_DEPOT_EXTRA_BITS; - }; -}; - -struct stack_record { - struct list_head hash_list; /* Links in the hash table */ - u32 hash; /* Hash in hash table */ - u32 size; /* Number of stored frames */ - union handle_parts handle; /* Constant after initialization */ - refcount_t count; - union { - unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES]; /* Frames */ - struct { - /* - * An important invariant of the implementation is to - * only place a stack record onto the freelist iff its - * refcount is zero. Because stack records with a zero - * refcount are never considered as valid, it is safe to - * union @entries and freelist management state below. - * Conversely, as soon as an entry is off the freelist - * and its refcount becomes non-zero, the below must not - * be accessed until being placed back on the freelist. - */ - struct list_head free_list; /* Links in the freelist */ - unsigned long rcu_state; /* RCU cookie */ - }; - }; -}; - static bool stack_depot_disabled; static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); static bool __stack_depot_early_init_passed __initdata; From 4bedfb314bdd85c1662ecc46fa25b33b998f994d Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Feb 2024 22:59:03 +0100 Subject: [PATCH 235/435] mm,page_owner: maintain own list of stack_records structs page_owner needs to increment a stack_record refcount when a new allocation occurs, and decrement it on a free operation. In order to do that, we need to have a way to get a stack_record from a handle. Implement __stack_depot_get_stack_record() which just does that, and make it public so page_owner can use it. Also, traversing all stackdepot buckets comes with its own complexity, plus we would have to implement a way to mark only those stack_records that were originated from page_owner, as those are the ones we are interested in. For that reason, page_owner maintains its own list of stack_records, because traversing that list is faster than traversing all buckets while keeping at the same time a low complexity. For now, add to stack_list only the stack_records of dummy_handle and failure_handle, and set their refcount of 1. Further patches will add code to increment or decrement stack_records count on allocation and free operation. Link: https://lkml.kernel.org/r/20240215215907.20121-4-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Reviewed-by: Marco Elver Acked-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 11 +++++++++++ lib/stackdepot.c | 8 ++++++++ mm/page_owner.c | 15 +++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index c4b5ad57c066..3c6caa5abc7c 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -178,6 +178,17 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); +/** + * __stack_depot_get_stack_record - Get a pointer to a stack_record struct + * + * @handle: Stack depot handle + * + * This function is only for internal purposes. + * + * Return: Returns a pointer to a stack_record struct + */ +struct stack_record *__stack_depot_get_stack_record(depot_stack_handle_t handle); + /** * stack_depot_fetch - Fetch a stack trace from stack depot * diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 514b8d40ff57..8c795bb20afb 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -687,6 +687,14 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, } EXPORT_SYMBOL_GPL(stack_depot_save); +struct stack_record *__stack_depot_get_stack_record(depot_stack_handle_t handle) +{ + if (!handle) + return NULL; + + return depot_fetch_stack(handle); +} + unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries) { diff --git a/mm/page_owner.c b/mm/page_owner.c index 5634e5d890f8..33e342b15d9b 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -36,6 +36,14 @@ struct page_owner { pid_t free_tgid; }; +struct stack { + struct stack_record *stack_record; + struct stack *next; +}; +static struct stack dummy_stack; +static struct stack failure_stack; +static struct stack *stack_list; + static bool page_owner_enabled __initdata; DEFINE_STATIC_KEY_FALSE(page_owner_inited); @@ -95,6 +103,13 @@ static __init void init_page_owner(void) register_early_stack(); static_branch_enable(&page_owner_inited); init_early_allocated_pages(); + /* Initialize dummy and failure stacks and link them to stack_list */ + dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle); + failure_stack.stack_record = __stack_depot_get_stack_record(failure_handle); + refcount_set(&dummy_stack.stack_record->count, 1); + refcount_set(&failure_stack.stack_record->count, 1); + dummy_stack.next = &failure_stack; + stack_list = &dummy_stack; } struct page_ext_operations page_owner_ops = { From 217b2119b9e260609958db413876f211038f00ee Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Feb 2024 22:59:04 +0100 Subject: [PATCH 236/435] mm,page_owner: implement the tracking of the stacks count Implement {inc,dec}_stack_record_count() which increments or decrements on respective allocation and free operations, via __reset_page_owner() (free operation) and __set_page_owner() (alloc operation). Newly allocated stack_record structs will be added to the list stack_list via add_stack_record_to_list(). Modifications on the list are protected via a spinlock with irqs disabled, since this code can also be reached from IRQ context. Link: https://lkml.kernel.org/r/20240215215907.20121-5-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Marco Elver Reviewed-by: Vlastimil Babka Acked-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/page_owner.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 33e342b15d9b..df6a923af5de 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -43,6 +43,7 @@ struct stack { static struct stack dummy_stack; static struct stack failure_stack; static struct stack *stack_list; +static DEFINE_SPINLOCK(stack_list_lock); static bool page_owner_enabled __initdata; DEFINE_STATIC_KEY_FALSE(page_owner_inited); @@ -150,11 +151,68 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags) return handle; } +static void add_stack_record_to_list(struct stack_record *stack_record, + gfp_t gfp_mask) +{ + unsigned long flags; + struct stack *stack; + + /* Filter gfp_mask the same way stackdepot does, for consistency */ + gfp_mask &= ~GFP_ZONEMASK; + gfp_mask &= (GFP_ATOMIC | GFP_KERNEL); + gfp_mask |= __GFP_NOWARN; + + stack = kmalloc(sizeof(*stack), gfp_mask); + if (!stack) + return; + + stack->stack_record = stack_record; + stack->next = NULL; + + spin_lock_irqsave(&stack_list_lock, flags); + stack->next = stack_list; + stack_list = stack; + spin_unlock_irqrestore(&stack_list_lock, flags); +} + +static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask) +{ + struct stack_record *stack_record = __stack_depot_get_stack_record(handle); + + if (!stack_record) + return; + + /* + * New stack_record's that do not use STACK_DEPOT_FLAG_GET start + * with REFCOUNT_SATURATED to catch spurious increments of their + * refcount. + * Since we do not use STACK_DEPOT_FLAG_GET API, let us + * set a refcount of 1 ourselves. + */ + if (refcount_read(&stack_record->count) == REFCOUNT_SATURATED) { + int old = REFCOUNT_SATURATED; + + if (atomic_try_cmpxchg_relaxed(&stack_record->count.refs, &old, 1)) + /* Add the new stack_record to our list */ + add_stack_record_to_list(stack_record, gfp_mask); + } + refcount_inc(&stack_record->count); +} + +static void dec_stack_record_count(depot_stack_handle_t handle) +{ + struct stack_record *stack_record = __stack_depot_get_stack_record(handle); + + if (stack_record) + refcount_dec(&stack_record->count); +} + void __reset_page_owner(struct page *page, unsigned short order) { int i; struct page_ext *page_ext; depot_stack_handle_t handle; + depot_stack_handle_t alloc_handle; struct page_owner *page_owner; u64 free_ts_nsec = local_clock(); @@ -162,17 +220,29 @@ void __reset_page_owner(struct page *page, unsigned short order) if (unlikely(!page_ext)) return; + page_owner = get_page_owner(page_ext); + alloc_handle = page_owner->handle; + handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); for (i = 0; i < (1 << order); i++) { __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); - page_owner = get_page_owner(page_ext); page_owner->free_handle = handle; page_owner->free_ts_nsec = free_ts_nsec; page_owner->free_pid = current->pid; page_owner->free_tgid = current->tgid; page_ext = page_ext_next(page_ext); + page_owner = get_page_owner(page_ext); } page_ext_put(page_ext); + if (alloc_handle != early_handle) + /* + * early_handle is being set as a handle for all those + * early allocated pages. See init_pages_in_zone(). + * Since their refcount is not being incremented because + * the machinery is not ready yet, we cannot decrement + * their refcount either. + */ + dec_stack_record_count(alloc_handle); } static inline void __set_page_owner_handle(struct page_ext *page_ext, @@ -214,6 +284,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order, return; __set_page_owner_handle(page_ext, handle, order, gfp_mask); page_ext_put(page_ext); + inc_stack_record_count(handle, gfp_mask); } void __set_page_owner_migrate_reason(struct page *page, int reason) From 765973a098037b198ffb813244dbdb45c875ad5f Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Feb 2024 22:59:05 +0100 Subject: [PATCH 237/435] mm,page_owner: display all stacks and their count This patch adds a new directory called 'page_owner_stacks' under /sys/kernel/debug/, with a file called 'show_stacks' in it. Reading from that file will show all stacks that were added by page_owner followed by their counting, giving us a clear overview of stack <-> count relationship. E.g: prep_new_page+0xa9/0x120 get_page_from_freelist+0x801/0x2210 __alloc_pages+0x18b/0x350 alloc_pages_mpol+0x91/0x1f0 folio_alloc+0x14/0x50 filemap_alloc_folio+0xb2/0x100 __filemap_get_folio+0x14a/0x490 ext4_write_begin+0xbd/0x4b0 [ext4] generic_perform_write+0xc1/0x1e0 ext4_buffered_write_iter+0x68/0xe0 [ext4] ext4_file_write_iter+0x70/0x740 [ext4] vfs_write+0x33d/0x420 ksys_write+0xa5/0xe0 do_syscall_64+0x80/0x160 entry_SYSCALL_64_after_hwframe+0x6e/0x76 stack_count: 4578 The seq stack_{start,next} functions will iterate through the list stack_list in order to print all stacks. Link: https://lkml.kernel.org/r/20240215215907.20121-6-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Marco Elver Reviewed-by: Vlastimil Babka Acked-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/page_owner.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index df6a923af5de..e99fbf822dd6 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -171,7 +171,13 @@ static void add_stack_record_to_list(struct stack_record *stack_record, spin_lock_irqsave(&stack_list_lock, flags); stack->next = stack_list; - stack_list = stack; + /* + * This pairs with smp_load_acquire() from function + * stack_start(). This guarantees that stack_start() + * will see an updated stack_list before starting to + * traverse the list. + */ + smp_store_release(&stack_list, stack); spin_unlock_irqrestore(&stack_list_lock, flags); } @@ -805,8 +811,90 @@ static const struct file_operations proc_page_owner_operations = { .llseek = lseek_page_owner, }; +static void *stack_start(struct seq_file *m, loff_t *ppos) +{ + struct stack *stack; + + if (*ppos == -1UL) + return NULL; + + if (!*ppos) { + /* + * This pairs with smp_store_release() from function + * add_stack_record_to_list(), so we get a consistent + * value of stack_list. + */ + stack = smp_load_acquire(&stack_list); + } else { + stack = m->private; + stack = stack->next; + } + + m->private = stack; + + return stack; +} + +static void *stack_next(struct seq_file *m, void *v, loff_t *ppos) +{ + struct stack *stack = v; + + stack = stack->next; + *ppos = stack ? *ppos + 1 : -1UL; + m->private = stack; + + return stack; +} + +static int stack_print(struct seq_file *m, void *v) +{ + int i, stack_count; + struct stack *stack = v; + unsigned long *entries; + unsigned long nr_entries; + struct stack_record *stack_record = stack->stack_record; + + nr_entries = stack_record->size; + entries = stack_record->entries; + stack_count = refcount_read(&stack_record->count) - 1; + + if (!nr_entries || nr_entries < 0 || stack_count < 1) + return 0; + + for (i = 0; i < nr_entries; i++) + seq_printf(m, " %pS\n", (void *)entries[i]); + seq_printf(m, "stack_count: %d\n\n", stack_count); + + return 0; +} + +static void stack_stop(struct seq_file *m, void *v) +{ +} + +static const struct seq_operations page_owner_stack_op = { + .start = stack_start, + .next = stack_next, + .stop = stack_stop, + .show = stack_print +}; + +static int page_owner_stack_open(struct inode *inode, struct file *file) +{ + return seq_open_private(file, &page_owner_stack_op, 0); +} + +static const struct file_operations page_owner_stack_operations = { + .open = page_owner_stack_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static int __init pageowner_init(void) { + struct dentry *dir; + if (!static_branch_unlikely(&page_owner_inited)) { pr_info("page_owner is disabled\n"); return 0; @@ -814,6 +902,9 @@ static int __init pageowner_init(void) debugfs_create_file("page_owner", 0400, NULL, NULL, &proc_page_owner_operations); + dir = debugfs_create_dir("page_owner_stacks", NULL); + debugfs_create_file("show_stacks", 0400, dir, NULL, + &page_owner_stack_operations); return 0; } From 05bb6f4e826bc8d02632ff942ee8d34b7be37379 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Feb 2024 22:59:06 +0100 Subject: [PATCH 238/435] mm,page_owner: filter out stacks by a threshold We want to be able to filter out the stacks based on a threshold we can can tune. By writing to 'count_threshold' file, we can adjust the threshold value. Link: https://lkml.kernel.org/r/20240215215907.20121-7-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Acked-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/page_owner.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index e99fbf822dd6..e56c1e92eccf 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -846,6 +846,8 @@ static void *stack_next(struct seq_file *m, void *v, loff_t *ppos) return stack; } +static unsigned long page_owner_stack_threshold; + static int stack_print(struct seq_file *m, void *v) { int i, stack_count; @@ -858,7 +860,8 @@ static int stack_print(struct seq_file *m, void *v) entries = stack_record->entries; stack_count = refcount_read(&stack_record->count) - 1; - if (!nr_entries || nr_entries < 0 || stack_count < 1) + if (!nr_entries || nr_entries < 0 || stack_count < 1 || + stack_count < page_owner_stack_threshold) return 0; for (i = 0; i < nr_entries; i++) @@ -891,6 +894,22 @@ static const struct file_operations page_owner_stack_operations = { .release = seq_release, }; +static int page_owner_threshold_get(void *data, u64 *val) +{ + *val = READ_ONCE(page_owner_stack_threshold); + return 0; +} + +static int page_owner_threshold_set(void *data, u64 val) +{ + WRITE_ONCE(page_owner_stack_threshold, val); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(proc_page_owner_threshold, &page_owner_threshold_get, + &page_owner_threshold_set, "%llu"); + + static int __init pageowner_init(void) { struct dentry *dir; @@ -905,6 +924,8 @@ static int __init pageowner_init(void) dir = debugfs_create_dir("page_owner_stacks", NULL); debugfs_create_file("show_stacks", 0400, dir, NULL, &page_owner_stack_operations); + debugfs_create_file("count_threshold", 0600, dir, NULL, + &proc_page_owner_threshold); return 0; } From ba6fe53772447968194a9c182f082b33ac1c8daa Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Feb 2024 22:59:07 +0100 Subject: [PATCH 239/435] mm,page_owner: update Documentation regarding page_owner_stacks Update page_owner documentation including the new page_owner_stacks feature to show how it can be used. Link: https://lkml.kernel.org/r/20240215215907.20121-8-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Reviewed-by: Marco Elver Acked-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Michal Hocko Signed-off-by: Andrew Morton --- Documentation/mm/page_owner.rst | 45 +++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst index 62e3f7ab23cc..0d0334cd5179 100644 --- a/Documentation/mm/page_owner.rst +++ b/Documentation/mm/page_owner.rst @@ -24,6 +24,11 @@ fragmentation statistics can be obtained through gfp flag information of each page. It is already implemented and activated if page owner is enabled. Other usages are more than welcome. +It can also be used to show all the stacks and their outstanding +allocations, which gives us a quick overview of where the memory is going +without the need to screen through all the pages and match the allocation +and free operation. + page owner is disabled by default. So, if you'd like to use it, you need to add "page_owner=on" to your boot cmdline. If the kernel is built with page owner and page owner is disabled in runtime due to not enabling @@ -68,6 +73,46 @@ Usage 4) Analyze information from page owner:: + cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt + cat stacks.txt + prep_new_page+0xa9/0x120 + get_page_from_freelist+0x7e6/0x2140 + __alloc_pages+0x18a/0x370 + new_slab+0xc8/0x580 + ___slab_alloc+0x1f2/0xaf0 + __slab_alloc.isra.86+0x22/0x40 + kmem_cache_alloc+0x31b/0x350 + __khugepaged_enter+0x39/0x100 + dup_mmap+0x1c7/0x5ce + copy_process+0x1afe/0x1c90 + kernel_clone+0x9a/0x3c0 + __do_sys_clone+0x66/0x90 + do_syscall_64+0x7f/0x160 + entry_SYSCALL_64_after_hwframe+0x6c/0x74 + stack_count: 234 + ... + ... + echo 7000 > /sys/kernel/debug/page_owner_stacks/count_threshold + cat /sys/kernel/debug/page_owner_stacks/show_stacks> stacks_7000.txt + cat stacks_7000.txt + prep_new_page+0xa9/0x120 + get_page_from_freelist+0x7e6/0x2140 + __alloc_pages+0x18a/0x370 + alloc_pages_mpol+0xdf/0x1e0 + folio_alloc+0x14/0x50 + filemap_alloc_folio+0xb0/0x100 + page_cache_ra_unbounded+0x97/0x180 + filemap_fault+0x4b4/0x1200 + __do_fault+0x2d/0x110 + do_pte_missing+0x4b0/0xa30 + __handle_mm_fault+0x7fa/0xb70 + handle_mm_fault+0x125/0x300 + do_user_addr_fault+0x3c9/0x840 + exc_page_fault+0x68/0x150 + asm_exc_page_fault+0x22/0x30 + stack_count: 8248 + ... + cat /sys/kernel/debug/page_owner > page_owner_full.txt ./page_owner_sort page_owner_full.txt sorted_page_owner.txt From 38f6b9af04c4b79f81b3c2a0f76d1de94b78d7bc Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:23 +0100 Subject: [PATCH 240/435] mm: vmalloc: add va_alloc() helper Patch series "Mitigate a vmap lock contention", v3. 1. Motivation - Offload global vmap locks making it scaled to number of CPUS; - If possible and there is an agreement, we can remove the "Per cpu kva allocator" to make the vmap code to be more simple; - There were complaints from XFS folk that a vmalloc might be contented on their workloads. 2. Design(high level overview) We introduce an effective vmap node logic. A node behaves as independent entity to serve an allocation request directly(if possible) from its pool. That way it bypasses a global vmap space that is protected by its own lock. An access to pools are serialized by CPUs. Number of nodes are equal to number of CPUs in a system. Please note the high threshold is bound to 128 nodes. Pools are size segregated and populated based on system demand. The maximum alloc request that can be stored into a segregated storage is 256 pages. The lazily drain path decays a pool by 25% as a first step and as second populates it by fresh freed VAs for reuse instead of returning them into a global space. When a VA is obtained(alloc path), it is stored in separate nodes. A va->va_start address is converted into a correct node where it should be placed and resided. Doing so we balance VAs across the nodes as a result an access becomes scalable. The addr_to_node() function does a proper address conversion to a correct node. A vmap space is divided on segments with fixed size, it is 16 pages. That way any address can be associated with a segment number. Number of segments are equal to num_possible_cpus() but not grater then 128. The numeration starts from 0. See below how it is converted: static inline unsigned int addr_to_node_id(unsigned long addr) { return (addr / zone_size) % nr_nodes; } On a free path, a VA can be easily found by converting its "va_start" address to a certain node it resides. It is moved from "busy" data to "lazy" data structure. Later on, as noted earlier, the lazy kworker decays each node pool and populates it by fresh incoming VAs. Please note, a VA is returned to a node that did an alloc request. 3. Test on AMD Ryzen Threadripper 3970X 32-Core Processor sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 94.41% 0.89% [kernel] [k] _raw_spin_lock 93.35% 93.07% [kernel] [k] native_queued_spin_lock_slowpath 76.13% 0.28% [kernel] [k] __vmalloc_node_range 72.96% 0.81% [kernel] [k] alloc_vmap_area 56.94% 0.00% [kernel] [k] __get_vm_area_node 41.95% 0.00% [kernel] [k] vmalloc 37.15% 0.01% [test_vmalloc] [k] full_fit_alloc_test 35.17% 0.00% [kernel] [k] ret_from_fork_asm 35.17% 0.00% [kernel] [k] ret_from_fork 35.17% 0.00% [kernel] [k] kthread 35.08% 0.00% [test_vmalloc] [k] test_func 34.45% 0.00% [test_vmalloc] [k] fix_size_alloc_test 28.09% 0.01% [test_vmalloc] [k] long_busy_list_alloc_test 23.53% 0.25% [kernel] [k] vfree.part.0 21.72% 0.00% [kernel] [k] remove_vm_area 20.08% 0.21% [kernel] [k] find_unlink_vmap_area 2.34% 0.61% [kernel] [k] free_vmap_area_noflush vs 82.32% 0.22% [test_vmalloc] [k] long_busy_list_alloc_test 63.36% 0.02% [kernel] [k] vmalloc 63.34% 2.64% [kernel] [k] __vmalloc_node_range 30.42% 4.46% [kernel] [k] vfree.part.0 28.98% 2.51% [kernel] [k] __alloc_pages_bulk 27.28% 0.19% [kernel] [k] __get_vm_area_node 26.13% 1.50% [kernel] [k] alloc_vmap_area 21.72% 21.67% [kernel] [k] clear_page_rep 19.51% 2.43% [kernel] [k] _raw_spin_lock 16.61% 16.51% [kernel] [k] native_queued_spin_lock_slowpath 13.40% 2.07% [kernel] [k] free_unref_page 10.62% 0.01% [kernel] [k] remove_vm_area 9.02% 8.73% [kernel] [k] insert_vmap_area 8.94% 0.00% [kernel] [k] ret_from_fork_asm 8.94% 0.00% [kernel] [k] ret_from_fork 8.94% 0.00% [kernel] [k] kthread 8.29% 0.00% [test_vmalloc] [k] test_func 7.81% 0.05% [test_vmalloc] [k] full_fit_alloc_test 5.30% 4.73% [kernel] [k] purge_vmap_node 4.47% 2.65% [kernel] [k] free_vmap_area_noflush confirms that a native_queued_spin_lock_slowpath goes down to 16.51% percent from 93.07%. The throughput is ~12x higher: urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 Run the test with following parameters: run_test_mask=7 nr_threads=64 Done. Check the kernel ring buffer to see the summary. real 10m51.271s user 0m0.013s sys 0m0.187s urezki@pc638:~$ urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 Run the test with following parameters: run_test_mask=7 nr_threads=64 Done. Check the kernel ring buffer to see the summary. real 0m51.301s user 0m0.015s sys 0m0.040s urezki@pc638:~$ This patch (of 11): Currently __alloc_vmap_area() function contains an open codded logic that finds and adjusts a VA based on allocation request. Introduce a va_alloc() helper that adjusts found VA only. There is no a functional change as a result of this patch. Link: https://lkml.kernel.org/r/20240102184633.748113-1-urezki@gmail.com Link: https://lkml.kernel.org/r/20240102184633.748113-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Lorenzo Stoakes Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Cc: Kazuhito Hagio Signed-off-by: Andrew Morton --- mm/vmalloc.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d12a17fc0c17..739401a9eafc 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1481,6 +1481,32 @@ adjust_va_to_fit_type(struct rb_root *root, struct list_head *head, return 0; } +static unsigned long +va_alloc(struct vmap_area *va, + struct rb_root *root, struct list_head *head, + unsigned long size, unsigned long align, + unsigned long vstart, unsigned long vend) +{ + unsigned long nva_start_addr; + int ret; + + if (va->va_start > vstart) + nva_start_addr = ALIGN(va->va_start, align); + else + nva_start_addr = ALIGN(vstart, align); + + /* Check the "vend" restriction. */ + if (nva_start_addr + size > vend) + return vend; + + /* Update the free vmap_area. */ + ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size); + if (WARN_ON_ONCE(ret)) + return vend; + + return nva_start_addr; +} + /* * Returns a start address of the newly allocated area, if success. * Otherwise a vend is returned that indicates failure. @@ -1493,7 +1519,6 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, bool adjust_search_size = true; unsigned long nva_start_addr; struct vmap_area *va; - int ret; /* * Do not adjust when: @@ -1511,18 +1536,8 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, if (unlikely(!va)) return vend; - if (va->va_start > vstart) - nva_start_addr = ALIGN(va->va_start, align); - else - nva_start_addr = ALIGN(vstart, align); - - /* Check the "vend" restriction. */ - if (nva_start_addr + size > vend) - return vend; - - /* Update the free vmap_area. */ - ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size); - if (WARN_ON_ONCE(ret)) + nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend); + if (nva_start_addr == vend) return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK From 5b75b8e1b9040b34f43809b1948eaa4e83e39112 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:24 +0100 Subject: [PATCH 241/435] mm: vmalloc: rename adjust_va_to_fit_type() function This patch renames the adjust_va_to_fit_type() function to va_clip() which is shorter and more expressive. There is no a functional change as a result of this patch. Link: https://lkml.kernel.org/r/20240102184633.748113-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Lorenzo Stoakes Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 739401a9eafc..10f289e86512 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1382,9 +1382,9 @@ classify_va_fit_type(struct vmap_area *va, } static __always_inline int -adjust_va_to_fit_type(struct rb_root *root, struct list_head *head, - struct vmap_area *va, unsigned long nva_start_addr, - unsigned long size) +va_clip(struct rb_root *root, struct list_head *head, + struct vmap_area *va, unsigned long nva_start_addr, + unsigned long size) { struct vmap_area *lva = NULL; enum fit_type type = classify_va_fit_type(va, nva_start_addr, size); @@ -1500,7 +1500,7 @@ va_alloc(struct vmap_area *va, return vend; /* Update the free vmap_area. */ - ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size); + ret = va_clip(root, head, va, nva_start_addr, size); if (WARN_ON_ONCE(ret)) return vend; @@ -4155,9 +4155,8 @@ retry: /* It is a BUG(), but trigger recovery instead. */ goto recovery; - ret = adjust_va_to_fit_type(&free_vmap_area_root, - &free_vmap_area_list, - va, start, size); + ret = va_clip(&free_vmap_area_root, + &free_vmap_area_list, va, start, size); if (WARN_ON_ONCE(unlikely(ret))) /* It is a BUG(), but trigger recovery instead. */ goto recovery; From 7fa8cee003166ef6db0bba70d610dbf173543811 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:25 +0100 Subject: [PATCH 242/435] mm: vmalloc: move vmap_init_free_space() down in vmalloc.c A vmap_init_free_space() is a function that setups a vmap space and is considered as part of initialization phase. Since a main entry which is vmalloc_init(), has been moved down in vmalloc.c it makes sense to follow the pattern. There is no a functional change as a result of this patch. Link: https://lkml.kernel.org/r/20240102184633.748113-4-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Lorenzo Stoakes Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 82 ++++++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 10f289e86512..06bd843d18ae 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2512,47 +2512,6 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align) kasan_populate_early_vm_area_shadow(vm->addr, vm->size); } -static void vmap_init_free_space(void) -{ - unsigned long vmap_start = 1; - const unsigned long vmap_end = ULONG_MAX; - struct vmap_area *busy, *free; - - /* - * B F B B B F - * -|-----|.....|-----|-----|-----|.....|- - * | The KVA space | - * |<--------------------------------->| - */ - list_for_each_entry(busy, &vmap_area_list, list) { - if (busy->va_start - vmap_start > 0) { - free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); - if (!WARN_ON_ONCE(!free)) { - free->va_start = vmap_start; - free->va_end = busy->va_start; - - insert_vmap_area_augment(free, NULL, - &free_vmap_area_root, - &free_vmap_area_list); - } - } - - vmap_start = busy->va_end; - } - - if (vmap_end - vmap_start > 0) { - free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); - if (!WARN_ON_ONCE(!free)) { - free->va_start = vmap_start; - free->va_end = vmap_end; - - insert_vmap_area_augment(free, NULL, - &free_vmap_area_root, - &free_vmap_area_list); - } - } -} - static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { @@ -4465,6 +4424,47 @@ module_init(proc_vmalloc_init); #endif +static void vmap_init_free_space(void) +{ + unsigned long vmap_start = 1; + const unsigned long vmap_end = ULONG_MAX; + struct vmap_area *busy, *free; + + /* + * B F B B B F + * -|-----|.....|-----|-----|-----|.....|- + * | The KVA space | + * |<--------------------------------->| + */ + list_for_each_entry(busy, &vmap_area_list, list) { + if (busy->va_start - vmap_start > 0) { + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (!WARN_ON_ONCE(!free)) { + free->va_start = vmap_start; + free->va_end = busy->va_start; + + insert_vmap_area_augment(free, NULL, + &free_vmap_area_root, + &free_vmap_area_list); + } + } + + vmap_start = busy->va_end; + } + + if (vmap_end - vmap_start > 0) { + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (!WARN_ON_ONCE(!free)) { + free->va_start = vmap_start; + free->va_end = vmap_end; + + insert_vmap_area_augment(free, NULL, + &free_vmap_area_root, + &free_vmap_area_list); + } + } +} + void __init vmalloc_init(void) { struct vmap_area *va; From d093602919ad5908532142a048539800fa94a0d1 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:26 +0100 Subject: [PATCH 243/435] mm: vmalloc: remove global vmap_area_root rb-tree Store allocated objects in a separate nodes. A va->va_start address is converted into a correct node where it should be placed and resided. An addr_to_node() function is used to do a proper address conversion to determine a node that contains a VA. Such approach balances VAs across nodes as a result an access becomes scalable. Number of nodes in a system depends on number of CPUs. Please note: 1. As of now allocated VAs are bound to a node-0. It means the patch does not give any difference comparing with a current behavior; 2. The global vmap_area_lock, vmap_area_root are removed as there is no need in it anymore. The vmap_area_list is still kept and is _empty_. It is exported for a kexec only; 3. The vmallocinfo and vread() have to be reworked to be able to handle multiple nodes. [urezki@gmail.com: mark vmap_init_free_space() with __init tag] Link: https://lkml.kernel.org/r/20240111132628.299644-1-urezki@gmail.com [urezki@gmail.com: fix a wrong value passed to __find_vmap_area()] Link: https://lkml.kernel.org/r/20240111121104.180993-1-urezki@gmail.com Link: https://lkml.kernel.org/r/20240102184633.748113-5-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Lorenzo Stoakes Reviewed-by: Anshuman Khandual Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 244 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 175 insertions(+), 69 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 06bd843d18ae..86efebf0e0c8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -728,11 +728,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn); #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 -static DEFINE_SPINLOCK(vmap_area_lock); static DEFINE_SPINLOCK(free_vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); -static struct rb_root vmap_area_root = RB_ROOT; static bool vmap_initialized __read_mostly; static struct rb_root purge_vmap_area_root = RB_ROOT; @@ -772,6 +770,38 @@ static struct rb_root free_vmap_area_root = RB_ROOT; */ static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); +/* + * An effective vmap-node logic. Users make use of nodes instead + * of a global heap. It allows to balance an access and mitigate + * contention. + */ +struct rb_list { + struct rb_root root; + struct list_head head; + spinlock_t lock; +}; + +static struct vmap_node { + /* Bookkeeping data of this node. */ + struct rb_list busy; +} single; + +static struct vmap_node *vmap_nodes = &single; +static __read_mostly unsigned int nr_vmap_nodes = 1; +static __read_mostly unsigned int vmap_zone_size = 1; + +static inline unsigned int +addr_to_node_id(unsigned long addr) +{ + return (addr / vmap_zone_size) % nr_vmap_nodes; +} + +static inline struct vmap_node * +addr_to_node(unsigned long addr) +{ + return &vmap_nodes[addr_to_node_id(addr)]; +} + static __always_inline unsigned long va_size(struct vmap_area *va) { @@ -803,10 +833,11 @@ unsigned long vmalloc_nr_pages(void) } /* Look up the first VA which satisfies addr < va_end, NULL if none. */ -static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) +static struct vmap_area * +find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) { struct vmap_area *va = NULL; - struct rb_node *n = vmap_area_root.rb_node; + struct rb_node *n = root->rb_node; addr = (unsigned long)kasan_reset_tag((void *)addr); @@ -1552,12 +1583,14 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, */ static void free_vmap_area(struct vmap_area *va) { + struct vmap_node *vn = addr_to_node(va->va_start); + /* * Remove from the busy tree/list. */ - spin_lock(&vmap_area_lock); - unlink_va(va, &vmap_area_root); - spin_unlock(&vmap_area_lock); + spin_lock(&vn->busy.lock); + unlink_va(va, &vn->busy.root); + spin_unlock(&vn->busy.lock); /* * Insert/Merge it back to the free tree/list. @@ -1600,6 +1633,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, int node, gfp_t gfp_mask, unsigned long va_flags) { + struct vmap_node *vn; struct vmap_area *va; unsigned long freed; unsigned long addr; @@ -1645,9 +1679,11 @@ retry: va->vm = NULL; va->flags = va_flags; - spin_lock(&vmap_area_lock); - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); - spin_unlock(&vmap_area_lock); + vn = addr_to_node(va->va_start); + + spin_lock(&vn->busy.lock); + insert_vmap_area(va, &vn->busy.root, &vn->busy.head); + spin_unlock(&vn->busy.lock); BUG_ON(!IS_ALIGNED(va->va_start, align)); BUG_ON(va->va_start < vstart); @@ -1871,26 +1907,61 @@ static void free_unmap_vmap_area(struct vmap_area *va) struct vmap_area *find_vmap_area(unsigned long addr) { + struct vmap_node *vn; struct vmap_area *va; + int i, j; - spin_lock(&vmap_area_lock); - va = __find_vmap_area(addr, &vmap_area_root); - spin_unlock(&vmap_area_lock); + /* + * An addr_to_node_id(addr) converts an address to a node index + * where a VA is located. If VA spans several zones and passed + * addr is not the same as va->va_start, what is not common, we + * may need to scan an extra nodes. See an example: + * + * <--va--> + * -|-----|-----|-----|-----|- + * 1 2 0 1 + * + * VA resides in node 1 whereas it spans 1 and 2. If passed + * addr is within a second node we should do extra work. We + * should mention that it is rare and is a corner case from + * the other hand it has to be covered. + */ + i = j = addr_to_node_id(addr); + do { + vn = &vmap_nodes[i]; - return va; + spin_lock(&vn->busy.lock); + va = __find_vmap_area(addr, &vn->busy.root); + spin_unlock(&vn->busy.lock); + + if (va) + return va; + } while ((i = (i + 1) % nr_vmap_nodes) != j); + + return NULL; } static struct vmap_area *find_unlink_vmap_area(unsigned long addr) { + struct vmap_node *vn; struct vmap_area *va; + int i, j; - spin_lock(&vmap_area_lock); - va = __find_vmap_area(addr, &vmap_area_root); - if (va) - unlink_va(va, &vmap_area_root); - spin_unlock(&vmap_area_lock); + i = j = addr_to_node_id(addr); + do { + vn = &vmap_nodes[i]; - return va; + spin_lock(&vn->busy.lock); + va = __find_vmap_area(addr, &vn->busy.root); + if (va) + unlink_va(va, &vn->busy.root); + spin_unlock(&vn->busy.lock); + + if (va) + return va; + } while ((i = (i + 1) % nr_vmap_nodes) != j); + + return NULL; } /*** Per cpu kva allocator ***/ @@ -2092,6 +2163,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) static void free_vmap_block(struct vmap_block *vb) { + struct vmap_node *vn; struct vmap_block *tmp; struct xarray *xa; @@ -2099,9 +2171,10 @@ static void free_vmap_block(struct vmap_block *vb) tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start)); BUG_ON(tmp != vb); - spin_lock(&vmap_area_lock); - unlink_va(vb->va, &vmap_area_root); - spin_unlock(&vmap_area_lock); + vn = addr_to_node(vb->va->va_start); + spin_lock(&vn->busy.lock); + unlink_va(vb->va, &vn->busy.root); + spin_unlock(&vn->busy.lock); free_vmap_area_noflush(vb->va); kfree_rcu(vb, rcu_head); @@ -2525,9 +2598,11 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { - spin_lock(&vmap_area_lock); + struct vmap_node *vn = addr_to_node(va->va_start); + + spin_lock(&vn->busy.lock); setup_vmalloc_vm_locked(vm, va, flags, caller); - spin_unlock(&vmap_area_lock); + spin_unlock(&vn->busy.lock); } static void clear_vm_uninitialized_flag(struct vm_struct *vm) @@ -3715,6 +3790,7 @@ finished: */ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) { + struct vmap_node *vn; struct vmap_area *va; struct vm_struct *vm; char *vaddr; @@ -3728,8 +3804,11 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) remains = count; - spin_lock(&vmap_area_lock); - va = find_vmap_area_exceed_addr((unsigned long)addr); + /* Hooked to node_0 so far. */ + vn = addr_to_node(0); + spin_lock(&vn->busy.lock); + + va = find_vmap_area_exceed_addr((unsigned long)addr, &vn->busy.root); if (!va) goto finished_zero; @@ -3737,7 +3816,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) if ((unsigned long)addr + remains <= va->va_start) goto finished_zero; - list_for_each_entry_from(va, &vmap_area_list, list) { + list_for_each_entry_from(va, &vn->busy.head, list) { size_t copied; if (remains == 0) @@ -3796,12 +3875,12 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) } finished_zero: - spin_unlock(&vmap_area_lock); + spin_unlock(&vn->busy.lock); /* zero-fill memory holes */ return count - remains + zero_iter(iter, remains); finished: /* Nothing remains, or We couldn't copy/zero everything. */ - spin_unlock(&vmap_area_lock); + spin_unlock(&vn->busy.lock); return count - remains; } @@ -4135,14 +4214,15 @@ retry: } /* insert all vm's */ - spin_lock(&vmap_area_lock); for (area = 0; area < nr_vms; area++) { - insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list); + struct vmap_node *vn = addr_to_node(vas[area]->va_start); + spin_lock(&vn->busy.lock); + insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head); setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, pcpu_get_vm_areas); + spin_unlock(&vn->busy.lock); } - spin_unlock(&vmap_area_lock); /* * Mark allocated areas as accessible. Do it now as a best-effort @@ -4253,55 +4333,57 @@ bool vmalloc_dump_obj(void *object) { void *objp = (void *)PAGE_ALIGN((unsigned long)object); const void *caller; - struct vm_struct *vm; struct vmap_area *va; + struct vmap_node *vn; unsigned long addr; unsigned int nr_pages; + bool success = false; - if (!spin_trylock(&vmap_area_lock)) - return false; - va = __find_vmap_area((unsigned long)objp, &vmap_area_root); - if (!va) { - spin_unlock(&vmap_area_lock); - return false; + vn = addr_to_node((unsigned long)objp); + + if (spin_trylock(&vn->busy.lock)) { + va = __find_vmap_area((unsigned long)objp, &vn->busy.root); + + if (va && va->vm) { + addr = (unsigned long)va->vm->addr; + caller = va->vm->caller; + nr_pages = va->vm->nr_pages; + success = true; + } + + spin_unlock(&vn->busy.lock); } - vm = va->vm; - if (!vm) { - spin_unlock(&vmap_area_lock); - return false; - } - addr = (unsigned long)vm->addr; - caller = vm->caller; - nr_pages = vm->nr_pages; - spin_unlock(&vmap_area_lock); - pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", - nr_pages, addr, caller); - return true; + if (success) + pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", + nr_pages, addr, caller); + + return success; } #endif #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) - __acquires(&vmap_purge_lock) - __acquires(&vmap_area_lock) { - mutex_lock(&vmap_purge_lock); - spin_lock(&vmap_area_lock); + struct vmap_node *vn = addr_to_node(0); - return seq_list_start(&vmap_area_list, *pos); + mutex_lock(&vmap_purge_lock); + spin_lock(&vn->busy.lock); + + return seq_list_start(&vn->busy.head, *pos); } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { - return seq_list_next(p, &vmap_area_list, pos); + struct vmap_node *vn = addr_to_node(0); + return seq_list_next(p, &vn->busy.head, pos); } static void s_stop(struct seq_file *m, void *p) - __releases(&vmap_area_lock) - __releases(&vmap_purge_lock) { - spin_unlock(&vmap_area_lock); + struct vmap_node *vn = addr_to_node(0); + + spin_unlock(&vn->busy.lock); mutex_unlock(&vmap_purge_lock); } @@ -4344,9 +4426,11 @@ static void show_purge_info(struct seq_file *m) static int s_show(struct seq_file *m, void *p) { + struct vmap_node *vn; struct vmap_area *va; struct vm_struct *v; + vn = addr_to_node(0); va = list_entry(p, struct vmap_area, list); if (!va->vm) { @@ -4397,7 +4481,7 @@ static int s_show(struct seq_file *m, void *p) * As a final step, dump "unpurged" areas. */ final: - if (list_is_last(&va->list, &vmap_area_list)) + if (list_is_last(&va->list, &vn->busy.head)) show_purge_info(m); return 0; @@ -4424,11 +4508,12 @@ module_init(proc_vmalloc_init); #endif -static void vmap_init_free_space(void) +static void __init vmap_init_free_space(void) { unsigned long vmap_start = 1; const unsigned long vmap_end = ULONG_MAX; - struct vmap_area *busy, *free; + struct vmap_area *free; + struct vm_struct *busy; /* * B F B B B F @@ -4436,12 +4521,12 @@ static void vmap_init_free_space(void) * | The KVA space | * |<--------------------------------->| */ - list_for_each_entry(busy, &vmap_area_list, list) { - if (busy->va_start - vmap_start > 0) { + for (busy = vmlist; busy; busy = busy->next) { + if ((unsigned long) busy->addr - vmap_start > 0) { free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); if (!WARN_ON_ONCE(!free)) { free->va_start = vmap_start; - free->va_end = busy->va_start; + free->va_end = (unsigned long) busy->addr; insert_vmap_area_augment(free, NULL, &free_vmap_area_root, @@ -4449,7 +4534,7 @@ static void vmap_init_free_space(void) } } - vmap_start = busy->va_end; + vmap_start = (unsigned long) busy->addr + busy->size; } if (vmap_end - vmap_start > 0) { @@ -4465,9 +4550,23 @@ static void vmap_init_free_space(void) } } +static void vmap_init_nodes(void) +{ + struct vmap_node *vn; + int i; + + for (i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; + vn->busy.root = RB_ROOT; + INIT_LIST_HEAD(&vn->busy.head); + spin_lock_init(&vn->busy.lock); + } +} + void __init vmalloc_init(void) { struct vmap_area *va; + struct vmap_node *vn; struct vm_struct *tmp; int i; @@ -4489,6 +4588,11 @@ void __init vmalloc_init(void) xa_init(&vbq->vmap_blocks); } + /* + * Setup nodes before importing vmlist. + */ + vmap_init_nodes(); + /* Import existing vmlist entries. */ for (tmp = vmlist; tmp; tmp = tmp->next) { va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); @@ -4498,7 +4602,9 @@ void __init vmalloc_init(void) va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; va->vm = tmp; - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + + vn = addr_to_node(va->va_start); + insert_vmap_area(va, &vn->busy.root, &vn->busy.head); } /* From 55c49fee57af99f3c663e69dedc5b85e691bbe50 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 2 Jan 2024 19:46:27 +0100 Subject: [PATCH 244/435] mm/vmalloc: remove vmap_area_list Earlier, vmap_area_list is exported to vmcoreinfo so that makedumpfile get the base address of vmalloc area. Now, vmap_area_list is empty, so export VMALLOC_START to vmcoreinfo instead, and remove vmap_area_list. [urezki@gmail.com: fix a warning in the crash_save_vmcoreinfo_init()] Link: https://lkml.kernel.org/r/20240111192329.449189-1-urezki@gmail.com Link: https://lkml.kernel.org/r/20240102184633.748113-6-urezki@gmail.com Signed-off-by: Baoquan He Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Lorenzo Stoakes Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 8 ++++---- arch/arm64/kernel/crash_core.c | 1 - arch/riscv/kernel/crash_core.c | 1 - include/linux/vmalloc.h | 1 - kernel/crash_core.c | 4 +--- kernel/kallsyms_selftest.c | 1 - mm/nommu.c | 2 -- mm/vmalloc.c | 2 -- 8 files changed, 5 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index bced9e4b6e08..0f714fc945ac 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -65,11 +65,11 @@ Defines the beginning of the text section. In general, _stext indicates the kernel start address. Used to convert a virtual address from the direct kernel map to a physical address. -vmap_area_list --------------- +VMALLOC_START +------------- -Stores the virtual area list. makedumpfile gets the vmalloc start value -from this variable and its value is necessary for vmalloc translation. +Stores the base address of vmalloc area. makedumpfile gets this value +since is necessary for vmalloc translation. mem_map ------- diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c index 66cde752cd74..2a24199a9b81 100644 --- a/arch/arm64/kernel/crash_core.c +++ b/arch/arm64/kernel/crash_core.c @@ -23,7 +23,6 @@ void arch_crash_save_vmcoreinfo(void) /* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */ vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR); vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END); - vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START); vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END); vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START); vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END); diff --git a/arch/riscv/kernel/crash_core.c b/arch/riscv/kernel/crash_core.c index 8706736fd4e2..d18d529fd9b9 100644 --- a/arch/riscv/kernel/crash_core.c +++ b/arch/riscv/kernel/crash_core.c @@ -8,7 +8,6 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_NUMBER(phys_ram_base); vmcoreinfo_append_str("NUMBER(PAGE_OFFSET)=0x%lx\n", PAGE_OFFSET); - vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START); vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END); #ifdef CONFIG_MMU VMCOREINFO_NUMBER(VA_BITS); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c720be70c8dd..91810b4e9510 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -253,7 +253,6 @@ extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count); /* * Internals. Don't use.. */ -extern struct list_head vmap_area_list; extern __init void vm_area_add_early(struct vm_struct *vm); extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 75cd6a736d03..49b31e59d3cc 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -748,7 +748,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); #endif VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmap_area_list); + vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", (unsigned long) VMALLOC_START); #ifndef CONFIG_NUMA VMCOREINFO_SYMBOL(mem_map); @@ -789,8 +789,6 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(free_area, free_list); VMCOREINFO_OFFSET(list_head, next); VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_OFFSET(vmap_area, va_start); - VMCOREINFO_OFFSET(vmap_area, list); VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); log_buf_vmcoreinfo_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index b4cac76ea5e9..8a689b4ff4f9 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -89,7 +89,6 @@ static struct test_item test_items[] = { ITEM_DATA(kallsyms_test_var_data_static), ITEM_DATA(kallsyms_test_var_bss), ITEM_DATA(kallsyms_test_var_data), - ITEM_DATA(vmap_area_list), #endif }; diff --git a/mm/nommu.c b/mm/nommu.c index b6dc558d3144..5ec8f44e7ce9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -131,8 +131,6 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL(follow_pfn); -LIST_HEAD(vmap_area_list); - void vfree(const void *addr) { kfree(addr); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 86efebf0e0c8..b5882790da00 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -729,8 +729,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn); static DEFINE_SPINLOCK(free_vmap_area_lock); -/* Export for kexec only */ -LIST_HEAD(vmap_area_list); static bool vmap_initialized __read_mostly; static struct rb_root purge_vmap_area_root = RB_ROOT; From 282631cb2447318e2a55b41a665dbe8571c46d70 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:28 +0100 Subject: [PATCH 245/435] mm: vmalloc: remove global purge_vmap_area_root rb-tree Similar to busy VA, lazily-freed area is stored to a node it belongs to. Such approach does not require any global locking primitive, instead an access becomes scalable what mitigates a contention. This patch removes a global purge-lock, global purge-tree and global purge list. Link: https://lkml.kernel.org/r/20240102184633.748113-7-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 135 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 82 insertions(+), 53 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b5882790da00..72822aeff55c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -731,10 +731,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn); static DEFINE_SPINLOCK(free_vmap_area_lock); static bool vmap_initialized __read_mostly; -static struct rb_root purge_vmap_area_root = RB_ROOT; -static LIST_HEAD(purge_vmap_area_list); -static DEFINE_SPINLOCK(purge_vmap_area_lock); - /* * This kmem_cache is used for vmap_area objects. Instead of * allocating from slab we reuse an object from this cache to @@ -782,6 +778,12 @@ struct rb_list { static struct vmap_node { /* Bookkeeping data of this node. */ struct rb_list busy; + struct rb_list lazy; + + /* + * Ready-to-free areas. + */ + struct list_head purge_list; } single; static struct vmap_node *vmap_nodes = &single; @@ -1766,40 +1768,22 @@ static DEFINE_MUTEX(vmap_purge_lock); /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); +static cpumask_t purge_nodes; /* * Purges all lazily-freed vmap areas. */ -static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) +static unsigned long +purge_vmap_node(struct vmap_node *vn) { - unsigned long resched_threshold; - unsigned int num_purged_areas = 0; - struct list_head local_purge_list; + unsigned long num_purged_areas = 0; struct vmap_area *va, *n_va; - lockdep_assert_held(&vmap_purge_lock); - - spin_lock(&purge_vmap_area_lock); - purge_vmap_area_root = RB_ROOT; - list_replace_init(&purge_vmap_area_list, &local_purge_list); - spin_unlock(&purge_vmap_area_lock); - - if (unlikely(list_empty(&local_purge_list))) - goto out; - - start = min(start, - list_first_entry(&local_purge_list, - struct vmap_area, list)->va_start); - - end = max(end, - list_last_entry(&local_purge_list, - struct vmap_area, list)->va_end); - - flush_tlb_kernel_range(start, end); - resched_threshold = lazy_max_pages() << 1; + if (list_empty(&vn->purge_list)) + return 0; spin_lock(&free_vmap_area_lock); - list_for_each_entry_safe(va, n_va, &local_purge_list, list) { + list_for_each_entry_safe(va, n_va, &vn->purge_list, list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; unsigned long orig_start = va->va_start; unsigned long orig_end = va->va_end; @@ -1821,13 +1805,55 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) atomic_long_sub(nr, &vmap_lazy_nr); num_purged_areas++; - - if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) - cond_resched_lock(&free_vmap_area_lock); } spin_unlock(&free_vmap_area_lock); -out: + return num_purged_areas; +} + +/* + * Purges all lazily-freed vmap areas. + */ +static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) +{ + unsigned long num_purged_areas = 0; + struct vmap_node *vn; + int i; + + lockdep_assert_held(&vmap_purge_lock); + purge_nodes = CPU_MASK_NONE; + + for (i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; + + INIT_LIST_HEAD(&vn->purge_list); + + if (RB_EMPTY_ROOT(&vn->lazy.root)) + continue; + + spin_lock(&vn->lazy.lock); + WRITE_ONCE(vn->lazy.root.rb_node, NULL); + list_replace_init(&vn->lazy.head, &vn->purge_list); + spin_unlock(&vn->lazy.lock); + + start = min(start, list_first_entry(&vn->purge_list, + struct vmap_area, list)->va_start); + + end = max(end, list_last_entry(&vn->purge_list, + struct vmap_area, list)->va_end); + + cpumask_set_cpu(i, &purge_nodes); + } + + if (cpumask_weight(&purge_nodes) > 0) { + flush_tlb_kernel_range(start, end); + + for_each_cpu(i, &purge_nodes) { + vn = &nodes[i]; + num_purged_areas += purge_vmap_node(vn); + } + } + trace_purge_vmap_area_lazy(start, end, num_purged_areas); return num_purged_areas > 0; } @@ -1846,16 +1872,9 @@ static void reclaim_and_purge_vmap_areas(void) static void drain_vmap_area_work(struct work_struct *work) { - unsigned long nr_lazy; - - do { - mutex_lock(&vmap_purge_lock); - __purge_vmap_area_lazy(ULONG_MAX, 0); - mutex_unlock(&vmap_purge_lock); - - /* Recheck if further work is required. */ - nr_lazy = atomic_long_read(&vmap_lazy_nr); - } while (nr_lazy > lazy_max_pages()); + mutex_lock(&vmap_purge_lock); + __purge_vmap_area_lazy(ULONG_MAX, 0); + mutex_unlock(&vmap_purge_lock); } /* @@ -1865,6 +1884,7 @@ static void drain_vmap_area_work(struct work_struct *work) */ static void free_vmap_area_noflush(struct vmap_area *va) { + struct vmap_node *vn = addr_to_node(va->va_start); unsigned long nr_lazy_max = lazy_max_pages(); unsigned long va_start = va->va_start; unsigned long nr_lazy; @@ -1878,10 +1898,9 @@ static void free_vmap_area_noflush(struct vmap_area *va) /* * Merge or place it to the purge tree/list. */ - spin_lock(&purge_vmap_area_lock); - merge_or_add_vmap_area(va, - &purge_vmap_area_root, &purge_vmap_area_list); - spin_unlock(&purge_vmap_area_lock); + spin_lock(&vn->lazy.lock); + merge_or_add_vmap_area(va, &vn->lazy.root, &vn->lazy.head); + spin_unlock(&vn->lazy.lock); trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max); @@ -4411,15 +4430,21 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) static void show_purge_info(struct seq_file *m) { + struct vmap_node *vn; struct vmap_area *va; + int i; - spin_lock(&purge_vmap_area_lock); - list_for_each_entry(va, &purge_vmap_area_list, list) { - seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", - (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + for (i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; + + spin_lock(&vn->lazy.lock); + list_for_each_entry(va, &vn->lazy.head, list) { + seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start); + } + spin_unlock(&vn->lazy.lock); } - spin_unlock(&purge_vmap_area_lock); } static int s_show(struct seq_file *m, void *p) @@ -4558,6 +4583,10 @@ static void vmap_init_nodes(void) vn->busy.root = RB_ROOT; INIT_LIST_HEAD(&vn->busy.head); spin_lock_init(&vn->busy.lock); + + vn->lazy.root = RB_ROOT; + INIT_LIST_HEAD(&vn->lazy.head); + spin_lock_init(&vn->lazy.lock); } } From 72210662c5a2b6005f6daea7fe293a0dc573e1a5 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:29 +0100 Subject: [PATCH 246/435] mm: vmalloc: offload free_vmap_area_lock lock Concurrent access to a global vmap space is a bottle-neck. We can simulate a high contention by running a vmalloc test suite. To address it, introduce an effective vmap node logic. Each node behaves as independent entity. When a node is accessed it serves a request directly(if possible) from its pool. This model has a size based pool for requests, i.e. pools are serialized and populated based on object size and real demand. A maximum object size that pool can handle is set to 256 pages. This technique reduces a pressure on the global vmap lock. Link: https://lkml.kernel.org/r/20240102184633.748113-8-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 389 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 343 insertions(+), 46 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 72822aeff55c..e8b9621ea02b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -775,7 +775,22 @@ struct rb_list { spinlock_t lock; }; +struct vmap_pool { + struct list_head head; + unsigned long len; +}; + +/* + * A fast size storage contains VAs up to 1M size. + */ +#define MAX_VA_SIZE_PAGES 256 + static struct vmap_node { + /* Simple size segregated storage. */ + struct vmap_pool pool[MAX_VA_SIZE_PAGES]; + spinlock_t pool_lock; + bool skip_populate; + /* Bookkeeping data of this node. */ struct rb_list busy; struct rb_list lazy; @@ -784,6 +799,8 @@ static struct vmap_node { * Ready-to-free areas. */ struct list_head purge_list; + struct work_struct purge_work; + unsigned long nr_purged; } single; static struct vmap_node *vmap_nodes = &single; @@ -802,6 +819,61 @@ addr_to_node(unsigned long addr) return &vmap_nodes[addr_to_node_id(addr)]; } +static inline struct vmap_node * +id_to_node(unsigned int id) +{ + return &vmap_nodes[id % nr_vmap_nodes]; +} + +/* + * We use the value 0 to represent "no node", that is why + * an encoded value will be the node-id incremented by 1. + * It is always greater then 0. A valid node_id which can + * be encoded is [0:nr_vmap_nodes - 1]. If a passed node_id + * is not valid 0 is returned. + */ +static unsigned int +encode_vn_id(unsigned int node_id) +{ + /* Can store U8_MAX [0:254] nodes. */ + if (node_id < nr_vmap_nodes) + return (node_id + 1) << BITS_PER_BYTE; + + /* Warn and no node encoded. */ + WARN_ONCE(1, "Encode wrong node id (%u)\n", node_id); + return 0; +} + +/* + * Returns an encoded node-id, the valid range is within + * [0:nr_vmap_nodes-1] values. Otherwise nr_vmap_nodes is + * returned if extracted data is wrong. + */ +static unsigned int +decode_vn_id(unsigned int val) +{ + unsigned int node_id = (val >> BITS_PER_BYTE) - 1; + + /* Can store U8_MAX [0:254] nodes. */ + if (node_id < nr_vmap_nodes) + return node_id; + + /* If it was _not_ zero, warn. */ + WARN_ONCE(node_id != UINT_MAX, + "Decode wrong node id (%d)\n", node_id); + + return nr_vmap_nodes; +} + +static bool +is_vn_id_valid(unsigned int node_id) +{ + if (node_id < nr_vmap_nodes) + return true; + + return false; +} + static __always_inline unsigned long va_size(struct vmap_area *va) { @@ -1623,6 +1695,104 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) kmem_cache_free(vmap_area_cachep, va); } +static struct vmap_pool * +size_to_va_pool(struct vmap_node *vn, unsigned long size) +{ + unsigned int idx = (size - 1) / PAGE_SIZE; + + if (idx < MAX_VA_SIZE_PAGES) + return &vn->pool[idx]; + + return NULL; +} + +static bool +node_pool_add_va(struct vmap_node *n, struct vmap_area *va) +{ + struct vmap_pool *vp; + + vp = size_to_va_pool(n, va_size(va)); + if (!vp) + return false; + + spin_lock(&n->pool_lock); + list_add(&va->list, &vp->head); + WRITE_ONCE(vp->len, vp->len + 1); + spin_unlock(&n->pool_lock); + + return true; +} + +static struct vmap_area * +node_pool_del_va(struct vmap_node *vn, unsigned long size, + unsigned long align, unsigned long vstart, + unsigned long vend) +{ + struct vmap_area *va = NULL; + struct vmap_pool *vp; + int err = 0; + + vp = size_to_va_pool(vn, size); + if (!vp || list_empty(&vp->head)) + return NULL; + + spin_lock(&vn->pool_lock); + if (!list_empty(&vp->head)) { + va = list_first_entry(&vp->head, struct vmap_area, list); + + if (IS_ALIGNED(va->va_start, align)) { + /* + * Do some sanity check and emit a warning + * if one of below checks detects an error. + */ + err |= (va_size(va) != size); + err |= (va->va_start < vstart); + err |= (va->va_end > vend); + + if (!WARN_ON_ONCE(err)) { + list_del_init(&va->list); + WRITE_ONCE(vp->len, vp->len - 1); + } else { + va = NULL; + } + } else { + list_move_tail(&va->list, &vp->head); + va = NULL; + } + } + spin_unlock(&vn->pool_lock); + + return va; +} + +static struct vmap_area * +node_alloc(unsigned long size, unsigned long align, + unsigned long vstart, unsigned long vend, + unsigned long *addr, unsigned int *vn_id) +{ + struct vmap_area *va; + + *vn_id = 0; + *addr = vend; + + /* + * Fallback to a global heap if not vmalloc or there + * is only one node. + */ + if (vstart != VMALLOC_START || vend != VMALLOC_END || + nr_vmap_nodes == 1) + return NULL; + + *vn_id = raw_smp_processor_id() % nr_vmap_nodes; + va = node_pool_del_va(id_to_node(*vn_id), size, align, vstart, vend); + *vn_id = encode_vn_id(*vn_id); + + if (va) + *addr = va->va_start; + + return va; +} + /* * Allocate a region of KVA of the specified size and alignment, within the * vstart and vend. @@ -1637,6 +1807,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, struct vmap_area *va; unsigned long freed; unsigned long addr; + unsigned int vn_id; int purged = 0; int ret; @@ -1647,11 +1818,23 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, return ERR_PTR(-EBUSY); might_sleep(); - gfp_mask = gfp_mask & GFP_RECLAIM_MASK; - va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); - if (unlikely(!va)) - return ERR_PTR(-ENOMEM); + /* + * If a VA is obtained from a global heap(if it fails here) + * it is anyway marked with this "vn_id" so it is returned + * to this pool's node later. Such way gives a possibility + * to populate pools based on users demand. + * + * On success a ready to go VA is returned. + */ + va = node_alloc(size, align, vstart, vend, &addr, &vn_id); + if (!va) { + gfp_mask = gfp_mask & GFP_RECLAIM_MASK; + + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); + if (unlikely(!va)) + return ERR_PTR(-ENOMEM); + } /* * Only scan the relevant parts containing pointers to other objects @@ -1660,10 +1843,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); retry: - preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); - addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, - size, align, vstart, vend); - spin_unlock(&free_vmap_area_lock); + if (addr == vend) { + preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); + addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, + size, align, vstart, vend); + spin_unlock(&free_vmap_area_lock); + } trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend); @@ -1677,7 +1862,7 @@ retry: va->va_start = addr; va->va_end = addr + size; va->vm = NULL; - va->flags = va_flags; + va->flags = (va_flags | vn_id); vn = addr_to_node(va->va_start); @@ -1770,63 +1955,135 @@ static DEFINE_MUTEX(vmap_purge_lock); static void purge_fragmented_blocks_allcpus(void); static cpumask_t purge_nodes; -/* - * Purges all lazily-freed vmap areas. - */ -static unsigned long -purge_vmap_node(struct vmap_node *vn) +static void +reclaim_list_global(struct list_head *head) { - unsigned long num_purged_areas = 0; - struct vmap_area *va, *n_va; + struct vmap_area *va, *n; - if (list_empty(&vn->purge_list)) - return 0; + if (list_empty(head)) + return; spin_lock(&free_vmap_area_lock); + list_for_each_entry_safe(va, n, head, list) + merge_or_add_vmap_area_augment(va, + &free_vmap_area_root, &free_vmap_area_list); + spin_unlock(&free_vmap_area_lock); +} + +static void +decay_va_pool_node(struct vmap_node *vn, bool full_decay) +{ + struct vmap_area *va, *nva; + struct list_head decay_list; + struct rb_root decay_root; + unsigned long n_decay; + int i; + + decay_root = RB_ROOT; + INIT_LIST_HEAD(&decay_list); + + for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { + struct list_head tmp_list; + + if (list_empty(&vn->pool[i].head)) + continue; + + INIT_LIST_HEAD(&tmp_list); + + /* Detach the pool, so no-one can access it. */ + spin_lock(&vn->pool_lock); + list_replace_init(&vn->pool[i].head, &tmp_list); + spin_unlock(&vn->pool_lock); + + if (full_decay) + WRITE_ONCE(vn->pool[i].len, 0); + + /* Decay a pool by ~25% out of left objects. */ + n_decay = vn->pool[i].len >> 2; + + list_for_each_entry_safe(va, nva, &tmp_list, list) { + list_del_init(&va->list); + merge_or_add_vmap_area(va, &decay_root, &decay_list); + + if (!full_decay) { + WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1); + + if (!--n_decay) + break; + } + } + + /* Attach the pool back if it has been partly decayed. */ + if (!full_decay && !list_empty(&tmp_list)) { + spin_lock(&vn->pool_lock); + list_replace_init(&tmp_list, &vn->pool[i].head); + spin_unlock(&vn->pool_lock); + } + } + + reclaim_list_global(&decay_list); +} + +static void purge_vmap_node(struct work_struct *work) +{ + struct vmap_node *vn = container_of(work, + struct vmap_node, purge_work); + struct vmap_area *va, *n_va; + LIST_HEAD(local_list); + + vn->nr_purged = 0; + list_for_each_entry_safe(va, n_va, &vn->purge_list, list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; unsigned long orig_start = va->va_start; unsigned long orig_end = va->va_end; + unsigned int vn_id = decode_vn_id(va->flags); - /* - * Finally insert or merge lazily-freed area. It is - * detached and there is no need to "unlink" it from - * anything. - */ - va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root, - &free_vmap_area_list); - - if (!va) - continue; + list_del_init(&va->list); if (is_vmalloc_or_module_addr((void *)orig_start)) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); atomic_long_sub(nr, &vmap_lazy_nr); - num_purged_areas++; - } - spin_unlock(&free_vmap_area_lock); + vn->nr_purged++; - return num_purged_areas; + if (is_vn_id_valid(vn_id) && !vn->skip_populate) + if (node_pool_add_va(vn, va)) + continue; + + /* Go back to global. */ + list_add(&va->list, &local_list); + } + + reclaim_list_global(&local_list); } /* * Purges all lazily-freed vmap areas. */ -static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) +static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, + bool full_pool_decay) { - unsigned long num_purged_areas = 0; + unsigned long nr_purged_areas = 0; + unsigned int nr_purge_helpers; + unsigned int nr_purge_nodes; struct vmap_node *vn; int i; lockdep_assert_held(&vmap_purge_lock); + + /* + * Use cpumask to mark which node has to be processed. + */ purge_nodes = CPU_MASK_NONE; for (i = 0; i < nr_vmap_nodes; i++) { vn = &vmap_nodes[i]; INIT_LIST_HEAD(&vn->purge_list); + vn->skip_populate = full_pool_decay; + decay_va_pool_node(vn, full_pool_decay); if (RB_EMPTY_ROOT(&vn->lazy.root)) continue; @@ -1845,17 +2102,45 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) cpumask_set_cpu(i, &purge_nodes); } - if (cpumask_weight(&purge_nodes) > 0) { + nr_purge_nodes = cpumask_weight(&purge_nodes); + if (nr_purge_nodes > 0) { flush_tlb_kernel_range(start, end); + /* One extra worker is per a lazy_max_pages() full set minus one. */ + nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages(); + nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1; + for_each_cpu(i, &purge_nodes) { - vn = &nodes[i]; - num_purged_areas += purge_vmap_node(vn); + vn = &vmap_nodes[i]; + + if (nr_purge_helpers > 0) { + INIT_WORK(&vn->purge_work, purge_vmap_node); + + if (cpumask_test_cpu(i, cpu_online_mask)) + schedule_work_on(i, &vn->purge_work); + else + schedule_work(&vn->purge_work); + + nr_purge_helpers--; + } else { + vn->purge_work.func = NULL; + purge_vmap_node(&vn->purge_work); + nr_purged_areas += vn->nr_purged; + } + } + + for_each_cpu(i, &purge_nodes) { + vn = &vmap_nodes[i]; + + if (vn->purge_work.func) { + flush_work(&vn->purge_work); + nr_purged_areas += vn->nr_purged; + } } } - trace_purge_vmap_area_lazy(start, end, num_purged_areas); - return num_purged_areas > 0; + trace_purge_vmap_area_lazy(start, end, nr_purged_areas); + return nr_purged_areas > 0; } /* @@ -1866,14 +2151,14 @@ static void reclaim_and_purge_vmap_areas(void) { mutex_lock(&vmap_purge_lock); purge_fragmented_blocks_allcpus(); - __purge_vmap_area_lazy(ULONG_MAX, 0); + __purge_vmap_area_lazy(ULONG_MAX, 0, true); mutex_unlock(&vmap_purge_lock); } static void drain_vmap_area_work(struct work_struct *work) { mutex_lock(&vmap_purge_lock); - __purge_vmap_area_lazy(ULONG_MAX, 0); + __purge_vmap_area_lazy(ULONG_MAX, 0, false); mutex_unlock(&vmap_purge_lock); } @@ -1884,9 +2169,10 @@ static void drain_vmap_area_work(struct work_struct *work) */ static void free_vmap_area_noflush(struct vmap_area *va) { - struct vmap_node *vn = addr_to_node(va->va_start); unsigned long nr_lazy_max = lazy_max_pages(); unsigned long va_start = va->va_start; + unsigned int vn_id = decode_vn_id(va->flags); + struct vmap_node *vn; unsigned long nr_lazy; if (WARN_ON_ONCE(!list_empty(&va->list))) @@ -1896,10 +2182,14 @@ static void free_vmap_area_noflush(struct vmap_area *va) PAGE_SHIFT, &vmap_lazy_nr); /* - * Merge or place it to the purge tree/list. + * If it was request by a certain node we would like to + * return it to that node, i.e. its pool for later reuse. */ + vn = is_vn_id_valid(vn_id) ? + id_to_node(vn_id):addr_to_node(va->va_start); + spin_lock(&vn->lazy.lock); - merge_or_add_vmap_area(va, &vn->lazy.root, &vn->lazy.head); + insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head); spin_unlock(&vn->lazy.lock); trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max); @@ -2408,7 +2698,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) } free_purged_blocks(&purge_list); - if (!__purge_vmap_area_lazy(start, end) && flush) + if (!__purge_vmap_area_lazy(start, end, false) && flush) flush_tlb_kernel_range(start, end); mutex_unlock(&vmap_purge_lock); } @@ -4576,7 +4866,7 @@ static void __init vmap_init_free_space(void) static void vmap_init_nodes(void) { struct vmap_node *vn; - int i; + int i, j; for (i = 0; i < nr_vmap_nodes; i++) { vn = &vmap_nodes[i]; @@ -4587,6 +4877,13 @@ static void vmap_init_nodes(void) vn->lazy.root = RB_ROOT; INIT_LIST_HEAD(&vn->lazy.head); spin_lock_init(&vn->lazy.lock); + + for (j = 0; j < MAX_VA_SIZE_PAGES; j++) { + INIT_LIST_HEAD(&vn->pool[j].head); + WRITE_ONCE(vn->pool[j].len, 0); + } + + spin_lock_init(&vn->pool_lock); } } From 96aa8437d169b8e030a98e2b74fd9a8ee9d3be7e Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 2 Feb 2024 20:06:28 +0100 Subject: [PATCH 247/435] mm: vmalloc: add a scan area of VA only once Invoke a kmemleak_scan_area() function only for newly allocated objects to add a scan area within that object. There is no reason to add a same scan area(pointer to beginning or inside the object) several times. If a VA is obtained from the cache its scan area has already been associated. Link: https://lkml.kernel.org/r/20240202190628.47806-1-urezki@gmail.com Fixes: 7db166b4aa0d ("mm: vmalloc: offload free_vmap_area_lock lock") Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Lorenzo Stoakes Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e8b9621ea02b..75e96cf377ef 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1834,13 +1834,13 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); - } - /* - * Only scan the relevant parts containing pointers to other objects - * to avoid false negatives. - */ - kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); + /* + * Only scan the relevant parts containing pointers to other objects + * to avoid false negatives. + */ + kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); + } retry: if (addr == vend) { From 53becf32aec1c8049b854f0c31a11df5ed75df6f Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:30 +0100 Subject: [PATCH 248/435] mm: vmalloc: support multiple nodes in vread_iter Extend the vread_iter() to be able to perform a sequential reading of VAs which are spread among multiple nodes. So a data read over the /dev/kmem correctly reflects a vmalloc memory layout. Link: https://lkml.kernel.org/r/20240102184633.748113-9-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 67 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 14 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 75e96cf377ef..5a2db5b66333 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -906,7 +906,7 @@ unsigned long vmalloc_nr_pages(void) /* Look up the first VA which satisfies addr < va_end, NULL if none. */ static struct vmap_area * -find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) +__find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) { struct vmap_area *va = NULL; struct rb_node *n = root->rb_node; @@ -930,6 +930,41 @@ find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) return va; } +/* + * Returns a node where a first VA, that satisfies addr < va_end, resides. + * If success, a node is locked. A user is responsible to unlock it when a + * VA is no longer needed to be accessed. + * + * Returns NULL if nothing found. + */ +static struct vmap_node * +find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va) +{ + struct vmap_node *vn, *va_node = NULL; + struct vmap_area *va_lowest; + int i; + + for (i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; + + spin_lock(&vn->busy.lock); + va_lowest = __find_vmap_area_exceed_addr(addr, &vn->busy.root); + if (va_lowest) { + if (!va_node || va_lowest->va_start < (*va)->va_start) { + if (va_node) + spin_unlock(&va_node->busy.lock); + + *va = va_lowest; + va_node = vn; + continue; + } + } + spin_unlock(&vn->busy.lock); + } + + return va_node; +} + static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) { struct rb_node *n = root->rb_node; @@ -4102,6 +4137,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) struct vm_struct *vm; char *vaddr; size_t n, size, flags, remains; + unsigned long next; addr = kasan_reset_tag(addr); @@ -4111,19 +4147,15 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) remains = count; - /* Hooked to node_0 so far. */ - vn = addr_to_node(0); - spin_lock(&vn->busy.lock); - - va = find_vmap_area_exceed_addr((unsigned long)addr, &vn->busy.root); - if (!va) + vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va); + if (!vn) goto finished_zero; /* no intersects with alive vmap_area */ if ((unsigned long)addr + remains <= va->va_start) goto finished_zero; - list_for_each_entry_from(va, &vn->busy.head, list) { + do { size_t copied; if (remains == 0) @@ -4138,10 +4170,10 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) WARN_ON(flags == VMAP_BLOCK); if (!vm && !flags) - continue; + goto next_va; if (vm && (vm->flags & VM_UNINITIALIZED)) - continue; + goto next_va; /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ smp_rmb(); @@ -4150,7 +4182,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) size = vm ? get_vm_area_size(vm) : va_size(va); if (addr >= vaddr + size) - continue; + goto next_va; if (addr < vaddr) { size_t to_zero = min_t(size_t, vaddr - addr, remains); @@ -4179,15 +4211,22 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) if (copied != n) goto finished; - } + + next_va: + next = va->va_end; + spin_unlock(&vn->busy.lock); + } while ((vn = find_vmap_area_exceed_addr_lock(next, &va))); finished_zero: - spin_unlock(&vn->busy.lock); + if (vn) + spin_unlock(&vn->busy.lock); + /* zero-fill memory holes */ return count - remains + zero_iter(iter, remains); finished: /* Nothing remains, or We couldn't copy/zero everything. */ - spin_unlock(&vn->busy.lock); + if (vn) + spin_unlock(&vn->busy.lock); return count - remains; } From 8e1d743f2c2671aa54f6f91a2b33823f92512870 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:31 +0100 Subject: [PATCH 249/435] mm: vmalloc: support multiple nodes in vmallocinfo Allocated areas are spread among nodes, it implies that the scanning has to be performed individually of each node in order to dump all existing VAs. Link: https://lkml.kernel.org/r/20240102184633.748113-10-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 142 +++++++++++++++++++++------------------------------ 1 file changed, 58 insertions(+), 84 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5a2db5b66333..41f924a9b52e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4709,30 +4709,6 @@ bool vmalloc_dump_obj(void *object) #endif #ifdef CONFIG_PROC_FS -static void *s_start(struct seq_file *m, loff_t *pos) -{ - struct vmap_node *vn = addr_to_node(0); - - mutex_lock(&vmap_purge_lock); - spin_lock(&vn->busy.lock); - - return seq_list_start(&vn->busy.head, *pos); -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) -{ - struct vmap_node *vn = addr_to_node(0); - return seq_list_next(p, &vn->busy.head, pos); -} - -static void s_stop(struct seq_file *m, void *p) -{ - struct vmap_node *vn = addr_to_node(0); - - spin_unlock(&vn->busy.lock); - mutex_unlock(&vmap_purge_lock); -} - static void show_numa_info(struct seq_file *m, struct vm_struct *v) { if (IS_ENABLED(CONFIG_NUMA)) { @@ -4776,84 +4752,82 @@ static void show_purge_info(struct seq_file *m) } } -static int s_show(struct seq_file *m, void *p) +static int vmalloc_info_show(struct seq_file *m, void *p) { struct vmap_node *vn; struct vmap_area *va; struct vm_struct *v; + int i; - vn = addr_to_node(0); - va = list_entry(p, struct vmap_area, list); + for (i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; - if (!va->vm) { - if (va->flags & VMAP_RAM) - seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", - (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + spin_lock(&vn->busy.lock); + list_for_each_entry(va, &vn->busy.head, list) { + if (!va->vm) { + if (va->flags & VMAP_RAM) + seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start); - goto final; + continue; + } + + v = va->vm; + + seq_printf(m, "0x%pK-0x%pK %7ld", + v->addr, v->addr + v->size, v->size); + + if (v->caller) + seq_printf(m, " %pS", v->caller); + + if (v->nr_pages) + seq_printf(m, " pages=%d", v->nr_pages); + + if (v->phys_addr) + seq_printf(m, " phys=%pa", &v->phys_addr); + + if (v->flags & VM_IOREMAP) + seq_puts(m, " ioremap"); + + if (v->flags & VM_ALLOC) + seq_puts(m, " vmalloc"); + + if (v->flags & VM_MAP) + seq_puts(m, " vmap"); + + if (v->flags & VM_USERMAP) + seq_puts(m, " user"); + + if (v->flags & VM_DMA_COHERENT) + seq_puts(m, " dma-coherent"); + + if (is_vmalloc_addr(v->pages)) + seq_puts(m, " vpages"); + + show_numa_info(m, v); + seq_putc(m, '\n'); + } + spin_unlock(&vn->busy.lock); } - v = va->vm; - - seq_printf(m, "0x%pK-0x%pK %7ld", - v->addr, v->addr + v->size, v->size); - - if (v->caller) - seq_printf(m, " %pS", v->caller); - - if (v->nr_pages) - seq_printf(m, " pages=%d", v->nr_pages); - - if (v->phys_addr) - seq_printf(m, " phys=%pa", &v->phys_addr); - - if (v->flags & VM_IOREMAP) - seq_puts(m, " ioremap"); - - if (v->flags & VM_ALLOC) - seq_puts(m, " vmalloc"); - - if (v->flags & VM_MAP) - seq_puts(m, " vmap"); - - if (v->flags & VM_USERMAP) - seq_puts(m, " user"); - - if (v->flags & VM_DMA_COHERENT) - seq_puts(m, " dma-coherent"); - - if (is_vmalloc_addr(v->pages)) - seq_puts(m, " vpages"); - - show_numa_info(m, v); - seq_putc(m, '\n'); - /* * As a final step, dump "unpurged" areas. */ -final: - if (list_is_last(&va->list, &vn->busy.head)) - show_purge_info(m); - + show_purge_info(m); return 0; } -static const struct seq_operations vmalloc_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; - static int __init proc_vmalloc_init(void) { + void *priv_data = NULL; + if (IS_ENABLED(CONFIG_NUMA)) - proc_create_seq_private("vmallocinfo", 0400, NULL, - &vmalloc_op, - nr_node_ids * sizeof(unsigned int), NULL); - else - proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op); + priv_data = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); + + proc_create_single_data("vmallocinfo", + 0400, NULL, vmalloc_info_show, priv_data); + return 0; } module_init(proc_vmalloc_init); From 8f33a2ff307248c3e55a7696f60b3658b28edb57 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:32 +0100 Subject: [PATCH 250/435] mm: vmalloc: set nr_nodes based on CPUs in a system A number of nodes which are used in the alloc/free paths is set based on num_possible_cpus() in a system. Please note a high limit threshold though is fixed and corresponds to 128 nodes. For 32-bit or single core systems an access to a global vmap heap is not balanced. Such small systems do not suffer from lock contentions due to low number of CPUs. In such case the nr_nodes is equal to 1. Test on AMD Ryzen Threadripper 3970X 32-Core Processor: sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 94.41% 0.89% [kernel] [k] _raw_spin_lock 93.35% 93.07% [kernel] [k] native_queued_spin_lock_slowpath 76.13% 0.28% [kernel] [k] __vmalloc_node_range 72.96% 0.81% [kernel] [k] alloc_vmap_area 56.94% 0.00% [kernel] [k] __get_vm_area_node 41.95% 0.00% [kernel] [k] vmalloc 37.15% 0.01% [test_vmalloc] [k] full_fit_alloc_test 35.17% 0.00% [kernel] [k] ret_from_fork_asm 35.17% 0.00% [kernel] [k] ret_from_fork 35.17% 0.00% [kernel] [k] kthread 35.08% 0.00% [test_vmalloc] [k] test_func 34.45% 0.00% [test_vmalloc] [k] fix_size_alloc_test 28.09% 0.01% [test_vmalloc] [k] long_busy_list_alloc_test 23.53% 0.25% [kernel] [k] vfree.part.0 21.72% 0.00% [kernel] [k] remove_vm_area 20.08% 0.21% [kernel] [k] find_unlink_vmap_area 2.34% 0.61% [kernel] [k] free_vmap_area_noflush vs 82.32% 0.22% [test_vmalloc] [k] long_busy_list_alloc_test 63.36% 0.02% [kernel] [k] vmalloc 63.34% 2.64% [kernel] [k] __vmalloc_node_range 30.42% 4.46% [kernel] [k] vfree.part.0 28.98% 2.51% [kernel] [k] __alloc_pages_bulk 27.28% 0.19% [kernel] [k] __get_vm_area_node 26.13% 1.50% [kernel] [k] alloc_vmap_area 21.72% 21.67% [kernel] [k] clear_page_rep 19.51% 2.43% [kernel] [k] _raw_spin_lock 16.61% 16.51% [kernel] [k] native_queued_spin_lock_slowpath 13.40% 2.07% [kernel] [k] free_unref_page 10.62% 0.01% [kernel] [k] remove_vm_area 9.02% 8.73% [kernel] [k] insert_vmap_area 8.94% 0.00% [kernel] [k] ret_from_fork_asm 8.94% 0.00% [kernel] [k] ret_from_fork 8.94% 0.00% [kernel] [k] kthread 8.29% 0.00% [test_vmalloc] [k] test_func 7.81% 0.05% [test_vmalloc] [k] full_fit_alloc_test 5.30% 4.73% [kernel] [k] purge_vmap_node 4.47% 2.65% [kernel] [k] free_vmap_area_noflush confirms that a native_queued_spin_lock_slowpath goes down to 16.51% percent from 93.07%. The throughput is ~12x higher: urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 Run the test with following parameters: run_test_mask=7 nr_threads=64 Done. Check the kernel ring buffer to see the summary. real 10m51.271s user 0m0.013s sys 0m0.187s urezki@pc638:~$ urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 Run the test with following parameters: run_test_mask=7 nr_threads=64 Done. Check the kernel ring buffer to see the summary. real 0m51.301s user 0m0.015s sys 0m0.040s urezki@pc638:~$ Link: https://lkml.kernel.org/r/20240102184633.748113-11-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 41f924a9b52e..4d90a6ad83aa 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4879,10 +4879,27 @@ static void __init vmap_init_free_space(void) static void vmap_init_nodes(void) { struct vmap_node *vn; - int i, j; + int i, n; - for (i = 0; i < nr_vmap_nodes; i++) { - vn = &vmap_nodes[i]; +#if BITS_PER_LONG == 64 + /* A high threshold of max nodes is fixed and bound to 128. */ + n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); + + if (n > 1) { + vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN); + if (vn) { + /* Node partition is 16 pages. */ + vmap_zone_size = (1 << 4) * PAGE_SIZE; + nr_vmap_nodes = n; + vmap_nodes = vn; + } else { + pr_err("Failed to allocate an array. Disable a node layer\n"); + } + } +#endif + + for (n = 0; n < nr_vmap_nodes; n++) { + vn = &vmap_nodes[n]; vn->busy.root = RB_ROOT; INIT_LIST_HEAD(&vn->busy.head); spin_lock_init(&vn->busy.lock); @@ -4891,9 +4908,9 @@ static void vmap_init_nodes(void) INIT_LIST_HEAD(&vn->lazy.head); spin_lock_init(&vn->lazy.lock); - for (j = 0; j < MAX_VA_SIZE_PAGES; j++) { - INIT_LIST_HEAD(&vn->pool[j].head); - WRITE_ONCE(vn->pool[j].len, 0); + for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { + INIT_LIST_HEAD(&vn->pool[i].head); + WRITE_ONCE(vn->pool[i].len, 0); } spin_lock_init(&vn->pool_lock); From 7679ba6b36dbb300b757b672d6a32a606499e14b Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 2 Jan 2024 19:46:33 +0100 Subject: [PATCH 251/435] mm: vmalloc: add a shrinker to drain vmap pools The added shrinker is used to return back current cached VAs into a global vmap space, when a system enters into a low memory mode. Link: https://lkml.kernel.org/r/20240102184633.748113-12-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Kazuhito Hagio Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- mm/vmalloc.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4d90a6ad83aa..6746491de381 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4917,8 +4917,37 @@ static void vmap_init_nodes(void) } } +static unsigned long +vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long count; + struct vmap_node *vn; + int i, j; + + for (count = 0, i = 0; i < nr_vmap_nodes; i++) { + vn = &vmap_nodes[i]; + + for (j = 0; j < MAX_VA_SIZE_PAGES; j++) + count += READ_ONCE(vn->pool[j].len); + } + + return count ? count : SHRINK_EMPTY; +} + +static unsigned long +vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + int i; + + for (i = 0; i < nr_vmap_nodes; i++) + decay_va_pool_node(&vmap_nodes[i], true); + + return SHRINK_STOP; +} + void __init vmalloc_init(void) { + struct shrinker *vmap_node_shrinker; struct vmap_area *va; struct vmap_node *vn; struct vm_struct *tmp; @@ -4966,4 +4995,14 @@ void __init vmalloc_init(void) */ vmap_init_free_space(); vmap_initialized = true; + + vmap_node_shrinker = shrinker_alloc(0, "vmap-node"); + if (!vmap_node_shrinker) { + pr_err("Failed to allocate vmap-node shrinker!\n"); + return; + } + + vmap_node_shrinker->count_objects = vmap_node_shrink_count; + vmap_node_shrinker->scan_objects = vmap_node_shrink_scan; + shrinker_register(vmap_node_shrinker); } From 15e02a39fb6b43f37100563c6a246252d5d1e6da Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 24 Jan 2024 19:09:19 +0100 Subject: [PATCH 252/435] mm: vmalloc: improve description of vmap node layer This patch adds extra explanation of recently added vmap node layer based on community feedback. No functional change. Link: https://lkml.kernel.org/r/20240124180920.50725-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Lorenzo Stoakes Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 60 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6746491de381..568f6c0b1fb5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -765,9 +765,10 @@ static struct rb_root free_vmap_area_root = RB_ROOT; static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); /* - * An effective vmap-node logic. Users make use of nodes instead - * of a global heap. It allows to balance an access and mitigate - * contention. + * This structure defines a single, solid model where a list and + * rb-tree are part of one entity protected by the lock. Nodes are + * sorted in ascending order, thus for O(1) access to left/right + * neighbors a list is used as well as for sequential traversal. */ struct rb_list { struct rb_root root; @@ -775,16 +776,23 @@ struct rb_list { spinlock_t lock; }; +/* + * A fast size storage contains VAs up to 1M size. A pool consists + * of linked between each other ready to go VAs of certain sizes. + * An index in the pool-array corresponds to number of pages + 1. + */ +#define MAX_VA_SIZE_PAGES 256 + struct vmap_pool { struct list_head head; unsigned long len; }; /* - * A fast size storage contains VAs up to 1M size. + * An effective vmap-node logic. Users make use of nodes instead + * of a global heap. It allows to balance an access and mitigate + * contention. */ -#define MAX_VA_SIZE_PAGES 256 - static struct vmap_node { /* Simple size segregated storage. */ struct vmap_pool pool[MAX_VA_SIZE_PAGES]; @@ -803,6 +811,11 @@ static struct vmap_node { unsigned long nr_purged; } single; +/* + * Initial setup consists of one single node, i.e. a balancing + * is fully disabled. Later on, after vmap is initialized these + * parameters are updated based on a system capacity. + */ static struct vmap_node *vmap_nodes = &single; static __read_mostly unsigned int nr_vmap_nodes = 1; static __read_mostly unsigned int vmap_zone_size = 1; @@ -2048,7 +2061,12 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay) } } - /* Attach the pool back if it has been partly decayed. */ + /* + * Attach the pool back if it has been partly decayed. + * Please note, it is supposed that nobody(other contexts) + * can populate the pool therefore a simple list replace + * operation takes place here. + */ if (!full_decay && !list_empty(&tmp_list)) { spin_lock(&vn->pool_lock); list_replace_init(&tmp_list, &vn->pool[i].head); @@ -2257,16 +2275,14 @@ struct vmap_area *find_vmap_area(unsigned long addr) * An addr_to_node_id(addr) converts an address to a node index * where a VA is located. If VA spans several zones and passed * addr is not the same as va->va_start, what is not common, we - * may need to scan an extra nodes. See an example: + * may need to scan extra nodes. See an example: * - * <--va--> + * <----va----> * -|-----|-----|-----|-----|- * 1 2 0 1 * - * VA resides in node 1 whereas it spans 1 and 2. If passed - * addr is within a second node we should do extra work. We - * should mention that it is rare and is a corner case from - * the other hand it has to be covered. + * VA resides in node 1 whereas it spans 1, 2 an 0. If passed + * addr is within 2 or 0 nodes we should do extra work. */ i = j = addr_to_node_id(addr); do { @@ -2289,6 +2305,9 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr) struct vmap_area *va; int i, j; + /* + * Check the comment in the find_vmap_area() about the loop. + */ i = j = addr_to_node_id(addr); do { vn = &vmap_nodes[i]; @@ -4882,7 +4901,20 @@ static void vmap_init_nodes(void) int i, n; #if BITS_PER_LONG == 64 - /* A high threshold of max nodes is fixed and bound to 128. */ + /* + * A high threshold of max nodes is fixed and bound to 128, + * thus a scale factor is 1 for systems where number of cores + * are less or equal to specified threshold. + * + * As for NUMA-aware notes. For bigger systems, for example + * NUMA with multi-sockets, where we can end-up with thousands + * of cores in total, a "sub-numa-clustering" should be added. + * + * In this case a NUMA domain is considered as a single entity + * with dedicated sub-nodes in it which describe one group or + * set of cores. Therefore a per-domain purging is supposed to + * be added as well as a per-domain balancing. + */ n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); if (n > 1) { From 8be4d46e12af32342569840d958272dbb3be3f4c Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 24 Jan 2024 19:09:20 +0100 Subject: [PATCH 253/435] mm: vmalloc: refactor vmalloc_dump_obj() function This patch tends to simplify the function in question, by removing an extra stack "objp" variable, returning back to an early exit approach if spin_trylock() fails or VA was not found. Link: https://lkml.kernel.org/r/20240124180920.50725-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Lorenzo Stoakes Cc: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 568f6c0b1fb5..25a8df497255 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4696,34 +4696,35 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) #ifdef CONFIG_PRINTK bool vmalloc_dump_obj(void *object) { - void *objp = (void *)PAGE_ALIGN((unsigned long)object); const void *caller; + struct vm_struct *vm; struct vmap_area *va; struct vmap_node *vn; unsigned long addr; unsigned int nr_pages; - bool success = false; - vn = addr_to_node((unsigned long)objp); + addr = PAGE_ALIGN((unsigned long) object); + vn = addr_to_node(addr); - if (spin_trylock(&vn->busy.lock)) { - va = __find_vmap_area((unsigned long)objp, &vn->busy.root); - - if (va && va->vm) { - addr = (unsigned long)va->vm->addr; - caller = va->vm->caller; - nr_pages = va->vm->nr_pages; - success = true; - } + if (!spin_trylock(&vn->busy.lock)) + return false; + va = __find_vmap_area(addr, &vn->busy.root); + if (!va || !va->vm) { spin_unlock(&vn->busy.lock); + return false; } - if (success) - pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", - nr_pages, addr, caller); + vm = va->vm; + addr = (unsigned long) vm->addr; + caller = vm->caller; + nr_pages = vm->nr_pages; + spin_unlock(&vn->busy.lock); - return success; + pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", + nr_pages, addr, caller); + + return true; } #endif From 85fcde402db191b5f222ebfecda653777d7d084e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:41 +0800 Subject: [PATCH 254/435] kexec: split crashkernel reservation code out from crash_core.c Patch series "Split crash out from kexec and clean up related config items", v3. Motivation: ============= Previously, LKP reported a building error. When investigating, it can't be resolved reasonablly with the present messy kdump config items. https://lore.kernel.org/oe-kbuild-all/202312182200.Ka7MzifQ-lkp@intel.com/ The kdump (crash dumping) related config items could causes confusions: Firstly, CRASH_CORE enables codes including - crashkernel reservation; - elfcorehdr updating; - vmcoreinfo exporting; - crash hotplug handling; Now fadump of powerpc, kcore dynamic debugging and kdump all selects CRASH_CORE, while fadump - fadump needs crashkernel parsing, vmcoreinfo exporting, and accessing global variable 'elfcorehdr_addr'; - kcore only needs vmcoreinfo exporting; - kdump needs all of the current kernel/crash_core.c. So only enabling PROC_CORE or FA_DUMP will enable CRASH_CORE, this mislead people that we enable crash dumping, actual it's not. Secondly, It's not reasonable to allow KEXEC_CORE select CRASH_CORE. Because KEXEC_CORE enables codes which allocate control pages, copy kexec/kdump segments, and prepare for switching. These codes are shared by both kexec reboot and kdump. We could want kexec reboot, but disable kdump. In that case, CRASH_CORE should not be selected. -------------------- CONFIG_CRASH_CORE=y CONFIG_KEXEC_CORE=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y --------------------- Thirdly, It's not reasonable to allow CRASH_DUMP select KEXEC_CORE. That could make KEXEC_CORE, CRASH_DUMP are enabled independently from KEXEC or KEXEC_FILE. However, w/o KEXEC or KEXEC_FILE, the KEXEC_CORE code built in doesn't make any sense because no kernel loading or switching will happen to utilize the KEXEC_CORE code. --------------------- CONFIG_CRASH_CORE=y CONFIG_KEXEC_CORE=y CONFIG_CRASH_DUMP=y --------------------- In this case, what is worse, on arch sh and arm, KEXEC relies on MMU, while CRASH_DUMP can still be enabled when !MMU, then compiling error is seen as the lkp test robot reported in above link. ------arch/sh/Kconfig------ config ARCH_SUPPORTS_KEXEC def_bool MMU config ARCH_SUPPORTS_CRASH_DUMP def_bool BROKEN_ON_SMP --------------------------- Changes: =========== 1, split out crash_reserve.c from crash_core.c; 2, split out vmcore_infoc. from crash_core.c; 3, move crash related codes in kexec_core.c into crash_core.c; 4, remove dependency of FA_DUMP on CRASH_DUMP; 5, clean up kdump related config items; 6, wrap up crash codes in crash related ifdefs on all 8 arch-es which support crash dumping, except of ppc; Achievement: =========== With above changes, I can rearrange the config item logic as below (the right item depends on or is selected by the left item): PROC_KCORE -----------> VMCORE_INFO |----------> VMCORE_INFO FA_DUMP----| |----------> CRASH_RESERVE ---->VMCORE_INFO / |---->CRASH_RESERVE KEXEC --| /| |--> KEXEC_CORE--> CRASH_DUMP-->/-|---->PROC_VMCORE KEXEC_FILE --| \ | \---->CRASH_HOTPLUG KEXEC --| |--> KEXEC_CORE (for kexec reboot only) KEXEC_FILE --| Test ======== On all 8 architectures, including x86_64, arm64, s390x, sh, arm, mips, riscv, loongarch, I did below three cases of config item setting and building all passed. Take configs on x86_64 as exampmle here: (1) Both CONFIG_KEXEC and KEXEC_FILE is unset, then all kexec/kdump items are unset automatically: # Kexec and crash features # CONFIG_KEXEC is not set # CONFIG_KEXEC_FILE is not set # end of Kexec and crash features (2) set CONFIG_KEXEC_FILE and 'make olddefconfig': --------------- # Kexec and crash features CONFIG_CRASH_RESERVE=y CONFIG_VMCORE_INFO=y CONFIG_KEXEC_CORE=y CONFIG_KEXEC_FILE=y CONFIG_CRASH_DUMP=y CONFIG_CRASH_HOTPLUG=y CONFIG_CRASH_MAX_MEMORY_RANGES=8192 # end of Kexec and crash features --------------- (3) unset CONFIG_CRASH_DUMP in case 2 and execute 'make olddefconfig': ------------------------ # Kexec and crash features CONFIG_KEXEC_CORE=y CONFIG_KEXEC_FILE=y # end of Kexec and crash features ------------------------ Note: For ppc, it needs investigation to make clear how to split out crash code in arch folder. Hope Hari and Pingfan can help have a look, see if it's doable. Now, I make it either have both kexec and crash enabled, or disable both of them altogether. This patch (of 14): Both kdump and fa_dump of ppc rely on crashkernel reservation. Move the relevant codes into separate files: crash_reserve.c, include/linux/crash_reserve.h. And also add config item CRASH_RESERVE to control its enabling of the codes. And update config items which has relationship with crashkernel reservation. And also change ifdeffery from CONFIG_CRASH_CORE to CONFIG_CRASH_RESERVE when those scopes are only crashkernel reservation related. And also rename arch/XXX/include/asm/{crash_core.h => crash_reserve.h} on arm64, x86 and risc-v because those architectures' crash_core.h is only related to crashkernel reservation. [akpm@linux-foundation.org: s/CRASH_RESEERVE/CRASH_RESERVE/, per Klara Modin] Link: https://lkml.kernel.org/r/20240124051254.67105-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20240124051254.67105-2-bhe@redhat.com Signed-off-by: Baoquan He Acked-by: Hari Bathini Cc: Al Viro Cc: Eric W. Biederman Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 2 +- .../asm/{crash_core.h => crash_reserve.h} | 4 +- arch/powerpc/Kconfig | 1 + arch/powerpc/mm/nohash/kaslr_booke.c | 4 +- arch/riscv/Kconfig | 2 +- .../asm/{crash_core.h => crash_reserve.h} | 4 +- arch/x86/Kconfig | 2 +- .../asm/{crash_core.h => crash_reserve.h} | 6 +- include/linux/crash_core.h | 40 -- include/linux/crash_reserve.h | 48 ++ include/linux/kexec.h | 1 + kernel/Kconfig.kexec | 5 +- kernel/Makefile | 1 + kernel/crash_core.c | 438 ----------------- kernel/crash_reserve.c | 464 ++++++++++++++++++ 15 files changed, 531 insertions(+), 491 deletions(-) rename arch/arm64/include/asm/{crash_core.h => crash_reserve.h} (81%) rename arch/riscv/include/asm/{crash_core.h => crash_reserve.h} (78%) rename arch/x86/include/asm/{crash_core.h => crash_reserve.h} (92%) create mode 100644 include/linux/crash_reserve.h create mode 100644 kernel/crash_reserve.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 386566138620..5a7ac1f37bdc 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1519,7 +1519,7 @@ config ARCH_SUPPORTS_CRASH_DUMP def_bool y config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - def_bool CRASH_CORE + def_bool CRASH_RESERVE config TRANS_TABLE def_bool y diff --git a/arch/arm64/include/asm/crash_core.h b/arch/arm64/include/asm/crash_reserve.h similarity index 81% rename from arch/arm64/include/asm/crash_core.h rename to arch/arm64/include/asm/crash_reserve.h index 9f5c8d339f44..4afe027a4e7b 100644 --- a/arch/arm64/include/asm/crash_core.h +++ b/arch/arm64/include/asm/crash_reserve.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _ARM64_CRASH_CORE_H -#define _ARM64_CRASH_CORE_H +#ifndef _ARM64_CRASH_RESERVE_H +#define _ARM64_CRASH_RESERVE_H /* Current arm64 boot protocol requires 2MB alignment */ #define CRASH_ALIGN SZ_2M diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b9fc064d38d2..7f704ae5c5ef 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -691,6 +691,7 @@ config FA_DUMP bool "Firmware-assisted dump" depends on PPC64 && (PPC_RTAS || PPC_POWERNV) select CRASH_CORE + select CRASH_RESERVE select CRASH_DUMP help A robust mechanism to get reliable kernel crash dump with diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c index b4f2786a7d2b..cdff129abb14 100644 --- a/arch/powerpc/mm/nohash/kaslr_booke.c +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -173,7 +173,7 @@ static __init bool overlaps_region(const void *fdt, u32 start, static void __init get_crash_kernel(void *fdt, unsigned long size) { -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_CRASH_RESERVE unsigned long long crash_size, crash_base; int ret; diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bffbd869a068..bd06ad1bb97c 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -767,7 +767,7 @@ config ARCH_SUPPORTS_CRASH_DUMP def_bool y config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - def_bool CRASH_CORE + def_bool CRASH_RESERVE config COMPAT bool "Kernel support for 32-bit U-mode" diff --git a/arch/riscv/include/asm/crash_core.h b/arch/riscv/include/asm/crash_reserve.h similarity index 78% rename from arch/riscv/include/asm/crash_core.h rename to arch/riscv/include/asm/crash_reserve.h index e1874b23feaf..013962e63587 100644 --- a/arch/riscv/include/asm/crash_core.h +++ b/arch/riscv/include/asm/crash_reserve.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _RISCV_CRASH_CORE_H -#define _RISCV_CRASH_CORE_H +#ifndef _RISCV_CRASH_RESERVE_H +#define _RISCV_CRASH_RESERVE_H #define CRASH_ALIGN PMD_SIZE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5edec175b9bf..5bd925815154 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2106,7 +2106,7 @@ config ARCH_SUPPORTS_CRASH_HOTPLUG def_bool y config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - def_bool CRASH_CORE + def_bool CRASH_RESERVE config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP) diff --git a/arch/x86/include/asm/crash_core.h b/arch/x86/include/asm/crash_reserve.h similarity index 92% rename from arch/x86/include/asm/crash_core.h rename to arch/x86/include/asm/crash_reserve.h index 76af98f4e801..152239f95541 100644 --- a/arch/x86/include/asm/crash_core.h +++ b/arch/x86/include/asm/crash_reserve.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _X86_CRASH_CORE_H -#define _X86_CRASH_CORE_H +#ifndef _X86_CRASH_RESERVE_H +#define _X86_CRASH_RESERVE_H /* 16M alignment for crash kernel regions */ #define CRASH_ALIGN SZ_16M @@ -39,4 +39,4 @@ static inline unsigned long crash_low_size_default(void) #endif } -#endif /* _X86_CRASH_CORE_H */ +#endif /* _X86_CRASH_RESERVE_H */ diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 9eaeaafe0cad..1fde49246fa6 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -5,14 +5,6 @@ #include #include #include -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -#include -#endif - -/* Location of a reserved region to hold the crash kernel. - */ -extern struct resource crashk_res; -extern struct resource crashk_low_res; #define CRASH_CORE_NOTE_NAME "CORE" #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) @@ -87,38 +79,6 @@ Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); void final_note(Elf_Word *buf); -int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base, - unsigned long long *low_size, bool *high); - -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE -#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) -#endif -#ifndef CRASH_ALIGN -#define CRASH_ALIGN SZ_2M -#endif -#ifndef CRASH_ADDR_LOW_MAX -#define CRASH_ADDR_LOW_MAX SZ_4G -#endif -#ifndef CRASH_ADDR_HIGH_MAX -#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() -#endif - -void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high); -#else -static inline void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high) -{} -#endif - /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h new file mode 100644 index 000000000000..5a9df944fb80 --- /dev/null +++ b/include/linux/crash_reserve.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_CRASH_RESERVE_H +#define LINUX_CRASH_RESERVE_H + +#include +#include +#include +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#include +#endif + +/* Location of a reserved region to hold the crash kernel. + */ +extern struct resource crashk_res; +extern struct resource crashk_low_res; + +int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base, + unsigned long long *low_size, bool *high); + +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE +#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) +#endif +#ifndef CRASH_ALIGN +#define CRASH_ALIGN SZ_2M +#endif +#ifndef CRASH_ADDR_LOW_MAX +#define CRASH_ADDR_LOW_MAX SZ_4G +#endif +#ifndef CRASH_ADDR_HIGH_MAX +#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() +#endif + +void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high); +#else +static inline void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high) +{} +#endif +#endif /* LINUX_CRASH_RESERVE_H */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 400cb6c02176..6d79bfb52e5b 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -16,6 +16,7 @@ #if !defined(__ASSEMBLY__) #include +#include #include #include diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 946dffa048b7..8b7be71edd85 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -2,11 +2,15 @@ menu "Kexec and crash features" +config CRASH_RESERVE + bool + config CRASH_CORE bool config KEXEC_CORE select CRASH_CORE + select CRASH_RESERVE bool config KEXEC_ELF @@ -96,7 +100,6 @@ config KEXEC_JUMP config CRASH_DUMP bool "kernel crash dumps" depends on ARCH_SUPPORTS_CRASH_DUMP - select CRASH_CORE select KEXEC_CORE help Generate crash dump after being started by kexec. diff --git a/kernel/Makefile b/kernel/Makefile index ce105a5558fc..05fa88b3ab74 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 49b31e59d3cc..ae0d1ce89b46 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -34,444 +34,6 @@ u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ static unsigned char *vmcoreinfo_data_safecopy; -/* Location of the reserved area for the crash kernel */ -struct resource crashk_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, - .desc = IORES_DESC_CRASH_KERNEL -}; -struct resource crashk_low_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, - .desc = IORES_DESC_CRASH_KERNEL -}; - -/* - * parsing the "crashkernel" commandline - * - * this code is intended to be called from architecture specific code - */ - - -/* - * This function parses command lines in the format - * - * crashkernel=ramsize-range:size[,...][@offset] - * - * The function returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline, *tmp; - unsigned long long total_mem = system_ram; - - /* - * Firmware sometimes reserves some memory regions for its own use, - * so the system memory size is less than the actual physical memory - * size. Work around this by rounding up the total size to 128M, - * which is enough for most test cases. - */ - total_mem = roundup(total_mem, SZ_128M); - - /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= total_mem) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (total_mem >= start && total_mem < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); - - if (*crash_size > 0) { - while (*cur && *cur != ' ' && *cur != '@') - cur++; - if (*cur == '@') { - cur++; - *crash_base = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected after '@'\n"); - return -EINVAL; - } - } - } else - pr_info("crashkernel size resulted in zero bytes\n"); - - return 0; -} - -/* - * That function parses "simple" (old) crashkernel command lines like - * - * crashkernel=size[@offset] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - if (*cur == '@') - *crash_base = memparse(cur+1, &cur); - else if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -#define SUFFIX_HIGH 0 -#define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 -static __initdata char *suffix_tbl[] = { - [SUFFIX_HIGH] = ",high", - [SUFFIX_LOW] = ",low", - [SUFFIX_NULL] = NULL, -}; - -/* - * That function parses "suffix" crashkernel command lines like - * - * crashkernel=size,[high|low] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_suffix(char *cmdline, - unsigned long long *crash_size, - const char *suffix) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - /* check with suffix */ - if (strncmp(cur, suffix, strlen(suffix))) { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - cur += strlen(suffix); - if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -static __init char *get_last_crashkernel(char *cmdline, - const char *name, - const char *suffix) -{ - char *p = cmdline, *ck_cmdline = NULL; - - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - char *end_p = strchr(p, ' '); - char *q; - - if (!end_p) - end_p = p + strlen(p); - - if (!suffix) { - int i; - - /* skip the one with any known suffix */ - for (i = 0; suffix_tbl[i]; i++) { - q = end_p - strlen(suffix_tbl[i]); - if (!strncmp(q, suffix_tbl[i], - strlen(suffix_tbl[i]))) - goto next; - } - ck_cmdline = p; - } else { - q = end_p - strlen(suffix); - if (!strncmp(q, suffix, strlen(suffix))) - ck_cmdline = p; - } -next: - p = strstr(p+1, name); - } - - return ck_cmdline; -} - -static int __init __parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - const char *suffix) -{ - char *first_colon, *first_space; - char *ck_cmdline; - char *name = "crashkernel="; - - BUG_ON(!crash_size || !crash_base); - *crash_size = 0; - *crash_base = 0; - - ck_cmdline = get_last_crashkernel(cmdline, name, suffix); - if (!ck_cmdline) - return -ENOENT; - - ck_cmdline += strlen(name); - - if (suffix) - return parse_crashkernel_suffix(ck_cmdline, crash_size, - suffix); - /* - * if the commandline contains a ':', then that's the extended - * syntax -- if not, it must be the classic syntax - */ - first_colon = strchr(ck_cmdline, ':'); - first_space = strchr(ck_cmdline, ' '); - if (first_colon && (!first_space || first_colon < first_space)) - return parse_crashkernel_mem(ck_cmdline, system_ram, - crash_size, crash_base); - - return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); -} - -/* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. - * - * If crashkernel=,high|low is supported on architecture, non-NULL values - * should be passed to parameters 'low_size' and 'high'. - */ -int __init parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - unsigned long long *low_size, - bool *high) -{ - int ret; - - /* crashkernel=X[@offset] */ - ret = __parse_crashkernel(cmdline, system_ram, crash_size, - crash_base, NULL); -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - /* - * If non-NULL 'high' passed in and no normal crashkernel - * setting detected, try parsing crashkernel=,high|low. - */ - if (high && ret == -ENOENT) { - ret = __parse_crashkernel(cmdline, 0, crash_size, - crash_base, suffix_tbl[SUFFIX_HIGH]); - if (ret || !*crash_size) - return -EINVAL; - - /* - * crashkernel=Y,low can be specified or not, but invalid value - * is not allowed. - */ - ret = __parse_crashkernel(cmdline, 0, low_size, - crash_base, suffix_tbl[SUFFIX_LOW]); - if (ret == -ENOENT) { - *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; - ret = 0; - } else if (ret) { - return ret; - } - - *high = true; - } -#endif - if (!*crash_size) - ret = -EINVAL; - - return ret; -} - -/* - * Add a dummy early_param handler to mark crashkernel= as a known command line - * parameter and suppress incorrect warnings in init/main.c. - */ -static int __init parse_crashkernel_dummy(char *arg) -{ - return 0; -} -early_param("crashkernel", parse_crashkernel_dummy); - -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -static int __init reserve_crashkernel_low(unsigned long long low_size) -{ -#ifdef CONFIG_64BIT - unsigned long long low_base; - - low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); - if (!low_base) { - pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size); - return -ENOMEM; - } - - pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n", - low_base, low_base + low_size, low_size >> 20); - - crashk_low_res.start = low_base; - crashk_low_res.end = low_base + low_size - 1; -#endif - return 0; -} - -void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high) -{ - unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0; - bool fixed_base = false; - - /* User specifies base address explicitly. */ - if (crash_base) { - fixed_base = true; - search_base = crash_base; - search_end = crash_base + crash_size; - } else if (high) { - search_base = CRASH_ADDR_LOW_MAX; - search_end = CRASH_ADDR_HIGH_MAX; - } - -retry: - crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, - search_base, search_end); - if (!crash_base) { - /* - * For crashkernel=size[KMG]@offset[KMG], print out failure - * message if can't reserve the specified region. - */ - if (fixed_base) { - pr_warn("crashkernel reservation failed - memory is in use.\n"); - return; - } - - /* - * For crashkernel=size[KMG], if the first attempt was for - * low memory, fall back to high memory, the minimum required - * low memory will be reserved later. - */ - if (!high && search_end == CRASH_ADDR_LOW_MAX) { - search_end = CRASH_ADDR_HIGH_MAX; - search_base = CRASH_ADDR_LOW_MAX; - crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; - goto retry; - } - - /* - * For crashkernel=size[KMG],high, if the first attempt was - * for high memory, fall back to low memory. - */ - if (high && search_end == CRASH_ADDR_HIGH_MAX) { - search_end = CRASH_ADDR_LOW_MAX; - search_base = 0; - goto retry; - } - pr_warn("cannot allocate crashkernel (size:0x%llx)\n", - crash_size); - return; - } - - if ((crash_base >= CRASH_ADDR_LOW_MAX) && - crash_low_size && reserve_crashkernel_low(crash_low_size)) { - memblock_phys_free(crash_base, crash_size); - return; - } - - pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", - crash_base, crash_base + crash_size, crash_size >> 20); - - /* - * The crashkernel memory will be removed from the kernel linear - * map. Inform kmemleak so that it won't try to access it. - */ - kmemleak_ignore_phys(crash_base); - if (crashk_low_res.end) - kmemleak_ignore_phys(crashk_low_res.start); - - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; -} - -static __init int insert_crashkernel_resources(void) -{ - if (crashk_res.start < crashk_res.end) - insert_resource(&iomem_resource, &crashk_res); - - if (crashk_low_res.start < crashk_low_res.end) - insert_resource(&iomem_resource, &crashk_low_res); - - return 0; -} -early_initcall(insert_crashkernel_resources); -#endif - int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz) { diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c new file mode 100644 index 000000000000..bbb6c3cb00e4 --- /dev/null +++ b/kernel/crash_reserve.c @@ -0,0 +1,464 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * crash.c - kernel crash support code. + * Copyright (C) 2002-2004 Eric Biederman + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "kallsyms_internal.h" +#include "kexec_internal.h" + +/* Location of the reserved area for the crash kernel */ +struct resource crashk_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL +}; +struct resource crashk_low_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL +}; + +/* + * parsing the "crashkernel" commandline + * + * this code is intended to be called from architecture specific code + */ + + +/* + * This function parses command lines in the format + * + * crashkernel=ramsize-range:size[,...][@offset] + * + * The function returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline, *tmp; + unsigned long long total_mem = system_ram; + + /* + * Firmware sometimes reserves some memory regions for its own use, + * so the system memory size is less than the actual physical memory + * size. Work around this by rounding up the total size to 128M, + * which is enough for most test cases. + */ + total_mem = roundup(total_mem, SZ_128M); + + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (*cur != '-') { + pr_warn("crashkernel: '-' expected\n"); + return -EINVAL; + } + cur++; + + /* if no ':' is here, than we read the end */ + if (*cur != ':') { + end = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (end <= start) { + pr_warn("crashkernel: end <= start\n"); + return -EINVAL; + } + } + + if (*cur != ':') { + pr_warn("crashkernel: ':' expected\n"); + return -EINVAL; + } + cur++; + + size = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (size >= total_mem) { + pr_warn("crashkernel: invalid size\n"); + return -EINVAL; + } + + /* match ? */ + if (total_mem >= start && total_mem < end) { + *crash_size = size; + break; + } + } while (*cur++ == ','); + + if (*crash_size > 0) { + while (*cur && *cur != ' ' && *cur != '@') + cur++; + if (*cur == '@') { + cur++; + *crash_base = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected after '@'\n"); + return -EINVAL; + } + } + } else + pr_info("crashkernel size resulted in zero bytes\n"); + + return 0; +} + +/* + * That function parses "simple" (old) crashkernel command lines like + * + * crashkernel=size[@offset] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + if (*cur == '@') + *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + +/* + * That function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + return ck_cmdline; +} + +static int __init __parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *suffix) +{ + char *first_colon, *first_space; + char *ck_cmdline; + char *name = "crashkernel="; + + BUG_ON(!crash_size || !crash_base); + *crash_size = 0; + *crash_base = 0; + + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); + if (!ck_cmdline) + return -ENOENT; + + ck_cmdline += strlen(name); + + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + suffix); + /* + * if the commandline contains a ':', then that's the extended + * syntax -- if not, it must be the classic syntax + */ + first_colon = strchr(ck_cmdline, ':'); + first_space = strchr(ck_cmdline, ' '); + if (first_colon && (!first_space || first_colon < first_space)) + return parse_crashkernel_mem(ck_cmdline, system_ram, + crash_size, crash_base); + + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); +} + +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. + * + * If crashkernel=,high|low is supported on architecture, non-NULL values + * should be passed to parameters 'low_size' and 'high'. + */ +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + unsigned long long *low_size, + bool *high) +{ + int ret; + + /* crashkernel=X[@offset] */ + ret = __parse_crashkernel(cmdline, system_ram, crash_size, + crash_base, NULL); +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION + /* + * If non-NULL 'high' passed in and no normal crashkernel + * setting detected, try parsing crashkernel=,high|low. + */ + if (high && ret == -ENOENT) { + ret = __parse_crashkernel(cmdline, 0, crash_size, + crash_base, suffix_tbl[SUFFIX_HIGH]); + if (ret || !*crash_size) + return -EINVAL; + + /* + * crashkernel=Y,low can be specified or not, but invalid value + * is not allowed. + */ + ret = __parse_crashkernel(cmdline, 0, low_size, + crash_base, suffix_tbl[SUFFIX_LOW]); + if (ret == -ENOENT) { + *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; + ret = 0; + } else if (ret) { + return ret; + } + + *high = true; + } +#endif + if (!*crash_size) + ret = -EINVAL; + + return ret; +} + +/* + * Add a dummy early_param handler to mark crashkernel= as a known command line + * parameter and suppress incorrect warnings in init/main.c. + */ +static int __init parse_crashkernel_dummy(char *arg) +{ + return 0; +} +early_param("crashkernel", parse_crashkernel_dummy); + +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +static int __init reserve_crashkernel_low(unsigned long long low_size) +{ +#ifdef CONFIG_64BIT + unsigned long long low_base; + + low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); + if (!low_base) { + pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size); + return -ENOMEM; + } + + pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n", + low_base, low_base + low_size, low_size >> 20); + + crashk_low_res.start = low_base; + crashk_low_res.end = low_base + low_size - 1; + insert_resource(&iomem_resource, &crashk_low_res); +#endif + return 0; +} + +void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high) +{ + unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0; + bool fixed_base = false; + + /* User specifies base address explicitly. */ + if (crash_base) { + fixed_base = true; + search_base = crash_base; + search_end = crash_base + crash_size; + } else if (high) { + search_base = CRASH_ADDR_LOW_MAX; + search_end = CRASH_ADDR_HIGH_MAX; + } + +retry: + crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, + search_base, search_end); + if (!crash_base) { + /* + * For crashkernel=size[KMG]@offset[KMG], print out failure + * message if can't reserve the specified region. + */ + if (fixed_base) { + pr_warn("crashkernel reservation failed - memory is in use.\n"); + return; + } + + /* + * For crashkernel=size[KMG], if the first attempt was for + * low memory, fall back to high memory, the minimum required + * low memory will be reserved later. + */ + if (!high && search_end == CRASH_ADDR_LOW_MAX) { + search_end = CRASH_ADDR_HIGH_MAX; + search_base = CRASH_ADDR_LOW_MAX; + crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; + goto retry; + } + + /* + * For crashkernel=size[KMG],high, if the first attempt was + * for high memory, fall back to low memory. + */ + if (high && search_end == CRASH_ADDR_HIGH_MAX) { + search_end = CRASH_ADDR_LOW_MAX; + search_base = 0; + goto retry; + } + pr_warn("cannot allocate crashkernel (size:0x%llx)\n", + crash_size); + return; + } + + if ((crash_base >= CRASH_ADDR_LOW_MAX) && + crash_low_size && reserve_crashkernel_low(crash_low_size)) { + memblock_phys_free(crash_base, crash_size); + return; + } + + pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", + crash_base, crash_base + crash_size, crash_size >> 20); + + /* + * The crashkernel memory will be removed from the kernel linear + * map. Inform kmemleak so that it won't try to access it. + */ + kmemleak_ignore_phys(crash_base); + if (crashk_low_res.end) + kmemleak_ignore_phys(crashk_low_res.start); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; +} + +static __init int insert_crashkernel_resources(void) +{ + if (crashk_res.start < crashk_res.end) + insert_resource(&iomem_resource, &crashk_res); + + if (crashk_low_res.start < crashk_low_res.end) + insert_resource(&iomem_resource, &crashk_low_res); + + return 0; +} +early_initcall(insert_crashkernel_resources); +#endif From 443cbaf9e2fdbef7d7cae457434a6cb8a679441b Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:42 +0800 Subject: [PATCH 255/435] crash: split vmcoreinfo exporting code out from crash_core.c Now move the relevant codes into separate files: kernel/crash_reserve.c, include/linux/crash_reserve.h. And add config item CRASH_RESERVE to control its enabling. And also update the old ifdeffery of CONFIG_CRASH_CORE, including of and config item dependency on CRASH_CORE accordingly. And also do renaming as follows: - arch/xxx/kernel/{crash_core.c => vmcore_info.c} because they are only related to vmcoreinfo exporting on x86, arm64, riscv. And also Remove config item CRASH_CORE, and rely on CONFIG_KEXEC_CORE to decide if build in crash_core.c. [yang.lee@linux.alibaba.com: remove duplicated include in vmcore_info.c] Link: https://lkml.kernel.org/r/20240126005744.16561-1-yang.lee@linux.alibaba.com Link: https://lkml.kernel.org/r/20240124051254.67105-3-bhe@redhat.com Signed-off-by: Baoquan He Signed-off-by: Yang Li Acked-by: Hari Bathini Cc: Al Viro Cc: Eric W. Biederman Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/arm64/kernel/Makefile | 2 +- .../kernel/{crash_core.c => vmcore_info.c} | 2 +- arch/powerpc/Kconfig | 2 +- arch/powerpc/kernel/setup-common.c | 2 +- arch/powerpc/platforms/powernv/opal-core.c | 2 +- arch/riscv/kernel/Makefile | 2 +- .../kernel/{crash_core.c => vmcore_info.c} | 2 +- arch/x86/kernel/Makefile | 2 +- .../{crash_core_32.c => vmcore_info_32.c} | 2 +- .../{crash_core_64.c => vmcore_info_64.c} | 2 +- drivers/firmware/qemu_fw_cfg.c | 14 +- fs/proc/Kconfig | 2 +- fs/proc/kcore.c | 2 +- include/linux/buildid.h | 2 +- include/linux/crash_core.h | 73 ------ include/linux/kexec.h | 1 + include/linux/vmcore_info.h | 81 ++++++ kernel/Kconfig.kexec | 4 +- kernel/Makefile | 4 +- kernel/crash_core.c | 206 ---------------- kernel/ksysfs.c | 6 +- kernel/printk/printk.c | 4 +- kernel/vmcore_info.c | 230 ++++++++++++++++++ lib/buildid.c | 2 +- 24 files changed, 342 insertions(+), 309 deletions(-) rename arch/arm64/kernel/{crash_core.c => vmcore_info.c} (97%) rename arch/riscv/kernel/{crash_core.c => vmcore_info.c} (96%) rename arch/x86/kernel/{crash_core_32.c => vmcore_info_32.c} (90%) rename arch/x86/kernel/{crash_core_64.c => vmcore_info_64.c} (94%) create mode 100644 include/linux/vmcore_info.h create mode 100644 kernel/vmcore_info.c diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 467cb7117273..a3882cccf049 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -66,7 +66,7 @@ obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o -obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o obj-$(CONFIG_ARM64_MTE) += mte.o diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/vmcore_info.c similarity index 97% rename from arch/arm64/kernel/crash_core.c rename to arch/arm64/kernel/vmcore_info.c index 2a24199a9b81..b19d5d6cb8b3 100644 --- a/arch/arm64/kernel/crash_core.c +++ b/arch/arm64/kernel/vmcore_info.c @@ -4,7 +4,7 @@ * Copyright (C) Huawei Futurewei Technologies. */ -#include +#include #include #include #include diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7f704ae5c5ef..495d197c9b27 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -690,7 +690,7 @@ config ARCH_SELECTS_CRASH_DUMP config FA_DUMP bool "Firmware-assisted dump" depends on PPC64 && (PPC_RTAS || PPC_POWERNV) - select CRASH_CORE + select VMCORE_INFO select CRASH_RESERVE select CRASH_DUMP help diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 9b142b9d5187..733f210ffda1 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -109,7 +109,7 @@ int ppc_do_canonicalize_irqs; EXPORT_SYMBOL(ppc_do_canonicalize_irqs); #endif -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO /* This keeps a track of which one is the crashing cpu. */ int crashing_cpu = -1; #endif diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c index bb7657115f1d..c9a9b759cc92 100644 --- a/arch/powerpc/platforms/powernv/opal-core.c +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index f71910718053..d6fd8dcfceb5 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -92,7 +92,7 @@ obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_KEXEC_CORE) += kexec_relocate.o crash_save_regs.o machine_kexec.o obj-$(CONFIG_KEXEC_FILE) += elf_kexec.o machine_kexec_file.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o -obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o diff --git a/arch/riscv/kernel/crash_core.c b/arch/riscv/kernel/vmcore_info.c similarity index 96% rename from arch/riscv/kernel/crash_core.c rename to arch/riscv/kernel/vmcore_info.c index d18d529fd9b9..6d7a22522d63 100644 --- a/arch/riscv/kernel/crash_core.c +++ b/arch/riscv/kernel/vmcore_info.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0000325ab98f..913d4022131e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -98,7 +98,7 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_RETHOOK) += rethook.o -obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o diff --git a/arch/x86/kernel/crash_core_32.c b/arch/x86/kernel/vmcore_info_32.c similarity index 90% rename from arch/x86/kernel/crash_core_32.c rename to arch/x86/kernel/vmcore_info_32.c index 8a89c109e20a..5995a749288a 100644 --- a/arch/x86/kernel/crash_core_32.c +++ b/arch/x86/kernel/vmcore_info_32.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include #include diff --git a/arch/x86/kernel/crash_core_64.c b/arch/x86/kernel/vmcore_info_64.c similarity index 94% rename from arch/x86/kernel/crash_core_64.c rename to arch/x86/kernel/vmcore_info_64.c index 7d255f882afe..0dec7d868754 100644 --- a/arch/x86/kernel/crash_core_64.c +++ b/arch/x86/kernel/vmcore_info_64.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include #include diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c index 03da9a4354f8..5f43dfa22f79 100644 --- a/drivers/firmware/qemu_fw_cfg.c +++ b/drivers/firmware/qemu_fw_cfg.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include MODULE_AUTHOR("Gabriel L. Somlo "); MODULE_DESCRIPTION("QEMU fw_cfg sysfs support"); @@ -67,7 +67,7 @@ static void fw_cfg_sel_endianness(u16 key) iowrite16(key, fw_cfg_reg_ctrl); } -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO static inline bool fw_cfg_dma_enabled(void) { return (fw_cfg_rev & FW_CFG_VERSION_DMA) && fw_cfg_reg_dma; @@ -156,7 +156,7 @@ static ssize_t fw_cfg_read_blob(u16 key, return count; } -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO /* write chunk of given fw_cfg blob (caller responsible for sanity-check) */ static ssize_t fw_cfg_write_blob(u16 key, void *buf, loff_t pos, size_t count) @@ -195,7 +195,7 @@ end: return ret; } -#endif /* CONFIG_CRASH_CORE */ +#endif /* CONFIG_VMCORE_INFO */ /* clean up fw_cfg device i/o */ static void fw_cfg_io_cleanup(void) @@ -319,7 +319,7 @@ struct fw_cfg_sysfs_entry { struct list_head list; }; -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO static ssize_t fw_cfg_write_vmcoreinfo(const struct fw_cfg_file *f) { static struct fw_cfg_vmcoreinfo *data; @@ -343,7 +343,7 @@ static ssize_t fw_cfg_write_vmcoreinfo(const struct fw_cfg_file *f) kfree(data); return ret; } -#endif /* CONFIG_CRASH_CORE */ +#endif /* CONFIG_VMCORE_INFO */ /* get fw_cfg_sysfs_entry from kobject member */ static inline struct fw_cfg_sysfs_entry *to_entry(struct kobject *kobj) @@ -583,7 +583,7 @@ static int fw_cfg_register_file(const struct fw_cfg_file *f) int err; struct fw_cfg_sysfs_entry *entry; -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO if (fw_cfg_dma_enabled() && strcmp(f->name, FW_CFG_VMCOREINFO_FILENAME) == 0 && !is_kdump_kernel()) { diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 32b1116ae137..d80a1431ef7b 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -32,7 +32,7 @@ config PROC_FS config PROC_KCORE bool "/proc/kcore support" if !ARM depends on PROC_FS && MMU - select CRASH_CORE + select VMCORE_INFO help Provides a virtual ELF core file of the live kernel. This can be read with gdb and other ELF tools. No modifications can be diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 6422e569b080..8e08a9a1b7ed 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -10,7 +10,7 @@ * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar */ -#include +#include #include #include #include diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 8a582d242f06..20aa3c2d89f7 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -11,7 +11,7 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size); int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size); -#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_CRASH_CORE) +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO) extern unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX]; void init_vmlinux_build_id(void); #else diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 1fde49246fa6..7f19f62018ef 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -6,79 +6,6 @@ #include #include -#define CRASH_CORE_NOTE_NAME "CORE" -#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) -#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) -#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) - -/* - * The per-cpu notes area is a list of notes terminated by a "NULL" - * note header. For kdump, the code in vmcore.c runs in the context - * of the second kernel to combine them into one note. - */ -#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ - CRASH_CORE_NOTE_NAME_BYTES + \ - CRASH_CORE_NOTE_DESC_BYTES) - -#define VMCOREINFO_BYTES PAGE_SIZE -#define VMCOREINFO_NOTE_NAME "VMCOREINFO" -#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) -#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ - VMCOREINFO_NOTE_NAME_BYTES + \ - VMCOREINFO_BYTES) - -typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; -/* Per cpu memory for storing cpu states in case of system crash. */ -extern note_buf_t __percpu *crash_notes; - -void crash_update_vmcoreinfo_safecopy(void *ptr); -void crash_save_vmcoreinfo(void); -void arch_crash_save_vmcoreinfo(void); -__printf(1, 2) -void vmcoreinfo_append_str(const char *fmt, ...); -phys_addr_t paddr_vmcoreinfo_note(void); - -#define VMCOREINFO_OSRELEASE(value) \ - vmcoreinfo_append_str("OSRELEASE=%s\n", value) -#define VMCOREINFO_BUILD_ID() \ - ({ \ - static_assert(sizeof(vmlinux_build_id) == 20); \ - vmcoreinfo_append_str("BUILD-ID=%20phN\n", vmlinux_build_id); \ - }) - -#define VMCOREINFO_PAGESIZE(value) \ - vmcoreinfo_append_str("PAGESIZE=%ld\n", value) -#define VMCOREINFO_SYMBOL(name) \ - vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) -#define VMCOREINFO_SYMBOL_ARRAY(name) \ - vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name) -#define VMCOREINFO_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(name)) -#define VMCOREINFO_STRUCT_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(struct name)) -#define VMCOREINFO_OFFSET(name, field) \ - vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ - (unsigned long)offsetof(struct name, field)) -#define VMCOREINFO_TYPE_OFFSET(name, field) \ - vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ - (unsigned long)offsetof(name, field)) -#define VMCOREINFO_LENGTH(name, value) \ - vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) -#define VMCOREINFO_NUMBER(name) \ - vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) -#define VMCOREINFO_CONFIG(name) \ - vmcoreinfo_append_str("CONFIG_%s=y\n", #name) - -extern unsigned char *vmcoreinfo_data; -extern size_t vmcoreinfo_size; -extern u32 *vmcoreinfo_note; - -Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, - void *data, size_t data_len); -void final_note(Elf_Word *buf); - /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 6d79bfb52e5b..9c7bb8b56ed6 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -16,6 +16,7 @@ #if !defined(__ASSEMBLY__) #include +#include #include #include #include diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h new file mode 100644 index 000000000000..e1dec1a6a749 --- /dev/null +++ b/include/linux/vmcore_info.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_VMCORE_INFO_H +#define LINUX_VMCORE_INFO_H + +#include +#include +#include + +#define CRASH_CORE_NOTE_NAME "CORE" +#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) +#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) +#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) + +/* + * The per-cpu notes area is a list of notes terminated by a "NULL" + * note header. For kdump, the code in vmcore.c runs in the context + * of the second kernel to combine them into one note. + */ +#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + CRASH_CORE_NOTE_NAME_BYTES + \ + CRASH_CORE_NOTE_DESC_BYTES) + +#define VMCOREINFO_BYTES PAGE_SIZE +#define VMCOREINFO_NOTE_NAME "VMCOREINFO" +#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) +#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + VMCOREINFO_NOTE_NAME_BYTES + \ + VMCOREINFO_BYTES) + +typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; +/* Per cpu memory for storing cpu states in case of system crash. */ +extern note_buf_t __percpu *crash_notes; + +void crash_update_vmcoreinfo_safecopy(void *ptr); +void crash_save_vmcoreinfo(void); +void arch_crash_save_vmcoreinfo(void); +__printf(1, 2) +void vmcoreinfo_append_str(const char *fmt, ...); +phys_addr_t paddr_vmcoreinfo_note(void); + +#define VMCOREINFO_OSRELEASE(value) \ + vmcoreinfo_append_str("OSRELEASE=%s\n", value) +#define VMCOREINFO_BUILD_ID() \ + ({ \ + static_assert(sizeof(vmlinux_build_id) == 20); \ + vmcoreinfo_append_str("BUILD-ID=%20phN\n", vmlinux_build_id); \ + }) + +#define VMCOREINFO_PAGESIZE(value) \ + vmcoreinfo_append_str("PAGESIZE=%ld\n", value) +#define VMCOREINFO_SYMBOL(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) +#define VMCOREINFO_SYMBOL_ARRAY(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name) +#define VMCOREINFO_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(name)) +#define VMCOREINFO_STRUCT_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(struct name)) +#define VMCOREINFO_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ + (unsigned long)offsetof(struct name, field)) +#define VMCOREINFO_TYPE_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ + (unsigned long)offsetof(name, field)) +#define VMCOREINFO_LENGTH(name, value) \ + vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) +#define VMCOREINFO_NUMBER(name) \ + vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) +#define VMCOREINFO_CONFIG(name) \ + vmcoreinfo_append_str("CONFIG_%s=y\n", #name) + +extern unsigned char *vmcoreinfo_data; +extern size_t vmcoreinfo_size; +extern u32 *vmcoreinfo_note; + +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len); +void final_note(Elf_Word *buf); +#endif /* LINUX_VMCORE_INFO_H */ diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 8b7be71edd85..8faf27043432 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -5,11 +5,11 @@ menu "Kexec and crash features" config CRASH_RESERVE bool -config CRASH_CORE +config VMCORE_INFO bool config KEXEC_CORE - select CRASH_CORE + select VMCORE_INFO select CRASH_RESERVE bool diff --git a/kernel/Makefile b/kernel/Makefile index 05fa88b3ab74..649272a1d6b9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -68,9 +68,9 @@ obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o -obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o -obj-$(CONFIG_KEXEC_CORE) += kexec_core.o +obj-$(CONFIG_KEXEC_CORE) += kexec_core.o crash_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o diff --git a/kernel/crash_core.c b/kernel/crash_core.c index ae0d1ce89b46..2f4df1fe6f7a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -26,14 +26,6 @@ /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; -/* vmcoreinfo stuff */ -unsigned char *vmcoreinfo_data; -size_t vmcoreinfo_size; -u32 *vmcoreinfo_note; - -/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ -static unsigned char *vmcoreinfo_data_safecopy; - int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz) { @@ -195,204 +187,6 @@ int crash_exclude_mem_range(struct crash_mem *mem, return 0; } -Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, - void *data, size_t data_len) -{ - struct elf_note *note = (struct elf_note *)buf; - - note->n_namesz = strlen(name) + 1; - note->n_descsz = data_len; - note->n_type = type; - buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); - memcpy(buf, name, note->n_namesz); - buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); - memcpy(buf, data, data_len); - buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); - - return buf; -} - -void final_note(Elf_Word *buf) -{ - memset(buf, 0, sizeof(struct elf_note)); -} - -static void update_vmcoreinfo_note(void) -{ - u32 *buf = vmcoreinfo_note; - - if (!vmcoreinfo_size) - return; - buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - vmcoreinfo_size); - final_note(buf); -} - -void crash_update_vmcoreinfo_safecopy(void *ptr) -{ - if (ptr) - memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size); - - vmcoreinfo_data_safecopy = ptr; -} - -void crash_save_vmcoreinfo(void) -{ - if (!vmcoreinfo_note) - return; - - /* Use the safe copy to generate vmcoreinfo note if have */ - if (vmcoreinfo_data_safecopy) - vmcoreinfo_data = vmcoreinfo_data_safecopy; - - vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds()); - update_vmcoreinfo_note(); -} - -void vmcoreinfo_append_str(const char *fmt, ...) -{ - va_list args; - char buf[0x50]; - size_t r; - - va_start(args, fmt); - r = vscnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - - r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size); - - memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); - - vmcoreinfo_size += r; - - WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES, - "vmcoreinfo data exceeds allocated size, truncating"); -} - -/* - * provide an empty default implementation here -- architecture - * code may override this - */ -void __weak arch_crash_save_vmcoreinfo(void) -{} - -phys_addr_t __weak paddr_vmcoreinfo_note(void) -{ - return __pa(vmcoreinfo_note); -} -EXPORT_SYMBOL(paddr_vmcoreinfo_note); - -static int __init crash_save_vmcoreinfo_init(void) -{ - vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); - if (!vmcoreinfo_data) { - pr_warn("Memory allocation for vmcoreinfo_data failed\n"); - return -ENOMEM; - } - - vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, - GFP_KERNEL | __GFP_ZERO); - if (!vmcoreinfo_note) { - free_page((unsigned long)vmcoreinfo_data); - vmcoreinfo_data = NULL; - pr_warn("Memory allocation for vmcoreinfo_note failed\n"); - return -ENOMEM; - } - - VMCOREINFO_OSRELEASE(init_uts_ns.name.release); - VMCOREINFO_BUILD_ID(); - VMCOREINFO_PAGESIZE(PAGE_SIZE); - - VMCOREINFO_SYMBOL(init_uts_ns); - VMCOREINFO_OFFSET(uts_namespace, name); - VMCOREINFO_SYMBOL(node_online_map); -#ifdef CONFIG_MMU - VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); -#endif - VMCOREINFO_SYMBOL(_stext); - vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", (unsigned long) VMALLOC_START); - -#ifndef CONFIG_NUMA - VMCOREINFO_SYMBOL(mem_map); - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL_ARRAY(mem_section); - VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); - VMCOREINFO_STRUCT_SIZE(mem_section); - VMCOREINFO_OFFSET(mem_section, section_mem_map); - VMCOREINFO_NUMBER(SECTION_SIZE_BITS); - VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); -#endif - VMCOREINFO_STRUCT_SIZE(page); - VMCOREINFO_STRUCT_SIZE(pglist_data); - VMCOREINFO_STRUCT_SIZE(zone); - VMCOREINFO_STRUCT_SIZE(free_area); - VMCOREINFO_STRUCT_SIZE(list_head); - VMCOREINFO_SIZE(nodemask_t); - VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _refcount); - VMCOREINFO_OFFSET(page, mapping); - VMCOREINFO_OFFSET(page, lru); - VMCOREINFO_OFFSET(page, _mapcount); - VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(page, compound_head); - VMCOREINFO_OFFSET(pglist_data, node_zones); - VMCOREINFO_OFFSET(pglist_data, nr_zones); -#ifdef CONFIG_FLATMEM - VMCOREINFO_OFFSET(pglist_data, node_mem_map); -#endif - VMCOREINFO_OFFSET(pglist_data, node_start_pfn); - VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); - VMCOREINFO_OFFSET(pglist_data, node_id); - VMCOREINFO_OFFSET(zone, free_area); - VMCOREINFO_OFFSET(zone, vm_stat); - VMCOREINFO_OFFSET(zone, spanned_pages); - VMCOREINFO_OFFSET(free_area, free_list); - VMCOREINFO_OFFSET(list_head, next); - VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); - log_buf_vmcoreinfo_setup(); - VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); - VMCOREINFO_NUMBER(NR_FREE_PAGES); - VMCOREINFO_NUMBER(PG_lru); - VMCOREINFO_NUMBER(PG_private); - VMCOREINFO_NUMBER(PG_swapcache); - VMCOREINFO_NUMBER(PG_swapbacked); - VMCOREINFO_NUMBER(PG_slab); -#ifdef CONFIG_MEMORY_FAILURE - VMCOREINFO_NUMBER(PG_hwpoison); -#endif - VMCOREINFO_NUMBER(PG_head_mask); -#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) - VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_HUGETLB_PAGE - VMCOREINFO_NUMBER(PG_hugetlb); -#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) - VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); -#endif - -#ifdef CONFIG_KALLSYMS - VMCOREINFO_SYMBOL(kallsyms_names); - VMCOREINFO_SYMBOL(kallsyms_num_syms); - VMCOREINFO_SYMBOL(kallsyms_token_table); - VMCOREINFO_SYMBOL(kallsyms_token_index); -#ifdef CONFIG_KALLSYMS_BASE_RELATIVE - VMCOREINFO_SYMBOL(kallsyms_offsets); - VMCOREINFO_SYMBOL(kallsyms_relative_base); -#else - VMCOREINFO_SYMBOL(kallsyms_addresses); -#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */ -#endif /* CONFIG_KALLSYMS */ - - arch_crash_save_vmcoreinfo(); - update_vmcoreinfo_note(); - - return 0; -} - -subsys_initcall(crash_save_vmcoreinfo_init); - static int __init crash_notes_memory_init(void) { /* Allocate memory for saving cpu registers. */ diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 1d4bc493b2f4..11526fc42bc2 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -154,7 +154,7 @@ KERNEL_ATTR_RW(kexec_crash_size); #endif /* CONFIG_KEXEC_CORE */ -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -177,7 +177,7 @@ KERNEL_ATTR_RO(crash_elfcorehdr_size); #endif -#endif /* CONFIG_CRASH_CORE */ +#endif /* CONFIG_VMCORE_INFO */ /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, @@ -265,7 +265,7 @@ static struct attribute * kernel_attrs[] = { &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, #endif -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO &vmcoreinfo_attr.attr, #ifdef CONFIG_CRASH_HOTPLUG &crash_elfcorehdr_size_attr.attr, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f2444b581e16..7d74b000b43a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include @@ -951,7 +951,7 @@ const struct file_operations kmsg_fops = { .release = devkmsg_release, }; -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_VMCORE_INFO /* * This appends the listed symbols to /proc/vmcore * diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c new file mode 100644 index 000000000000..8f77e238a54f --- /dev/null +++ b/kernel/vmcore_info.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * crash.c - kernel crash support code. + * Copyright (C) 2002-2004 Eric Biederman + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "kallsyms_internal.h" +#include "kexec_internal.h" + +/* vmcoreinfo stuff */ +unsigned char *vmcoreinfo_data; +size_t vmcoreinfo_size; +u32 *vmcoreinfo_note; + +/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ +static unsigned char *vmcoreinfo_data_safecopy; + +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len) +{ + struct elf_note *note = (struct elf_note *)buf; + + note->n_namesz = strlen(name) + 1; + note->n_descsz = data_len; + note->n_type = type; + buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); + memcpy(buf, name, note->n_namesz); + buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); + memcpy(buf, data, data_len); + buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); + + return buf; +} + +void final_note(Elf_Word *buf) +{ + memset(buf, 0, sizeof(struct elf_note)); +} + +static void update_vmcoreinfo_note(void) +{ + u32 *buf = vmcoreinfo_note; + + if (!vmcoreinfo_size) + return; + buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, + vmcoreinfo_size); + final_note(buf); +} + +void crash_update_vmcoreinfo_safecopy(void *ptr) +{ + if (ptr) + memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size); + + vmcoreinfo_data_safecopy = ptr; +} + +void crash_save_vmcoreinfo(void) +{ + if (!vmcoreinfo_note) + return; + + /* Use the safe copy to generate vmcoreinfo note if have */ + if (vmcoreinfo_data_safecopy) + vmcoreinfo_data = vmcoreinfo_data_safecopy; + + vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds()); + update_vmcoreinfo_note(); +} + +void vmcoreinfo_append_str(const char *fmt, ...) +{ + va_list args; + char buf[0x50]; + size_t r; + + va_start(args, fmt); + r = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size); + + memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); + + vmcoreinfo_size += r; + + WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES, + "vmcoreinfo data exceeds allocated size, truncating"); +} + +/* + * provide an empty default implementation here -- architecture + * code may override this + */ +void __weak arch_crash_save_vmcoreinfo(void) +{} + +phys_addr_t __weak paddr_vmcoreinfo_note(void) +{ + return __pa(vmcoreinfo_note); +} +EXPORT_SYMBOL(paddr_vmcoreinfo_note); + +static int __init crash_save_vmcoreinfo_init(void) +{ + vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); + if (!vmcoreinfo_data) { + pr_warn("Memory allocation for vmcoreinfo_data failed\n"); + return -ENOMEM; + } + + vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, + GFP_KERNEL | __GFP_ZERO); + if (!vmcoreinfo_note) { + free_page((unsigned long)vmcoreinfo_data); + vmcoreinfo_data = NULL; + pr_warn("Memory allocation for vmcoreinfo_note failed\n"); + return -ENOMEM; + } + + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + VMCOREINFO_BUILD_ID(); + VMCOREINFO_PAGESIZE(PAGE_SIZE); + + VMCOREINFO_SYMBOL(init_uts_ns); + VMCOREINFO_OFFSET(uts_namespace, name); + VMCOREINFO_SYMBOL(node_online_map); +#ifdef CONFIG_MMU + VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); +#endif + VMCOREINFO_SYMBOL(_stext); + vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", (unsigned long) VMALLOC_START); + +#ifndef CONFIG_NUMA + VMCOREINFO_SYMBOL(mem_map); + VMCOREINFO_SYMBOL(contig_page_data); +#endif +#ifdef CONFIG_SPARSEMEM + VMCOREINFO_SYMBOL_ARRAY(mem_section); + VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); + VMCOREINFO_STRUCT_SIZE(mem_section); + VMCOREINFO_OFFSET(mem_section, section_mem_map); + VMCOREINFO_NUMBER(SECTION_SIZE_BITS); + VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); +#endif + VMCOREINFO_STRUCT_SIZE(page); + VMCOREINFO_STRUCT_SIZE(pglist_data); + VMCOREINFO_STRUCT_SIZE(zone); + VMCOREINFO_STRUCT_SIZE(free_area); + VMCOREINFO_STRUCT_SIZE(list_head); + VMCOREINFO_SIZE(nodemask_t); + VMCOREINFO_OFFSET(page, flags); + VMCOREINFO_OFFSET(page, _refcount); + VMCOREINFO_OFFSET(page, mapping); + VMCOREINFO_OFFSET(page, lru); + VMCOREINFO_OFFSET(page, _mapcount); + VMCOREINFO_OFFSET(page, private); + VMCOREINFO_OFFSET(page, compound_head); + VMCOREINFO_OFFSET(pglist_data, node_zones); + VMCOREINFO_OFFSET(pglist_data, nr_zones); +#ifdef CONFIG_FLATMEM + VMCOREINFO_OFFSET(pglist_data, node_mem_map); +#endif + VMCOREINFO_OFFSET(pglist_data, node_start_pfn); + VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); + VMCOREINFO_OFFSET(pglist_data, node_id); + VMCOREINFO_OFFSET(zone, free_area); + VMCOREINFO_OFFSET(zone, vm_stat); + VMCOREINFO_OFFSET(zone, spanned_pages); + VMCOREINFO_OFFSET(free_area, free_list); + VMCOREINFO_OFFSET(list_head, next); + VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); + log_buf_vmcoreinfo_setup(); + VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); + VMCOREINFO_NUMBER(NR_FREE_PAGES); + VMCOREINFO_NUMBER(PG_lru); + VMCOREINFO_NUMBER(PG_private); + VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_swapbacked); + VMCOREINFO_NUMBER(PG_slab); +#ifdef CONFIG_MEMORY_FAILURE + VMCOREINFO_NUMBER(PG_hwpoison); +#endif + VMCOREINFO_NUMBER(PG_head_mask); +#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) + VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); +#ifdef CONFIG_HUGETLB_PAGE + VMCOREINFO_NUMBER(PG_hugetlb); +#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) + VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); +#endif + +#ifdef CONFIG_KALLSYMS + VMCOREINFO_SYMBOL(kallsyms_names); + VMCOREINFO_SYMBOL(kallsyms_num_syms); + VMCOREINFO_SYMBOL(kallsyms_token_table); + VMCOREINFO_SYMBOL(kallsyms_token_index); +#ifdef CONFIG_KALLSYMS_BASE_RELATIVE + VMCOREINFO_SYMBOL(kallsyms_offsets); + VMCOREINFO_SYMBOL(kallsyms_relative_base); +#else + VMCOREINFO_SYMBOL(kallsyms_addresses); +#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */ +#endif /* CONFIG_KALLSYMS */ + + arch_crash_save_vmcoreinfo(); + update_vmcoreinfo_note(); + + return 0; +} + +subsys_initcall(crash_save_vmcoreinfo_init); diff --git a/lib/buildid.c b/lib/buildid.c index e3a7acdeef0e..3e6868c86b45 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -174,7 +174,7 @@ int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size) return parse_build_id_buf(build_id, NULL, buf, buf_size); } -#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_CRASH_CORE) +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO) unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX] __ro_after_init; /** From 2c44b67e2ef345c44095d241530c10cfdd610960 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:43 +0800 Subject: [PATCH 256/435] crash: remove dependency of FA_DUMP on CRASH_DUMP In kdump kernel, /proc/vmcore is an elf file mapping the crashed kernel's old memory content. Its elf header is constructed in 1st kernel and passed to kdump kernel via elfcorehdr_addr. Config CRASH_DUMP enables the code of 1st kernel's old memory accessing in different architectures. Currently, config FA_DUMP has dependency on CRASH_DUMP because fadump needs access global variable 'elfcorehdr_addr' to judge if it's in kdump kernel within function is_kdump_kernel(). In the current kernel/crash_dump.c, variable 'elfcorehdr_addr' is defined, and function setup_elfcorehdr() used to parse kernel parameter to fetch the passed value of elfcorehdr_addr. Only for accessing elfcorehdr_addr, FA_DUMP really doesn't have to depends on CRASH_DUMP. To remove the dependency of FA_DUMP on CRASH_DUMP to avoid confusion, rename kernel/crash_dump.c to kernel/elfcorehdr.c, and build it when CONFIG_VMCORE_INFO is ebabled. With this, FA_DUMP doesn't need to depend on CRASH_DUMP. [bhe@redhat.com: power/fadump: make FA_DUMP select CRASH_DUMP] Link: https://lkml.kernel.org/r/Zb8D1ASrgX0qVm9z@MiWiFi-R3L-srv Link: https://lkml.kernel.org/r/20240124051254.67105-4-bhe@redhat.com Signed-off-by: Baoquan He Acked-by: Hari Bathini Cc: Al Viro Cc: Eric W. Biederman Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 2 -- kernel/Makefile | 3 +-- kernel/{crash_dump.c => elfcorehdr.c} | 0 kernel/kexec_internal.h | 2 ++ 4 files changed, 3 insertions(+), 4 deletions(-) rename kernel/{crash_dump.c => elfcorehdr.c} (100%) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 495d197c9b27..a9efaf87966d 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -690,8 +690,6 @@ config ARCH_SELECTS_CRASH_DUMP config FA_DUMP bool "Firmware-assisted dump" depends on PPC64 && (PPC_RTAS || PPC_POWERNV) - select VMCORE_INFO - select CRASH_RESERVE select CRASH_DUMP help A robust mechanism to get reliable kernel crash dump with diff --git a/kernel/Makefile b/kernel/Makefile index 649272a1d6b9..35abc65e1f1a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -68,7 +68,7 @@ obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o -obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o crash_core.o obj-$(CONFIG_KEXEC) += kexec.o @@ -121,7 +121,6 @@ obj-$(CONFIG_PERF_EVENTS) += events/ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o -obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o diff --git a/kernel/crash_dump.c b/kernel/elfcorehdr.c similarity index 100% rename from kernel/crash_dump.c rename to kernel/elfcorehdr.c diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 74da1409cd14..2595defe8c0d 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -4,6 +4,8 @@ #include +struct kexec_segment; + struct kimage *do_kimage_alloc_init(void); int sanity_check_segment_list(struct kimage *image); void kimage_free_page_list(struct list_head *list); From 02aff8480533817a29e820729360866441d7403d Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:44 +0800 Subject: [PATCH 257/435] crash: split crash dumping code out from kexec_core.c Currently, KEXEC_CORE select CRASH_CORE automatically because crash codes need be built in to avoid compiling error when building kexec code even though the crash dumping functionality is not enabled. E.g -------------------- CONFIG_CRASH_CORE=y CONFIG_KEXEC_CORE=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y --------------------- After splitting out crashkernel reservation code and vmcoreinfo exporting code, there's only crash related code left in kernel/crash_core.c. Now move crash related codes from kexec_core.c to crash_core.c and only build it in when CONFIG_CRASH_DUMP=y. And also wrap up crash codes inside CONFIG_CRASH_DUMP ifdeffery scope, or replace inappropriate CONFIG_KEXEC_CORE ifdef with CONFIG_CRASH_DUMP ifdef in generic kernel files. With these changes, crash_core codes are abstracted from kexec codes and can be disabled at all if only kexec reboot feature is wanted. Link: https://lkml.kernel.org/r/20240124051254.67105-5-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- drivers/base/cpu.c | 6 +- include/linux/crash_core.h | 61 +++++++++ include/linux/kexec.h | 45 +------ init/initramfs.c | 2 +- kernel/Makefile | 3 +- kernel/crash_core.c | 256 +++++++++++++++++++++++++++++++++++++ kernel/kexec.c | 11 +- kernel/kexec_core.c | 250 ++---------------------------------- kernel/kexec_file.c | 13 +- kernel/ksysfs.c | 4 + 10 files changed, 359 insertions(+), 292 deletions(-) diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 47de0f140ba6..b621a0fc75e1 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -144,7 +144,7 @@ static DEVICE_ATTR(release, S_IWUSR, NULL, cpu_release_store); #endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ #endif /* CONFIG_HOTPLUG_CPU */ -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP #include static ssize_t crash_notes_show(struct device *dev, @@ -189,14 +189,14 @@ static const struct attribute_group crash_note_cpu_attr_group = { #endif static const struct attribute_group *common_cpu_attr_groups[] = { -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP &crash_note_cpu_attr_group, #endif NULL }; static const struct attribute_group *hotplugable_cpu_attr_groups[] = { -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP &crash_note_cpu_attr_group, #endif NULL diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 7f19f62018ef..23270b16e1db 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -6,6 +6,48 @@ #include #include +struct kimage; + +#ifdef CONFIG_CRASH_DUMP + +int crash_shrink_memory(unsigned long new_size); +ssize_t crash_get_memory_size(void); + +#ifndef arch_kexec_protect_crashkres +/* + * Protection mechanism for crashkernel reserved memory after + * the kdump kernel is loaded. + * + * Provide an empty default implementation here -- architecture + * code may override this + */ +static inline void arch_kexec_protect_crashkres(void) { } +#endif + +#ifndef arch_kexec_unprotect_crashkres +static inline void arch_kexec_unprotect_crashkres(void) { } +#endif + + + +#ifndef arch_crash_handle_hotplug_event +static inline void arch_crash_handle_hotplug_event(struct kimage *image) { } +#endif + +int crash_check_update_elfcorehdr(void); + +#ifndef crash_hotplug_cpu_support +static inline int crash_hotplug_cpu_support(void) { return 0; } +#endif + +#ifndef crash_hotplug_memory_support +static inline int crash_hotplug_memory_support(void) { return 0; } +#endif + +#ifndef crash_get_elfcorehdr_size +static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; } +#endif + /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 @@ -31,4 +73,23 @@ struct kexec_segment; #define KEXEC_CRASH_HP_REMOVE_MEMORY 4 #define KEXEC_CRASH_HP_INVALID_CPU -1U +extern void __crash_kexec(struct pt_regs *regs); +extern void crash_kexec(struct pt_regs *regs); +int kexec_should_crash(struct task_struct *p); +int kexec_crash_loaded(void); +void crash_save_cpu(struct pt_regs *regs, int cpu); +extern int kimage_crash_copy_vmcoreinfo(struct kimage *image); + +#else /* !CONFIG_CRASH_DUMP*/ +struct pt_regs; +struct task_struct; +struct kimage; +static inline void __crash_kexec(struct pt_regs *regs) { } +static inline void crash_kexec(struct pt_regs *regs) { } +static inline int kexec_should_crash(struct task_struct *p) { return 0; } +static inline int kexec_crash_loaded(void) { return 0; } +static inline void crash_save_cpu(struct pt_regs *regs, int cpu) {}; +static inline int kimage_crash_copy_vmcoreinfo(struct kimage *image) { return 0; }; +#endif /* CONFIG_CRASH_DUMP*/ + #endif /* LINUX_CRASH_CORE_H */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 9c7bb8b56ed6..060835bb82d5 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -15,7 +15,6 @@ #if !defined(__ASSEMBLY__) -#include #include #include #include @@ -33,6 +32,7 @@ extern note_buf_t __percpu *crash_notes; #include #include #include +#include /* Verify architecture specific macros are defined */ @@ -380,13 +380,6 @@ extern struct page *kimage_alloc_control_pages(struct kimage *image, static inline int machine_kexec_post_load(struct kimage *image) { return 0; } #endif -extern void __crash_kexec(struct pt_regs *); -extern void crash_kexec(struct pt_regs *); -int kexec_should_crash(struct task_struct *); -int kexec_crash_loaded(void); -void crash_save_cpu(struct pt_regs *regs, int cpu); -extern int kimage_crash_copy_vmcoreinfo(struct kimage *image); - extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; @@ -410,24 +403,6 @@ bool kexec_load_permitted(int kexec_image_type); /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; -int crash_shrink_memory(unsigned long new_size); -ssize_t crash_get_memory_size(void); - -#ifndef arch_kexec_protect_crashkres -/* - * Protection mechanism for crashkernel reserved memory after - * the kdump kernel is loaded. - * - * Provide an empty default implementation here -- architecture - * code may override this - */ -static inline void arch_kexec_protect_crashkres(void) { } -#endif - -#ifndef arch_kexec_unprotect_crashkres -static inline void arch_kexec_unprotect_crashkres(void) { } -#endif - #ifndef page_to_boot_pfn static inline unsigned long page_to_boot_pfn(struct page *page) { @@ -484,24 +459,6 @@ static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, g static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { } #endif -#ifndef arch_crash_handle_hotplug_event -static inline void arch_crash_handle_hotplug_event(struct kimage *image) { } -#endif - -int crash_check_update_elfcorehdr(void); - -#ifndef crash_hotplug_cpu_support -static inline int crash_hotplug_cpu_support(void) { return 0; } -#endif - -#ifndef crash_hotplug_memory_support -static inline int crash_hotplug_memory_support(void) { return 0; } -#endif - -#ifndef crash_get_elfcorehdr_size -static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; } -#endif - extern bool kexec_file_dbg_print; #define kexec_dprintk(fmt, ...) \ diff --git a/init/initramfs.c b/init/initramfs.c index 76deb48c38cb..6f095f54eec9 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -642,7 +642,7 @@ void __weak __init free_initrd_mem(unsigned long start, unsigned long end) "initrd"); } -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_RESERVE static bool __init kexec_free_initrd(void) { unsigned long crashk_start = (unsigned long)__va(crashk_res.start); diff --git a/kernel/Makefile b/kernel/Makefile index 35abc65e1f1a..3c13240dfc9f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -70,7 +70,8 @@ obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o -obj-$(CONFIG_KEXEC_CORE) += kexec_core.o crash_core.o +obj-$(CONFIG_KEXEC_CORE) += kexec_core.o +obj-$(CONFIG_CRASH_DUMP) += crash_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 2f4df1fe6f7a..78b5dc7cee3a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -11,9 +11,14 @@ #include #include #include +#include #include #include #include +#include +#include +#include +#include #include #include @@ -26,6 +31,131 @@ /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; +#ifdef CONFIG_CRASH_DUMP + +int kimage_crash_copy_vmcoreinfo(struct kimage *image) +{ + struct page *vmcoreinfo_page; + void *safecopy; + + if (!IS_ENABLED(CONFIG_CRASH_DUMP)) + return 0; + if (image->type != KEXEC_TYPE_CRASH) + return 0; + + /* + * For kdump, allocate one vmcoreinfo safe copy from the + * crash memory. as we have arch_kexec_protect_crashkres() + * after kexec syscall, we naturally protect it from write + * (even read) access under kernel direct mapping. But on + * the other hand, we still need to operate it when crash + * happens to generate vmcoreinfo note, hereby we rely on + * vmap for this purpose. + */ + vmcoreinfo_page = kimage_alloc_control_pages(image, 0); + if (!vmcoreinfo_page) { + pr_warn("Could not allocate vmcoreinfo buffer\n"); + return -ENOMEM; + } + safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); + if (!safecopy) { + pr_warn("Could not vmap vmcoreinfo buffer\n"); + return -ENOMEM; + } + + image->vmcoreinfo_data_copy = safecopy; + crash_update_vmcoreinfo_safecopy(safecopy); + + return 0; +} + + + +int kexec_should_crash(struct task_struct *p) +{ + /* + * If crash_kexec_post_notifiers is enabled, don't run + * crash_kexec() here yet, which must be run after panic + * notifiers in panic(). + */ + if (crash_kexec_post_notifiers) + return 0; + /* + * There are 4 panic() calls in make_task_dead() path, each of which + * corresponds to each of these 4 conditions. + */ + if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) + return 1; + return 0; +} + +int kexec_crash_loaded(void) +{ + return !!kexec_crash_image; +} +EXPORT_SYMBOL_GPL(kexec_crash_loaded); + +/* + * No panic_cpu check version of crash_kexec(). This function is called + * only when panic_cpu holds the current CPU number; this is the only CPU + * which processes crash_kexec routines. + */ +void __noclone __crash_kexec(struct pt_regs *regs) +{ + /* Take the kexec_lock here to prevent sys_kexec_load + * running on one cpu from replacing the crash kernel + * we are using after a panic on a different cpu. + * + * If the crash kernel was not located in a fixed area + * of memory the xchg(&kexec_crash_image) would be + * sufficient. But since I reuse the memory... + */ + if (kexec_trylock()) { + if (kexec_crash_image) { + struct pt_regs fixed_regs; + + crash_setup_regs(&fixed_regs, regs); + crash_save_vmcoreinfo(); + machine_crash_shutdown(&fixed_regs); + machine_kexec(kexec_crash_image); + } + kexec_unlock(); + } +} +STACK_FRAME_NON_STANDARD(__crash_kexec); + +__bpf_kfunc void crash_kexec(struct pt_regs *regs) +{ + int old_cpu, this_cpu; + + /* + * Only one CPU is allowed to execute the crash_kexec() code as with + * panic(). Otherwise parallel calls of panic() and crash_kexec() + * may stop each other. To exclude them, we use panic_cpu here too. + */ + old_cpu = PANIC_CPU_INVALID; + this_cpu = raw_smp_processor_id(); + + if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + /* This is the 1st CPU which comes here, so go ahead. */ + __crash_kexec(regs); + + /* + * Reset panic_cpu to allow another panic()/crash_kexec() + * call. + */ + atomic_set(&panic_cpu, PANIC_CPU_INVALID); + } +} + +static inline resource_size_t crash_resource_size(const struct resource *res) +{ + return !res->end ? 0 : resource_size(res); +} + + + + int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz) { @@ -187,6 +317,130 @@ int crash_exclude_mem_range(struct crash_mem *mem, return 0; } +ssize_t crash_get_memory_size(void) +{ + ssize_t size = 0; + + if (!kexec_trylock()) + return -EBUSY; + + size += crash_resource_size(&crashk_res); + size += crash_resource_size(&crashk_low_res); + + kexec_unlock(); + return size; +} + +static int __crash_shrink_memory(struct resource *old_res, + unsigned long new_size) +{ + struct resource *ram_res; + + ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); + if (!ram_res) + return -ENOMEM; + + ram_res->start = old_res->start + new_size; + ram_res->end = old_res->end; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; + ram_res->name = "System RAM"; + + if (!new_size) { + release_resource(old_res); + old_res->start = 0; + old_res->end = 0; + } else { + crashk_res.end = ram_res->start - 1; + } + + crash_free_reserved_phys_range(ram_res->start, ram_res->end); + insert_resource(&iomem_resource, ram_res); + + return 0; +} + +int crash_shrink_memory(unsigned long new_size) +{ + int ret = 0; + unsigned long old_size, low_size; + + if (!kexec_trylock()) + return -EBUSY; + + if (kexec_crash_image) { + ret = -ENOENT; + goto unlock; + } + + low_size = crash_resource_size(&crashk_low_res); + old_size = crash_resource_size(&crashk_res) + low_size; + new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN); + if (new_size >= old_size) { + ret = (new_size == old_size) ? 0 : -EINVAL; + goto unlock; + } + + /* + * (low_size > new_size) implies that low_size is greater than zero. + * This also means that if low_size is zero, the else branch is taken. + * + * If low_size is greater than 0, (low_size > new_size) indicates that + * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res + * needs to be shrunken. + */ + if (low_size > new_size) { + ret = __crash_shrink_memory(&crashk_res, 0); + if (ret) + goto unlock; + + ret = __crash_shrink_memory(&crashk_low_res, new_size); + } else { + ret = __crash_shrink_memory(&crashk_res, new_size - low_size); + } + + /* Swap crashk_res and crashk_low_res if needed */ + if (!crashk_res.end && crashk_low_res.end) { + crashk_res.start = crashk_low_res.start; + crashk_res.end = crashk_low_res.end; + release_resource(&crashk_low_res); + crashk_low_res.start = 0; + crashk_low_res.end = 0; + insert_resource(&iomem_resource, &crashk_res); + } + +unlock: + kexec_unlock(); + return ret; +} + +void crash_save_cpu(struct pt_regs *regs, int cpu) +{ + struct elf_prstatus prstatus; + u32 *buf; + + if ((cpu < 0) || (cpu >= nr_cpu_ids)) + return; + + /* Using ELF notes here is opportunistic. + * I need a well defined structure format + * for the data I pass, and I need tags + * on the data to indicate what information I have + * squirrelled away. ELF notes happen to provide + * all of that, so there is no need to invent something new. + */ + buf = (u32 *)per_cpu_ptr(crash_notes, cpu); + if (!buf) + return; + memset(&prstatus, 0, sizeof(prstatus)); + prstatus.common.pr_pid = current->pid; + elf_core_copy_regs(&prstatus.pr_reg, regs); + buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + final_note(buf); +} + + + static int __init crash_notes_memory_init(void) { /* Allocate memory for saving cpu registers. */ @@ -220,6 +474,8 @@ static int __init crash_notes_memory_init(void) } subsys_initcall(crash_notes_memory_init); +#endif /*CONFIG_CRASH_DUMP*/ + #ifdef CONFIG_CRASH_HOTPLUG #undef pr_fmt #define pr_fmt(fmt) "crash hp: " fmt diff --git a/kernel/kexec.c b/kernel/kexec.c index 8f35a5a42af8..bab542fc1463 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -28,12 +28,14 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, struct kimage *image; bool kexec_on_panic = flags & KEXEC_ON_CRASH; +#ifdef CONFIG_CRASH_DUMP if (kexec_on_panic) { /* Verify we have a valid entry point */ if ((entry < phys_to_boot_phys(crashk_res.start)) || (entry > phys_to_boot_phys(crashk_res.end))) return -EADDRNOTAVAIL; } +#endif /* Allocate and initialize a controlling structure */ image = do_kimage_alloc_init(); @@ -44,11 +46,13 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, image->nr_segments = nr_segments; memcpy(image->segment, segments, nr_segments * sizeof(*segments)); +#ifdef CONFIG_CRASH_DUMP if (kexec_on_panic) { /* Enable special crash kernel control page alloc policy. */ image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; } +#endif ret = sanity_check_segment_list(image); if (ret) @@ -99,13 +103,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (!kexec_trylock()) return -EBUSY; +#ifdef CONFIG_CRASH_DUMP if (flags & KEXEC_ON_CRASH) { dest_image = &kexec_crash_image; if (kexec_crash_image) arch_kexec_unprotect_crashkres(); - } else { + } else +#endif dest_image = &kexec_image; - } if (nr_segments == 0) { /* Uninstall image */ @@ -162,8 +167,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, image = xchg(dest_image, image); out: +#ifdef CONFIG_CRASH_DUMP if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); +#endif kimage_free(image); out_unlock: diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d08fc7b5db97..ce3429e7972c 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -54,30 +54,6 @@ bool kexec_in_progress = false; bool kexec_file_dbg_print; -int kexec_should_crash(struct task_struct *p) -{ - /* - * If crash_kexec_post_notifiers is enabled, don't run - * crash_kexec() here yet, which must be run after panic - * notifiers in panic(). - */ - if (crash_kexec_post_notifiers) - return 0; - /* - * There are 4 panic() calls in make_task_dead() path, each of which - * corresponds to each of these 4 conditions. - */ - if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) - return 1; - return 0; -} - -int kexec_crash_loaded(void) -{ - return !!kexec_crash_image; -} -EXPORT_SYMBOL_GPL(kexec_crash_loaded); - /* * When kexec transitions to the new kernel there is a one-to-one * mapping between physical and virtual addresses. On processors @@ -209,6 +185,7 @@ int sanity_check_segment_list(struct kimage *image) if (total_pages > nr_pages / 2) return -EINVAL; +#ifdef CONFIG_CRASH_DUMP /* * Verify we have good destination addresses. Normally * the caller is responsible for making certain we don't @@ -231,6 +208,7 @@ int sanity_check_segment_list(struct kimage *image) return -EADDRNOTAVAIL; } } +#endif return 0; } @@ -403,6 +381,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, return pages; } +#ifdef CONFIG_CRASH_DUMP static struct page *kimage_alloc_crash_control_pages(struct kimage *image, unsigned int order) { @@ -468,6 +447,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, return pages; } +#endif struct page *kimage_alloc_control_pages(struct kimage *image, @@ -479,48 +459,16 @@ struct page *kimage_alloc_control_pages(struct kimage *image, case KEXEC_TYPE_DEFAULT: pages = kimage_alloc_normal_control_pages(image, order); break; +#ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: pages = kimage_alloc_crash_control_pages(image, order); break; +#endif } return pages; } -int kimage_crash_copy_vmcoreinfo(struct kimage *image) -{ - struct page *vmcoreinfo_page; - void *safecopy; - - if (image->type != KEXEC_TYPE_CRASH) - return 0; - - /* - * For kdump, allocate one vmcoreinfo safe copy from the - * crash memory. as we have arch_kexec_protect_crashkres() - * after kexec syscall, we naturally protect it from write - * (even read) access under kernel direct mapping. But on - * the other hand, we still need to operate it when crash - * happens to generate vmcoreinfo note, hereby we rely on - * vmap for this purpose. - */ - vmcoreinfo_page = kimage_alloc_control_pages(image, 0); - if (!vmcoreinfo_page) { - pr_warn("Could not allocate vmcoreinfo buffer\n"); - return -ENOMEM; - } - safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); - if (!safecopy) { - pr_warn("Could not vmap vmcoreinfo buffer\n"); - return -ENOMEM; - } - - image->vmcoreinfo_data_copy = safecopy; - crash_update_vmcoreinfo_safecopy(safecopy); - - return 0; -} - static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { if (*image->entry != 0) @@ -603,10 +551,12 @@ void kimage_free(struct kimage *image) if (!image) return; +#ifdef CONFIG_CRASH_DUMP if (image->vmcoreinfo_data_copy) { crash_update_vmcoreinfo_safecopy(NULL); vunmap(image->vmcoreinfo_data_copy); } +#endif kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { @@ -824,6 +774,7 @@ out: return result; } +#ifdef CONFIG_CRASH_DUMP static int kimage_load_crash_segment(struct kimage *image, struct kexec_segment *segment) { @@ -891,6 +842,7 @@ static int kimage_load_crash_segment(struct kimage *image, out: return result; } +#endif int kimage_load_segment(struct kimage *image, struct kexec_segment *segment) @@ -901,9 +853,11 @@ int kimage_load_segment(struct kimage *image, case KEXEC_TYPE_DEFAULT: result = kimage_load_normal_segment(image, segment); break; +#ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: result = kimage_load_crash_segment(image, segment); break; +#endif } return result; @@ -1027,186 +981,6 @@ bool kexec_load_permitted(int kexec_image_type) return true; } -/* - * No panic_cpu check version of crash_kexec(). This function is called - * only when panic_cpu holds the current CPU number; this is the only CPU - * which processes crash_kexec routines. - */ -void __noclone __crash_kexec(struct pt_regs *regs) -{ - /* Take the kexec_lock here to prevent sys_kexec_load - * running on one cpu from replacing the crash kernel - * we are using after a panic on a different cpu. - * - * If the crash kernel was not located in a fixed area - * of memory the xchg(&kexec_crash_image) would be - * sufficient. But since I reuse the memory... - */ - if (kexec_trylock()) { - if (kexec_crash_image) { - struct pt_regs fixed_regs; - - crash_setup_regs(&fixed_regs, regs); - crash_save_vmcoreinfo(); - machine_crash_shutdown(&fixed_regs); - machine_kexec(kexec_crash_image); - } - kexec_unlock(); - } -} -STACK_FRAME_NON_STANDARD(__crash_kexec); - -__bpf_kfunc void crash_kexec(struct pt_regs *regs) -{ - int old_cpu, this_cpu; - - /* - * Only one CPU is allowed to execute the crash_kexec() code as with - * panic(). Otherwise parallel calls of panic() and crash_kexec() - * may stop each other. To exclude them, we use panic_cpu here too. - */ - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { - /* This is the 1st CPU which comes here, so go ahead. */ - __crash_kexec(regs); - - /* - * Reset panic_cpu to allow another panic()/crash_kexec() - * call. - */ - atomic_set(&panic_cpu, PANIC_CPU_INVALID); - } -} - -static inline resource_size_t crash_resource_size(const struct resource *res) -{ - return !res->end ? 0 : resource_size(res); -} - -ssize_t crash_get_memory_size(void) -{ - ssize_t size = 0; - - if (!kexec_trylock()) - return -EBUSY; - - size += crash_resource_size(&crashk_res); - size += crash_resource_size(&crashk_low_res); - - kexec_unlock(); - return size; -} - -static int __crash_shrink_memory(struct resource *old_res, - unsigned long new_size) -{ - struct resource *ram_res; - - ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); - if (!ram_res) - return -ENOMEM; - - ram_res->start = old_res->start + new_size; - ram_res->end = old_res->end; - ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; - ram_res->name = "System RAM"; - - if (!new_size) { - release_resource(old_res); - old_res->start = 0; - old_res->end = 0; - } else { - crashk_res.end = ram_res->start - 1; - } - - crash_free_reserved_phys_range(ram_res->start, ram_res->end); - insert_resource(&iomem_resource, ram_res); - - return 0; -} - -int crash_shrink_memory(unsigned long new_size) -{ - int ret = 0; - unsigned long old_size, low_size; - - if (!kexec_trylock()) - return -EBUSY; - - if (kexec_crash_image) { - ret = -ENOENT; - goto unlock; - } - - low_size = crash_resource_size(&crashk_low_res); - old_size = crash_resource_size(&crashk_res) + low_size; - new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN); - if (new_size >= old_size) { - ret = (new_size == old_size) ? 0 : -EINVAL; - goto unlock; - } - - /* - * (low_size > new_size) implies that low_size is greater than zero. - * This also means that if low_size is zero, the else branch is taken. - * - * If low_size is greater than 0, (low_size > new_size) indicates that - * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res - * needs to be shrunken. - */ - if (low_size > new_size) { - ret = __crash_shrink_memory(&crashk_res, 0); - if (ret) - goto unlock; - - ret = __crash_shrink_memory(&crashk_low_res, new_size); - } else { - ret = __crash_shrink_memory(&crashk_res, new_size - low_size); - } - - /* Swap crashk_res and crashk_low_res if needed */ - if (!crashk_res.end && crashk_low_res.end) { - crashk_res.start = crashk_low_res.start; - crashk_res.end = crashk_low_res.end; - release_resource(&crashk_low_res); - crashk_low_res.start = 0; - crashk_low_res.end = 0; - insert_resource(&iomem_resource, &crashk_res); - } - -unlock: - kexec_unlock(); - return ret; -} - -void crash_save_cpu(struct pt_regs *regs, int cpu) -{ - struct elf_prstatus prstatus; - u32 *buf; - - if ((cpu < 0) || (cpu >= nr_cpu_ids)) - return; - - /* Using ELF notes here is opportunistic. - * I need a well defined structure format - * for the data I pass, and I need tags - * on the data to indicate what information I have - * squirrelled away. ELF notes happen to provide - * all of that, so there is no need to invent something new. - */ - buf = (u32 *)per_cpu_ptr(crash_notes, cpu); - if (!buf) - return; - memset(&prstatus, 0, sizeof(prstatus)); - prstatus.common.pr_pid = current->pid; - elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - &prstatus, sizeof(prstatus)); - final_note(buf); -} - /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index bef2f6f2571b..ce7ce2ae27cd 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -285,11 +285,13 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG); image->file_mode = 1; +#ifdef CONFIG_CRASH_DUMP if (kexec_on_panic) { /* Enable special crash kernel control page alloc policy. */ image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; } +#endif ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, cmdline_ptr, cmdline_len, flags); @@ -349,13 +351,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (!kexec_trylock()) return -EBUSY; +#ifdef CONFIG_CRASH_DUMP if (image_type == KEXEC_TYPE_CRASH) { dest_image = &kexec_crash_image; if (kexec_crash_image) arch_kexec_unprotect_crashkres(); - } else { + } else +#endif dest_image = &kexec_image; - } if (flags & KEXEC_FILE_UNLOAD) goto exchange; @@ -419,8 +422,10 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, exchange: image = xchg(dest_image, image); out: +#ifdef CONFIG_CRASH_DUMP if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); +#endif kexec_unlock(); kimage_free(image); @@ -595,12 +600,14 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, static int kexec_walk_resources(struct kexec_buf *kbuf, int (*func)(struct resource *, void *)) { +#ifdef CONFIG_CRASH_DUMP if (kbuf->image->type == KEXEC_TYPE_CRASH) return walk_iomem_res_desc(crashk_res.desc, IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, crashk_res.start, crashk_res.end, kbuf, func); - else if (kbuf->top_down) +#endif + if (kbuf->top_down) return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func); else return walk_system_ram_res(0, ULONG_MAX, kbuf, func); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 11526fc42bc2..fe7a517fc4ab 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -120,6 +120,7 @@ static ssize_t kexec_loaded_show(struct kobject *kobj, } KERNEL_ATTR_RO(kexec_loaded); +#ifdef CONFIG_CRASH_DUMP static ssize_t kexec_crash_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -152,6 +153,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, } KERNEL_ATTR_RW(kexec_crash_size); +#endif /* CONFIG_CRASH_DUMP*/ #endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_VMCORE_INFO @@ -262,9 +264,11 @@ static struct attribute * kernel_attrs[] = { #endif #ifdef CONFIG_KEXEC_CORE &kexec_loaded_attr.attr, +#ifdef CONFIG_CRASH_DUMP &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, #endif +#endif #ifdef CONFIG_VMCORE_INFO &vmcoreinfo_attr.attr, #ifdef CONFIG_CRASH_HOTPLUG From 75bc255a7444801d64c7a7bd09e3f452f86b3585 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:45 +0800 Subject: [PATCH 258/435] crash: clean up kdump related config items By splitting CRASH_RESERVE and VMCORE_INFO out from CRASH_CORE, cleaning up the dependency of FA_DMUMP on CRASH_DUMP, and moving crash codes from kexec_core.c to crash_core.c, now we can rearrange CRASH_DUMP to depend on KEXEC_CORE, and make CRASH_DUMP select CRASH_RESERVE and VMCORE_INFO. KEXEC_CORE won't select CRASH_RESERVE and VMCORE_INFO any more because KEXEC_CORE enables codes which allocate control pages, copy kexec/kdump segments, and prepare for switching. These codes are shared by both kexec reboot and crash dumping. Doing this makes codes and the corresponding config items more logical (the right item depends on or is selected by the left item). PROC_KCORE -----------> VMCORE_INFO |----------> VMCORE_INFO FA_DUMP----| |----------> CRASH_RESERVE ---->VMCORE_INFO / |---->CRASH_RESERVE KEXEC --| /| |--> KEXEC_CORE--> CRASH_DUMP-->/-|---->PROC_VMCORE KEXEC_FILE --| \ | \---->CRASH_HOTPLUG KEXEC --| |--> KEXEC_CORE--> kexec reboot KEXEC_FILE --| Link: https://lkml.kernel.org/r/20240124051254.67105-6-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- kernel/Kconfig.kexec | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 8faf27043432..6c34e63c88ff 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -9,8 +9,6 @@ config VMCORE_INFO bool config KEXEC_CORE - select VMCORE_INFO - select CRASH_RESERVE bool config KEXEC_ELF @@ -99,8 +97,11 @@ config KEXEC_JUMP config CRASH_DUMP bool "kernel crash dumps" + default y depends on ARCH_SUPPORTS_CRASH_DUMP - select KEXEC_CORE + depends on KEXEC_CORE + select VMCORE_INFO + select CRASH_RESERVE help Generate crash dump after being started by kexec. This should be normally only set in special crash dump kernels From a4eeb2176d89fdf2785851521577b94b31690a60 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:46 +0800 Subject: [PATCH 259/435] x86, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on x86 with some adjustments. Here, also change some ifdefs or IS_ENABLED() check to more appropriate ones, e,g - #ifdef CONFIG_KEXEC_CORE -> #ifdef CONFIG_CRASH_DUMP - (!IS_ENABLED(CONFIG_KEXEC_CORE)) - > (!IS_ENABLED(CONFIG_CRASH_RESERVE)) [bhe@redhat.com: don't nest CONFIG_CRASH_DUMP ifdef inside CONFIG_KEXEC_CODE ifdef scope] Link: https://lore.kernel.org/all/SN6PR02MB4157931105FA68D72E3D3DB8D47B2@SN6PR02MB4157.namprd02.prod.outlook.com/T/#u Link: https://lkml.kernel.org/r/20240124051254.67105-7-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/x86/kernel/Makefile | 4 ++-- arch/x86/kernel/cpu/mshyperv.c | 10 ++++++++-- arch/x86/kernel/kexec-bzimage64.c | 4 ++++ arch/x86/kernel/kvm.c | 4 ++-- arch/x86/kernel/machine_kexec_64.c | 3 +++ arch/x86/kernel/reboot.c | 4 ++-- arch/x86/kernel/setup.c | 2 +- arch/x86/kernel/smp.c | 2 +- arch/x86/xen/enlighten_hvm.c | 4 ++++ arch/x86/xen/mmu_pv.c | 2 +- 10 files changed, 28 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 913d4022131e..3668b1edef2d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -100,9 +100,9 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_VMCORE_INFO) += vmcore_info_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o -obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o +obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o -obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o +obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o crash.o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_X86_32) += doublefault_32.o diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 01fa06dd06b6..2e8cd5a4ae85 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -209,7 +209,9 @@ static void hv_machine_shutdown(void) if (kexec_in_progress) hyperv_cleanup(); } +#endif /* CONFIG_KEXEC_CORE */ +#ifdef CONFIG_CRASH_DUMP static void hv_machine_crash_shutdown(struct pt_regs *regs) { if (hv_crash_handler) @@ -221,7 +223,7 @@ static void hv_machine_crash_shutdown(struct pt_regs *regs) /* Disable the hypercall page when there is only 1 active CPU. */ hyperv_cleanup(); } -#endif /* CONFIG_KEXEC_CORE */ +#endif /* CONFIG_CRASH_DUMP */ #endif /* CONFIG_HYPERV */ static uint32_t __init ms_hyperv_platform(void) @@ -495,9 +497,13 @@ static void __init ms_hyperv_init_platform(void) no_timer_check = 1; #endif -#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE) +#if IS_ENABLED(CONFIG_HYPERV) +#if defined(CONFIG_KEXEC_CORE) machine_ops.shutdown = hv_machine_shutdown; +#endif +#if defined(CONFIG_CRASH_DUMP) machine_ops.crash_shutdown = hv_machine_crash_shutdown; +#endif #endif if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { /* diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 2a422e00ed4b..b55737b83a84 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -263,11 +263,13 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, memset(¶ms->hd0_info, 0, sizeof(params->hd0_info)); memset(¶ms->hd1_info, 0, sizeof(params->hd1_info)); +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) { ret = crash_setup_memmap_entries(image, params); if (ret) return ret; } else +#endif setup_e820_entries(params); nr_e820_entries = params->e820_entries; @@ -433,12 +435,14 @@ static void *bzImage64_load(struct kimage *image, char *kernel, return ERR_PTR(-EINVAL); } +#ifdef CONFIG_CRASH_DUMP /* Allocate and load backup region */ if (image->type == KEXEC_TYPE_CRASH) { ret = crash_load_segments(image); if (ret) return ERR_PTR(ret); } +#endif /* * Load purgatory. For 64bit entry point, purgatory code can be diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 428ee74002e1..3c9c327d6706 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -770,7 +770,7 @@ static struct notifier_block kvm_pv_reboot_nb = { * won't be valid. In cases like kexec, in which you install a new kernel, this * means a random memory location will be kept being written. */ -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP static void kvm_crash_shutdown(struct pt_regs *regs) { kvm_guest_cpu_offline(true); @@ -853,7 +853,7 @@ static void __init kvm_guest_init(void) kvm_guest_cpu_init(); #endif -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP machine_ops.crash_shutdown = kvm_crash_shutdown; #endif diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index bc0a5348b4a6..b180d8e497c3 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -508,6 +508,8 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) } #endif /* CONFIG_KEXEC_FILE */ +#ifdef CONFIG_CRASH_DUMP + static int kexec_mark_range(unsigned long start, unsigned long end, bool protect) { @@ -552,6 +554,7 @@ void arch_kexec_unprotect_crashkres(void) { kexec_mark_crashkres(false); } +#endif /* * During a traditional boot under SME, SME will encrypt the kernel, diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 830425e6d38e..f3130f762784 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -796,7 +796,7 @@ struct machine_ops machine_ops __ro_after_init = { .emergency_restart = native_machine_emergency_restart, .restart = native_machine_restart, .halt = native_machine_halt, -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP .crash_shutdown = native_machine_crash_shutdown, #endif }; @@ -826,7 +826,7 @@ void machine_halt(void) machine_ops.halt(); } -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP void machine_crash_shutdown(struct pt_regs *regs) { machine_ops.crash_shutdown(regs); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 84201071dfac..899d839a2954 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -471,7 +471,7 @@ static void __init arch_reserve_crashkernel(void) bool high = false; int ret; - if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 96a771f9f930..52c3823b7211 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -282,7 +282,7 @@ struct smp_ops smp_ops = { .smp_cpus_done = native_smp_cpus_done, .stop_other_cpus = native_stop_other_cpus, -#if defined(CONFIG_KEXEC_CORE) +#if defined(CONFIG_CRASH_DUMP) .crash_stop_other_cpus = kdump_nmi_shootdown_cpus, #endif .smp_send_reschedule = native_smp_send_reschedule, diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 3f8c34707c50..0b367c1e086d 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -148,7 +148,9 @@ static void xen_hvm_shutdown(void) if (kexec_in_progress) xen_reboot(SHUTDOWN_soft_reset); } +#endif +#ifdef CONFIG_CRASH_DUMP static void xen_hvm_crash_shutdown(struct pt_regs *regs) { native_machine_crash_shutdown(regs); @@ -236,6 +238,8 @@ static void __init xen_hvm_guest_init(void) #ifdef CONFIG_KEXEC_CORE machine_ops.shutdown = xen_hvm_shutdown; +#endif +#ifdef CONFIG_CRASH_DUMP machine_ops.crash_shutdown = xen_hvm_crash_shutdown; #endif } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 218773cfb009..e21974f2cf2d 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2520,7 +2520,7 @@ out: } EXPORT_SYMBOL_GPL(xen_remap_pfn); -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_VMCORE_INFO phys_addr_t paddr_vmcoreinfo_note(void) { if (xen_pv_domain()) From 40254101d87870b2e5ac3ddc28af40aa04c48486 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:47 +0800 Subject: [PATCH 260/435] arm64, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on arm64 with some adjustments. Here wrap up crash dumping codes with CONFIG_CRASH_DUMP ifdeffery. [bhe@redhat.com: fix building error in generic codes] Link: https://lkml.kernel.org/r/20240129135033.157195-2-bhe@redhat.com Link: https://lkml.kernel.org/r/20240124051254.67105-8-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/arm64/include/asm/kexec.h | 2 +- arch/arm64/kernel/machine_kexec.c | 2 +- arch/arm64/kernel/machine_kexec_file.c | 10 ++++++++-- arch/arm64/mm/init.c | 2 +- drivers/of/kexec.c | 2 ++ kernel/kexec_file.c | 2 ++ 6 files changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h index 9ac9572a3bbe..4d9cc7a76d9c 100644 --- a/arch/arm64/include/asm/kexec.h +++ b/arch/arm64/include/asm/kexec.h @@ -80,7 +80,7 @@ static inline void crash_setup_regs(struct pt_regs *newregs, } } -#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_HIBERNATION) +#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_HIBERNATION) extern bool crash_is_nosave(unsigned long pfn); extern void crash_prepare_suspend(void); extern void crash_post_resume(void); diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index b38aae5b488d..82e2203d86a3 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -255,7 +255,7 @@ void machine_crash_shutdown(struct pt_regs *regs) pr_info("Starting crashdump kernel...\n"); } -#ifdef CONFIG_HIBERNATION +#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_HIBERNATION) /* * To preserve the crash dump kernel image, the relevant memory segments * should be mapped again around the hibernation. diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index 0e017358f4ba..af1ca875c52c 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -39,6 +39,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) return kexec_image_post_load_cleanup_default(image); } +#ifdef CONFIG_CRASH_DUMP static int prepare_elf_headers(void **addr, unsigned long *sz) { struct crash_mem *cmem; @@ -80,6 +81,7 @@ out: kfree(cmem); return ret; } +#endif /* * Tries to add the initrd and DTB to the image. If it is not possible to find @@ -93,8 +95,8 @@ int load_other_segments(struct kimage *image, char *cmdline) { struct kexec_buf kbuf; - void *headers, *dtb = NULL; - unsigned long headers_sz, initrd_load_addr = 0, dtb_len, + void *dtb = NULL; + unsigned long initrd_load_addr = 0, dtb_len, orig_segments = image->nr_segments; int ret = 0; @@ -102,7 +104,10 @@ int load_other_segments(struct kimage *image, /* not allocate anything below the kernel */ kbuf.buf_min = kernel_load_addr + kernel_size; +#ifdef CONFIG_CRASH_DUMP /* load elf core header */ + void *headers; + unsigned long headers_sz; if (image->type == KEXEC_TYPE_CRASH) { ret = prepare_elf_headers(&headers, &headers_sz); if (ret) { @@ -130,6 +135,7 @@ int load_other_segments(struct kimage *image, kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", image->elf_load_addr, kbuf.bufsz, kbuf.memsz); } +#endif /* load initrd */ if (initrd) { diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 74c1db8ce271..c1f6213e77f3 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -100,7 +100,7 @@ static void __init arch_reserve_crashkernel(void) bool high = false; int ret; - if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c index 68278340cecf..9ccde2fd77cb 100644 --- a/drivers/of/kexec.c +++ b/drivers/of/kexec.c @@ -395,6 +395,7 @@ void *of_kexec_alloc_and_setup_fdt(const struct kimage *image, if (ret) goto out; +#ifdef CONFIG_CRASH_DUMP /* add linux,usable-memory-range */ ret = fdt_appendprop_addrrange(fdt, 0, chosen_node, "linux,usable-memory-range", crashk_res.start, @@ -410,6 +411,7 @@ void *of_kexec_alloc_and_setup_fdt(const struct kimage *image, if (ret) goto out; } +#endif } /* add bootargs */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index ce7ce2ae27cd..2d1db05fbf04 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -540,8 +540,10 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, phys_addr_t mstart, mend; struct resource res = { }; +#ifdef CONFIG_CRASH_DUMP if (kbuf->image->type == KEXEC_TYPE_CRASH) return func(&crashk_res, kbuf); +#endif /* * Using MEMBLOCK_NONE will properly skip MEMBLOCK_DRIVER_MANAGED. See From 086d67ef33ecd07ef58f271bb35c1efd20c5acbf Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:48 +0800 Subject: [PATCH 261/435] ppc, crash: enforce KEXEC and KEXEC_FILE to select CRASH_DUMP In PowerPC, the crash dumping and kexec reboot share code in arch_kexec_locate_mem_hole(), in which struct crash_mem is used. Here enfoce enforce KEXEC and KEXEC_FILE to select CRASH_DUMP for now. [bhe@redhat.com: fix allnoconfig on ppc] Link: https://lkml.kernel.org/r/ZbJwMyCpz4HDySoo@MiWiFi-R3L-srv Link: https://lkml.kernel.org/r/20240124051254.67105-9-bhe@redhat.com Signed-off-by: Baoquan He Acked-by: Hari Bathini Cc: Al Viro Cc: Eric W. Biederman Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a9efaf87966d..c3b44eba0533 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -608,6 +608,11 @@ config PPC64_SUPPORTS_MEMORY_FAILURE config ARCH_SUPPORTS_KEXEC def_bool PPC_BOOK3S || PPC_E500 || (44x && !SMP) +config ARCH_SELECTS_KEXEC + def_bool y + depends on KEXEC + select CRASH_DUMP + config ARCH_SUPPORTS_KEXEC_FILE def_bool PPC64 @@ -618,6 +623,7 @@ config ARCH_SELECTS_KEXEC_FILE def_bool y depends on KEXEC_FILE select KEXEC_ELF + select CRASH_DUMP select HAVE_IMA_KEXEC if IMA config PPC64_BIG_ENDIAN_ELF_ABI_V2 From 865e2acd3eb971026a29e8b03443847a9ffb7d51 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:49 +0800 Subject: [PATCH 262/435] s390, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on s390 with some adjustments. Here wrap up crash dumping codes with CONFIG_CRASH_DUMP ifdeffery. Link: https://lkml.kernel.org/r/20240124051254.67105-10-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/s390/kernel/kexec_elf.c | 2 ++ arch/s390/kernel/kexec_image.c | 2 ++ arch/s390/kernel/machine_kexec_file.c | 10 ++++++++++ 3 files changed, 14 insertions(+) diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 9da6fa30c447..4d364de43799 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -40,8 +40,10 @@ static int kexec_file_add_kernel_elf(struct kimage *image, buf.bufsz = phdr->p_filesz; buf.mem = ALIGN(phdr->p_paddr, phdr->p_align); +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif buf.memsz = phdr->p_memsz; data->memsz = ALIGN(data->memsz, phdr->p_align) + buf.memsz; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index af23eff5774d..a32ce8bea745 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -24,8 +24,10 @@ static int kexec_file_add_kernel_image(struct kimage *image, buf.bufsz = image->kernel_buf_len; buf.mem = 0; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif buf.memsz = buf.bufsz; data->kernel_buf = image->kernel_buf; diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index 8d207b82d9fe..c2bac14dd668 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -105,6 +105,7 @@ static int kexec_file_update_purgatory(struct kimage *image, if (ret) return ret; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) { u64 crash_size; @@ -121,6 +122,7 @@ static int kexec_file_update_purgatory(struct kimage *image, sizeof(crash_size), false); } +#endif return ret; } @@ -134,8 +136,10 @@ static int kexec_file_add_purgatory(struct kimage *image, data->memsz = ALIGN(data->memsz, PAGE_SIZE); buf.mem = data->memsz; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif ret = kexec_load_purgatory(image, &buf); if (ret) @@ -158,8 +162,10 @@ static int kexec_file_add_initrd(struct kimage *image, data->memsz = ALIGN(data->memsz, PAGE_SIZE); buf.mem = data->memsz; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif buf.memsz = buf.bufsz; data->parm->initrd_start = data->memsz; @@ -223,8 +229,10 @@ static int kexec_file_add_ipl_report(struct kimage *image, data->kernel_buf + offsetof(struct lowcore, ipl_parmblock_ptr); *lc_ipl_parmblock_ptr = (__u32)buf.mem; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif ret = kexec_add_buffer(&buf); out: @@ -268,10 +276,12 @@ void *kexec_file_add_components(struct kimage *image, memcpy(data.parm->command_line, image->cmdline_buf, image->cmdline_buf_len); +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) { data.parm->oldmem_base = crashk_res.start; data.parm->oldmem_size = crashk_res.end - crashk_res.start + 1; } +#endif if (image->initrd_buf) { ret = kexec_file_add_initrd(image, &data); From e389263561d8b57cbefc3f48ded821c99594e1de Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:50 +0800 Subject: [PATCH 263/435] sh, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on SuperH with some adjustments. Wrap up crash dumping codes with CONFIG_CRASH_DUMP ifdeffery, and use IS_ENABLED(CONFIG_CRASH_RESERVE) check to decide if compiling in the crashkernel reservation code. Link: https://lkml.kernel.org/r/20240124051254.67105-11-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/sh/kernel/machine_kexec.c | 3 +++ arch/sh/kernel/setup.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c index fa3a7b36190a..8daa8a6e6fa6 100644 --- a/arch/sh/kernel/machine_kexec.c +++ b/arch/sh/kernel/machine_kexec.c @@ -153,6 +153,9 @@ void __init reserve_crashkernel(void) unsigned long long crash_size, crash_base; int ret; + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) + return; + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base, NULL, NULL); if (ret == 0 && crash_size > 0) { diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index d3175f09b3aa..620e5cf8ae1e 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -220,7 +220,7 @@ void __init __add_active_range(unsigned int nid, unsigned long start_pfn, request_resource(res, &code_resource); request_resource(res, &data_resource); request_resource(res, &bss_resource); -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_RESERVE request_resource(res, &crashk_res); #endif From d739f190c035230d535ef9ce34a3789c8d5929fd Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:51 +0800 Subject: [PATCH 264/435] mips, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on mips with some adjustments. Here use IS_ENABLED(CONFIG_CRASH_RESERVE) check to decide if compiling in the crashkernel reservation code. Link: https://lkml.kernel.org/r/20240124051254.67105-12-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/mips/kernel/setup.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 9c30de151597..12a1a4ffb602 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -442,8 +442,6 @@ static void __init mips_reserve_vmcore(void) #endif } -#ifdef CONFIG_KEXEC - /* 64M alignment for crash kernel regions */ #define CRASH_ALIGN SZ_64M #define CRASH_ADDR_MAX SZ_512M @@ -454,6 +452,9 @@ static void __init mips_parse_crashkernel(void) unsigned long long crash_size, crash_base; int ret; + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) + return; + total_mem = memblock_phys_mem_size(); ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base, @@ -489,6 +490,9 @@ static void __init request_crashkernel(struct resource *res) { int ret; + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) + return; + if (crashk_res.start == crashk_res.end) return; @@ -498,15 +502,6 @@ static void __init request_crashkernel(struct resource *res) (unsigned long)(resource_size(&crashk_res) >> 20), (unsigned long)(crashk_res.start >> 20)); } -#else /* !defined(CONFIG_KEXEC) */ -static void __init mips_parse_crashkernel(void) -{ -} - -static void __init request_crashkernel(struct resource *res) -{ -} -#endif /* !defined(CONFIG_KEXEC) */ static void __init check_kernel_sections_mem(void) { From 0978a63f9c8b807f073dd23c6116958d043194d3 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:52 +0800 Subject: [PATCH 265/435] riscv, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on risc-v with some adjustments. Here wrap up crash dumping codes with CONFIG_CRASH_DUMP ifdeffery, and use IS_ENABLED(CONFIG_CRASH_RESERVE) check to decide if compiling in the crashkernel reservation code. Link: https://lkml.kernel.org/r/20240124051254.67105-13-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/riscv/kernel/elf_kexec.c | 9 +++++++-- arch/riscv/mm/init.c | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c index 5bd1ec3341fe..54260c16f991 100644 --- a/arch/riscv/kernel/elf_kexec.c +++ b/arch/riscv/kernel/elf_kexec.c @@ -117,6 +117,7 @@ static int elf_find_pbase(struct kimage *image, unsigned long kernel_len, return ret; } +#ifdef CONFIG_CRASH_DUMP static int get_nr_ram_ranges_callback(struct resource *res, void *arg) { unsigned int *nr_ranges = arg; @@ -189,6 +190,7 @@ static char *setup_kdump_cmdline(struct kimage *image, char *cmdline, cmdline_ptr[COMMAND_LINE_SIZE - 1] = '\0'; return cmdline_ptr; } +#endif static void *elf_kexec_load(struct kimage *image, char *kernel_buf, unsigned long kernel_len, char *initrd, @@ -196,12 +198,11 @@ static void *elf_kexec_load(struct kimage *image, char *kernel_buf, unsigned long cmdline_len) { int ret; + void *fdt; unsigned long old_kernel_pbase = ULONG_MAX; unsigned long new_kernel_pbase = 0UL; unsigned long initrd_pbase = 0UL; - unsigned long headers_sz; unsigned long kernel_start; - void *fdt, *headers; struct elfhdr ehdr; struct kexec_buf kbuf; struct kexec_elf_info elf_info; @@ -227,8 +228,11 @@ static void *elf_kexec_load(struct kimage *image, char *kernel_buf, kbuf.buf_min = new_kernel_pbase + kernel_len; kbuf.buf_max = ULONG_MAX; +#ifdef CONFIG_CRASH_DUMP /* Add elfcorehdr */ if (image->type == KEXEC_TYPE_CRASH) { + void *headers; + unsigned long headers_sz; ret = prepare_elf_headers(&headers, &headers_sz); if (ret) { pr_err("Preparing elf core header failed\n"); @@ -264,6 +268,7 @@ static void *elf_kexec_load(struct kimage *image, char *kernel_buf, } cmdline = modified_cmdline; } +#endif #ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY /* Add purgatory to the image */ diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index a4f218cfb845..b5ffb2ef54ad 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1355,7 +1355,7 @@ static void __init arch_reserve_crashkernel(void) bool high = false; int ret; - if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), From 5057dff3cf80f631ef7ce6cace2929377405cb0f Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:53 +0800 Subject: [PATCH 266/435] arm, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on arm with some adjustments. Here use CONFIG_CRASH_RESERVE ifdef to replace CONFIG_KEXEC ifdef. Link: https://lkml.kernel.org/r/20240124051254.67105-14-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/arm/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index ff2299ce1ad7..7b33b157fca0 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -979,7 +979,7 @@ static int __init init_machine_late(void) } late_initcall(init_machine_late); -#ifdef CONFIG_KEXEC +#ifdef CONFIG_CRASH_RESERVE /* * The crash region must be aligned to 128MB to avoid * zImage relocating below the reserved region. @@ -1066,7 +1066,7 @@ static void __init reserve_crashkernel(void) } #else static inline void reserve_crashkernel(void) {} -#endif /* CONFIG_KEXEC */ +#endif /* CONFIG_CRASH_RESERVE*/ void __init hyp_mode_check(void) { From ea034d0b07441cd4f4411cb627de6f550102f0d6 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:54 +0800 Subject: [PATCH 267/435] loongarch, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on loongarch with some adjustments. Here use IS_ENABLED(CONFIG_CRASH_RESERVE) check to decide if compiling in the crashkernel reservation code. Link: https://lkml.kernel.org/r/20240124051254.67105-15-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/loongarch/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index edf2bba80130..57d37dd9f964 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -260,7 +260,7 @@ static void __init arch_reserve_crashkernel(void) char *cmdline = boot_command_line; bool high = false; - if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), From 199da8714c8f4c45f46509c5eb188f5551404574 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 29 Jan 2024 21:50:33 +0800 Subject: [PATCH 268/435] arch, crash: move arch_crash_save_vmcoreinfo() out to file vmcore_info.c Nathan reported below building error: ===== $ curl -LSso .config https://git.alpinelinux.org/aports/plain/community/linux-edge/config-edge.armv7 $ make -skj"$(nproc)" ARCH=arm CROSS_COMPILE=arm-linux-gnueabi- olddefconfig all .. arm-linux-gnueabi-ld: arch/arm/kernel/machine_kexec.o: in function `arch_crash_save_vmcoreinfo': machine_kexec.c:(.text+0x488): undefined reference to `vmcoreinfo_append_str' ==== On architecutres, like arm, s390, ppc, sh, function arch_crash_save_vmcoreinfo() is located in machine_kexec.c and it can only be compiled in when CONFIG_KEXEC_CORE=y. That's not right because arch_crash_save_vmcoreinfo() is used to export arch specific vmcoreinfo. CONFIG_VMCORE_INFO is supposed to control its compiling in. However, CONFIG_VMVCORE_INFO could be independent of CONFIG_KEXEC_CORE, e.g CONFIG_PROC_KCORE=y will select CONFIG_VMVCORE_INFO. Or CONFIG_KEXEC/CONFIG_KEXEC_FILE is set while CONFIG_CRASH_DUMP is not set, it will report linking error. So, on arm, s390, ppc and sh, move arch_crash_save_vmcoreinfo out to a new file vmcore_info.c. Let CONFIG_VMCORE_INFO decide if compiling in arch_crash_save_vmcoreinfo(). [akpm@linux-foundation.org: remove stray newlines at eof] Link: https://lkml.kernel.org/r/20240129135033.157195-3-bhe@redhat.com Signed-off-by: Baoquan He Reported-by: Nathan Chancellor Closes: https://lore.kernel.org/all/20240126045551.GA126645@dev-arch.thelio-3990X/T/#u Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Klara Modin Cc: Michael Kelley Cc: Pingfan Liu Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/arm/kernel/Makefile | 1 + arch/arm/kernel/machine_kexec.c | 7 ------- arch/arm/kernel/vmcore_info.c | 10 ++++++++++ arch/powerpc/kexec/Makefile | 1 + arch/powerpc/kexec/core.c | 28 ---------------------------- arch/powerpc/kexec/vmcore_info.c | 32 ++++++++++++++++++++++++++++++++ arch/s390/kernel/Makefile | 1 + arch/s390/kernel/machine_kexec.c | 15 --------------- arch/s390/kernel/vmcore_info.c | 21 +++++++++++++++++++++ arch/sh/kernel/Makefile | 1 + arch/sh/kernel/machine_kexec.c | 11 ----------- arch/sh/kernel/vmcore_info.c | 15 +++++++++++++++ 12 files changed, 82 insertions(+), 61 deletions(-) create mode 100644 arch/arm/kernel/vmcore_info.c create mode 100644 arch/powerpc/kexec/vmcore_info.c create mode 100644 arch/s390/kernel/vmcore_info.c create mode 100644 arch/sh/kernel/vmcore_info.c diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 771264d4726a..6a9de826ffd3 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -60,6 +60,7 @@ obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o insn.o patch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o insn.o patch.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o insn.o patch.o obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o # Main staffs in KPROBES are in arch/arm/probes/ . obj-$(CONFIG_KPROBES) += patch.o insn.o obj-$(CONFIG_OABI_COMPAT) += sys_oabi-compat.o diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c index 5d07cf9e0044..80ceb5bd2680 100644 --- a/arch/arm/kernel/machine_kexec.c +++ b/arch/arm/kernel/machine_kexec.c @@ -198,10 +198,3 @@ void machine_kexec(struct kimage *image) soft_restart(reboot_entry_phys); } - -void arch_crash_save_vmcoreinfo(void) -{ -#ifdef CONFIG_ARM_LPAE - VMCOREINFO_CONFIG(ARM_LPAE); -#endif -} diff --git a/arch/arm/kernel/vmcore_info.c b/arch/arm/kernel/vmcore_info.c new file mode 100644 index 000000000000..1437aba47787 --- /dev/null +++ b/arch/arm/kernel/vmcore_info.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include + +void arch_crash_save_vmcoreinfo(void) +{ +#ifdef CONFIG_ARM_LPAE + VMCOREINFO_CONFIG(ARM_LPAE); +#endif +} diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile index 0c2abe7f9908..91e96f5168b7 100644 --- a/arch/powerpc/kexec/Makefile +++ b/arch/powerpc/kexec/Makefile @@ -8,6 +8,7 @@ obj-y += core.o crash.o core_$(BITS).o obj-$(CONFIG_PPC32) += relocate_32.o obj-$(CONFIG_KEXEC_FILE) += file_load.o ranges.o file_load_$(BITS).o elf_$(BITS).o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o # Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_core_$(BITS).o := n diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c index 27fa9098a5b7..3ff4411ed496 100644 --- a/arch/powerpc/kexec/core.c +++ b/arch/powerpc/kexec/core.c @@ -53,34 +53,6 @@ void machine_kexec_cleanup(struct kimage *image) { } -void arch_crash_save_vmcoreinfo(void) -{ - -#ifdef CONFIG_NUMA - VMCOREINFO_SYMBOL(node_data); - VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); -#endif -#ifndef CONFIG_NUMA - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP) - VMCOREINFO_SYMBOL(vmemmap_list); - VMCOREINFO_SYMBOL(mmu_vmemmap_psize); - VMCOREINFO_SYMBOL(mmu_psize_defs); - VMCOREINFO_STRUCT_SIZE(vmemmap_backing); - VMCOREINFO_OFFSET(vmemmap_backing, list); - VMCOREINFO_OFFSET(vmemmap_backing, phys); - VMCOREINFO_OFFSET(vmemmap_backing, virt_addr); - VMCOREINFO_STRUCT_SIZE(mmu_psize_def); - VMCOREINFO_OFFSET(mmu_psize_def, shift); -#endif - VMCOREINFO_SYMBOL(cur_cpu_spec); - VMCOREINFO_OFFSET(cpu_spec, cpu_features); - VMCOREINFO_OFFSET(cpu_spec, mmu_features); - vmcoreinfo_append_str("NUMBER(RADIX_MMU)=%d\n", early_radix_enabled()); - vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); -} - /* * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. diff --git a/arch/powerpc/kexec/vmcore_info.c b/arch/powerpc/kexec/vmcore_info.c new file mode 100644 index 000000000000..2b65d2adca5e --- /dev/null +++ b/arch/powerpc/kexec/vmcore_info.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +void arch_crash_save_vmcoreinfo(void) +{ + +#ifdef CONFIG_NUMA + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif +#ifndef CONFIG_NUMA + VMCOREINFO_SYMBOL(contig_page_data); +#endif +#if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP) + VMCOREINFO_SYMBOL(vmemmap_list); + VMCOREINFO_SYMBOL(mmu_vmemmap_psize); + VMCOREINFO_SYMBOL(mmu_psize_defs); + VMCOREINFO_STRUCT_SIZE(vmemmap_backing); + VMCOREINFO_OFFSET(vmemmap_backing, list); + VMCOREINFO_OFFSET(vmemmap_backing, phys); + VMCOREINFO_OFFSET(vmemmap_backing, virt_addr); + VMCOREINFO_STRUCT_SIZE(mmu_psize_def); + VMCOREINFO_OFFSET(mmu_psize_def, shift); +#endif + VMCOREINFO_SYMBOL(cur_cpu_spec); + VMCOREINFO_OFFSET(cpu_spec, cpu_features); + VMCOREINFO_OFFSET(cpu_spec, mmu_features); + vmcoreinfo_append_str("NUMBER(RADIX_MMU)=%d\n", early_radix_enabled()); + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); +} diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 7a562b4199c8..fa029d0dc28f 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -64,6 +64,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o obj-$(CONFIG_FUNCTION_TRACER) += mcount.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index aa22ffc16bcd..10277a460204 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -209,21 +209,6 @@ void machine_kexec_cleanup(struct kimage *image) { } -void arch_crash_save_vmcoreinfo(void) -{ - struct lowcore *abs_lc; - - VMCOREINFO_SYMBOL(lowcore_ptr); - VMCOREINFO_SYMBOL(high_memory); - VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); - vmcoreinfo_append_str("SAMODE31=%lx\n", (unsigned long)__samode31); - vmcoreinfo_append_str("EAMODE31=%lx\n", (unsigned long)__eamode31); - vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); - abs_lc = get_abs_lowcore(); - abs_lc->vmcore_info = paddr_vmcoreinfo_note(); - put_abs_lowcore(abs_lc); -} - void machine_shutdown(void) { } diff --git a/arch/s390/kernel/vmcore_info.c b/arch/s390/kernel/vmcore_info.c new file mode 100644 index 000000000000..d296dfc22191 --- /dev/null +++ b/arch/s390/kernel/vmcore_info.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +void arch_crash_save_vmcoreinfo(void) +{ + struct lowcore *abs_lc; + + VMCOREINFO_SYMBOL(lowcore_ptr); + VMCOREINFO_SYMBOL(high_memory); + VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); + vmcoreinfo_append_str("SAMODE31=%lx\n", (unsigned long)__samode31); + vmcoreinfo_append_str("EAMODE31=%lx\n", (unsigned long)__eamode31); + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + abs_lc = get_abs_lowcore(); + abs_lc->vmcore_info = paddr_vmcoreinfo_note(); + put_abs_lowcore(abs_lc); +} diff --git a/arch/sh/kernel/Makefile b/arch/sh/kernel/Makefile index 2d7e70537de0..ba917008d63e 100644 --- a/arch/sh/kernel/Makefile +++ b/arch/sh/kernel/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_SH_STANDARD_BIOS) += sh_bios.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_MODULES) += sh_ksyms_32.o module.o obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_IO_TRAPPED) += io_trapped.o diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c index 8daa8a6e6fa6..8321b31d2e19 100644 --- a/arch/sh/kernel/machine_kexec.c +++ b/arch/sh/kernel/machine_kexec.c @@ -137,17 +137,6 @@ void machine_kexec(struct kimage *image) __ftrace_enabled_restore(save_ftrace_enabled); } -void arch_crash_save_vmcoreinfo(void) -{ -#ifdef CONFIG_NUMA - VMCOREINFO_SYMBOL(node_data); - VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); -#endif -#ifdef CONFIG_X2TLB - VMCOREINFO_CONFIG(X2TLB); -#endif -} - void __init reserve_crashkernel(void) { unsigned long long crash_size, crash_base; diff --git a/arch/sh/kernel/vmcore_info.c b/arch/sh/kernel/vmcore_info.c new file mode 100644 index 000000000000..a244a204a1b1 --- /dev/null +++ b/arch/sh/kernel/vmcore_info.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +void arch_crash_save_vmcoreinfo(void) +{ +#ifdef CONFIG_NUMA + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif +#ifdef CONFIG_X2TLB + VMCOREINFO_CONFIG(X2TLB); +#endif +} From b659a7c2cec6e904ef52dc9deacea3e90d0cb168 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Thu, 8 Feb 2024 07:57:27 +0200 Subject: [PATCH 269/435] MAINTAINERS: update mm and memcg entries Add F: lines for memory management and memory cgroup include files. Link: https://lkml.kernel.org/r/20240208055727.142387-1-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- MAINTAINERS | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index f3f5981ced29..129a237b7880 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5383,6 +5383,7 @@ R: Muchun Song L: cgroups@vger.kernel.org L: linux-mm@kvack.org S: Maintained +F: include/linux/memcontrol.h F: mm/memcontrol.c F: mm/swap_cgroup.c F: samples/cgroup/* @@ -14101,15 +14102,24 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm T: quilt git://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new F: include/linux/gfp.h F: include/linux/gfp_types.h +F: include/linux/memfd.h +F: include/linux/memory.h F: include/linux/memory_hotplug.h +F: include/linux/memory-tiers.h +F: include/linux/mempolicy.h +F: include/linux/mempool.h +F: include/linux/memremap.h F: include/linux/mm.h +F: include/linux/mm_*.h F: include/linux/mmzone.h +F: include/linux/mmu_notifier.h F: include/linux/pagewalk.h F: include/linux/rmap.h F: include/trace/events/ksm.h F: mm/ F: tools/mm/ F: tools/testing/selftests/mm/ +N: include/linux/page[-_]* MEMORY MAPPING M: Andrew Morton From 879c6000e191b61b97e17bce44c4564ee42eb612 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Mon, 29 Jan 2024 13:45:51 +0800 Subject: [PATCH 270/435] mm/khugepaged: bypassing unnecessary scans with MMF_DISABLE_THP check khugepaged scans the entire address space in the background for each given mm, looking for opportunities to merge sequences of basic pages into huge pages. However, when an mm is inserted to the mm_slots list, and the MMF_DISABLE_THP flag is set later, this scanning process becomes unnecessary for that mm and can be skipped to avoid redundant operations, especially in scenarios with a large address space. On an Intel Core i5 CPU, the time taken by khugepaged to scan the address space of the process, which has been set with the MMF_DISABLE_THP flag after being added to the mm_slots list, is as follows (shorter is better): VMA Count | Old | New | Change --------------------------------------- 50 | 23us | 9us | -60.9% 100 | 32us | 9us | -71.9% 200 | 44us | 9us | -79.5% 400 | 75us | 9us | -88.0% 800 | 98us | 9us | -90.8% Once the count of VMAs for the process exceeds page_to_scan, khugepaged needs to wait for scan_sleep_millisecs ms before scanning the next process. IMO, unnecessary scans could actually be skipped with a very inexpensive mm->flags check in this case. This commit introduces a check before each scanning process to test the MMF_DISABLE_THP flag for the given mm; if the flag is set, the scanning process is bypassed, thereby improving the efficiency of khugepaged. This optimization is not a correctness issue but rather an enhancement to save expensive checks on each VMA when userspace cannot prctl itself before spawning into the new process. On some servers within our company, we deploy a daemon responsible for monitoring and updating local applications. Some applications prefer not to use THP, so the daemon calls prctl to disable THP before fork/exec. Conversely, for other applications, the daemon calls prctl to enable THP before fork/exec. Ideally, the daemon should invoke prctl after the fork, but its current implementation follows the described approach. In the Go standard library, there is no direct encapsulation of the fork system call; instead, fork and execve are combined into one through syscall.ForkExec. Link: https://lkml.kernel.org/r/20240129054551.57728-1-ioworker0@gmail.com Signed-off-by: Lance Yang Acked-by: David Hildenbrand Cc: Michal Hocko Cc: Minchan Kim Cc: Muchun Song Cc: Peter Xu Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/khugepaged.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index fe43fbc44525..2771fc043b3b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -410,6 +410,12 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) +{ + return hpage_collapse_test_exit(mm) || + test_bit(MMF_DISABLE_THP, &mm->flags); +} + void __khugepaged_enter(struct mm_struct *mm) { struct khugepaged_mm_slot *mm_slot; @@ -1422,7 +1428,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) lockdep_assert_held(&khugepaged_mm_lock); - if (hpage_collapse_test_exit(mm)) { + if (hpage_collapse_test_exit_or_disable(mm)) { /* free mm_slot */ hash_del(&slot->hash); list_del(&slot->mm_node); @@ -2360,7 +2366,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, goto breakouterloop_mmap_lock; progress++; - if (unlikely(hpage_collapse_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) goto breakouterloop; vma_iter_init(&vmi, mm, khugepaged_scan.address); @@ -2368,7 +2374,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, unsigned long hstart, hend; cond_resched(); - if (unlikely(hpage_collapse_test_exit(mm))) { + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) { progress++; break; } @@ -2390,7 +2396,7 @@ skip: bool mmap_locked = true; cond_resched(); - if (unlikely(hpage_collapse_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) goto breakouterloop; VM_BUG_ON(khugepaged_scan.address < hstart || @@ -2408,7 +2414,7 @@ skip: fput(file); if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { mmap_read_lock(mm); - if (hpage_collapse_test_exit(mm)) + if (hpage_collapse_test_exit_or_disable(mm)) goto breakouterloop; *result = collapse_pte_mapped_thp(mm, khugepaged_scan.address, false); @@ -2450,7 +2456,7 @@ breakouterloop_mmap_lock: * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. */ - if (hpage_collapse_test_exit(mm) || !vma) { + if (hpage_collapse_test_exit_or_disable(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find From 78f2f60377ee43b7f27b4584dae754b8aa3f80e1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:12 -0800 Subject: [PATCH 271/435] mm/damon/core: set damos_quota->esz as public field and document Patch series "mm/damon: let DAMOS feeds and tame/auto-tune itself". The Aim-oriented Feedback-driven DAMOS Aggressiveness Auto-tuning patchset[1] which has merged since commit 9294a037c015 ("mm/damon/core: implement goal-oriented feedback-driven quota auto-tuning") made the mechanism and the policy separated. That is, users can set a part of DAMOS control policies without a deep understanding of the mechanism but just their demands such as SLA. However, users are still required to do some additional work of manually collecting their target metric and feeding it to DAMOS. In the case of end-users who use DAMON sysfs interface, the context switches between user-space and kernel-space could also make it inefficient. The overhead is supposed to be only trivial in common cases, though. Meanwhile, in simple use cases, the target metric could be common system metrics that the kernel can efficiently self-retrieve, such as memory pressure stall time (PSI). Extend DAMOS quota auto-tuning to support multiple types of metrics including the DAMOS self-retrievable ones, and add support for memory pressure stall time metric. Different types of metrics can be supported in future. The auto-tuning capability is currently supported for only users of DAMOS kernel API and DAMON sysfs interface. Extend the support to DAMON_RECLAIM. Patches Sequence ================ First five patches are for helping debugging and fine-tuning existing quota control features. The first one (patch 1) exposes the effective quota that is made with given user inputs to DAMOS kernel API users and kernel-doc documents. Following four patches implement (patches 1, 2 and 3) and document (patches 4 and 5) a new DAMON sysfs file that exposes the value. Following six patches cleanup and simplify the existing DAMOS quota auto-tuning code by improving layout of comments and data structures (patches 6 and 7), supporting common use cases, namely multiple goals (patches 8, 9 and 10), and simplifying the interface (patch 11). Then six patches for the main purpose of this patchset follow. The first three changes extend the core logic for various target metrics (patch 12), implement memory pressure stall time-based target metric support (patch 13), and update DAMON sysfs interface to support the new target metric (patch 14). Then, documentation updates for the features on design (patch 15), ABI (patch 16), and usage (patch 17) follow. Last three patches add auto-tuning support on DAMON_RECLAIM. The patches implement DAMON_RECLAIM parameters for user-feedback driven quota auto-tuning (patch 18), memory pressure stall time-driven quota self-tuning (patch 19), and finally update the DAMON_RECLAIM usage document for the new parameters (patch 20). [1] https://lore.kernel.org/all/20231130023652.50284-1-sj@kernel.org/ This patch (of 20): DAMOS allow users to specify the quota as they want in multiple ways including time quota, size quota, and feedback-based auto-tuning. DAMOS makes one effective quota out of the inputs and use it at the end. Knowing the current effective quota helps understanding DAMOS' internal mechanism and fine-tuning quotas. DAMON kernel API users can get the information from ->esz field of damos_quota struct, but the field is marked as private purpose, and not kernel-doc documented. Make it public and document. Link: https://lkml.kernel.org/r/20240219194431.159606-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240219194431.159606-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++-- mm/damon/core.c | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 5881e4ac30be..93ef45b87b9c 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -138,6 +138,7 @@ enum damos_action { * * @get_score: Feedback function for self-tuning quota. * @get_score_arg: Parameter for @get_score + * @esz: Effective size quota in bytes. * * To avoid consuming too much CPU time or IO resources for applying the * &struct damos->action to large memory, DAMON allows users to set time and/or @@ -167,6 +168,8 @@ enum damos_action { * tuning is getting the feedback screo value of 10,000. If @ms and/or @sz are * set together, those work as a hard limit quota. If neither @ms nor @sz are * set, the mechanism starts from the quota of one byte. + * + * The resulting effective size quota in bytes is set to @esz. */ struct damos_quota { unsigned long ms; @@ -179,14 +182,13 @@ struct damos_quota { unsigned long (*get_score)(void *arg); void *get_score_arg; + unsigned long esz; /* private: */ /* For throughput estimation */ unsigned long total_charged_sz; unsigned long total_charged_ns; - unsigned long esz; /* Effective size quota in bytes */ - /* For charging the quota */ unsigned long charged_sz; unsigned long charged_from; diff --git a/mm/damon/core.c b/mm/damon/core.c index 5b325749fc12..0656966a6fc4 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -299,12 +299,12 @@ void damos_destroy_filter(struct damos_filter *f) damos_free_filter(f); } -/* initialize private fields of damos_quota and return the pointer */ -static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota) +/* initialize fields of @quota that normally API users wouldn't set */ +static struct damos_quota *damos_quota_init(struct damos_quota *quota) { + quota->esz = 0; quota->total_charged_sz = 0; quota->total_charged_ns = 0; - quota->esz = 0; quota->charged_sz = 0; quota->charged_from = 0; quota->charge_target_from = NULL; @@ -336,7 +336,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); - scheme->quota = *(damos_quota_init_priv(quota)); + scheme->quota = *(damos_quota_init(quota)); scheme->wmarks = *wmarks; scheme->wmarks.activated = true; From 6813131578ecd830c3be9b8a26bc62a2d2027610 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:13 -0800 Subject: [PATCH 272/435] mm/damon/sysfs-schemes: implement quota effective_bytes file DAMON sysfs interface allows users to set two types of quotas, namely time quota and size quota. DAMOS converts time quota to a size quota and use smaller one among the resulting two size quotas. The resulting effective size quota can be helpful for debugging and analysis, but not exposed to the user. The recently added feedback-driven quota auto-tuning is making it even more mysterious. Implement a DAMON sysfs interface read-only empty file, namely 'effective_bytes', under the quota goal DAMON sysfs directory. It will be extended to expose the effective quota to the end user. Link: https://lkml.kernel.org/r/20240219194431.159606-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index f6c7f43f06cc..dd46b2db5455 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1139,6 +1139,7 @@ struct damon_sysfs_quotas { unsigned long ms; unsigned long sz; unsigned long reset_interval_ms; + unsigned long effective_sz; /* Effective size quota in bytes */ }; static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) @@ -1252,6 +1253,15 @@ static ssize_t reset_interval_ms_store(struct kobject *kobj, return count; } +static ssize_t effective_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->effective_sz); +} + static void damon_sysfs_quotas_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); @@ -1266,10 +1276,14 @@ static struct kobj_attribute damon_sysfs_quotas_sz_attr = static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr = __ATTR_RW_MODE(reset_interval_ms, 0600); +static struct kobj_attribute damon_sysfs_quotas_effective_bytes_attr = + __ATTR_RO_MODE(effective_bytes, 0400); + static struct attribute *damon_sysfs_quotas_attrs[] = { &damon_sysfs_quotas_ms_attr.attr, &damon_sysfs_quotas_sz_attr.attr, &damon_sysfs_quotas_reset_interval_ms_attr.attr, + &damon_sysfs_quotas_effective_bytes_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_quotas); From c71f8a710c39f8138bdd6abcd922789ebffa2de3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:14 -0800 Subject: [PATCH 273/435] mm/damon/sysfs: implement a kdamond command for updating schemes' effective quotas Implement yet another kdamond 'state' file input command, namely 'update_schemes_effective_quotas'. If it is written, the 'effective_bytes' files of the kdamond will be updated to provide the current effective size quota of each scheme in bytes. Link: https://lkml.kernel.org/r/20240219194431.159606-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.h | 4 ++++ mm/damon/sysfs-schemes.c | 20 ++++++++++++++++++++ mm/damon/sysfs.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index ec0703e1e90b..5a1ac15fb2f8 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -61,3 +61,7 @@ int damon_sysfs_schemes_clear_regions( void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); + +void damos_sysfs_update_effective_quotas( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index dd46b2db5455..9d90e7b757b7 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1930,6 +1930,26 @@ void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, } } +void damos_sysfs_update_effective_quotas( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + struct damos *scheme; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_quotas *sysfs_quotas; + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + + sysfs_quotas = + sysfs_schemes->schemes_arr[schemes_idx++]->quotas; + sysfs_quotas->effective_sz = scheme->quota.esz; + } +} + static struct damos *damon_sysfs_mk_scheme( struct damon_sysfs_scheme *sysfs_scheme) { diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 678de97fcc88..cc2d88a901f4 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1019,6 +1019,11 @@ enum damon_sysfs_cmd { * regions */ DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS, + /* + * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS: Update the + * effective size quota of the scheme in bytes. + */ + DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS, /* * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands. */ @@ -1035,6 +1040,7 @@ static const char * const damon_sysfs_cmd_strs[] = { "update_schemes_tried_bytes", "update_schemes_tried_regions", "clear_schemes_tried_regions", + "update_schemes_effective_quotas", }; /* @@ -1375,6 +1381,29 @@ static int damon_sysfs_commit_schemes_quota_goals( return 0; } +/* + * damon_sysfs_upd_schemes_effective_quotas() - Update schemes effective quotas + * sysfs files. + * @kdamond: The kobject wrapper that associated to the kdamond thread. + * + * This function reads the schemes' effective quotas of specific kdamond and + * update the related values for sysfs files. This function should be called + * from DAMON callbacks while holding ``damon_syfs_lock``, to safely access the + * DAMON contexts-internal data and DAMON sysfs variables. + */ +static int damon_sysfs_upd_schemes_effective_quotas( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + damos_sysfs_update_effective_quotas( + kdamond->contexts->contexts_arr[0]->schemes, ctx); + return 0; +} + + /* * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests. * @c: The DAMON context of the callback. @@ -1437,6 +1466,9 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active, case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: err = damon_sysfs_clear_schemes_regions(kdamond); break; + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS: + err = damon_sysfs_upd_schemes_effective_quotas(kdamond); + break; default: break; } From 68c4905bba24691eed0cfb5c19f106f6c162ce02 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:15 -0800 Subject: [PATCH 274/435] Docs/ABI/damon: document effective_bytes sysfs file Update the DAMON ABI doc for the effective_bytes sysfs file and the kdamond state file input command for updating the content of the file. Link: https://lkml.kernel.org/r/20240219194431.159606-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index bfa5b8288d8d..a1e4fdb04f95 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -34,7 +34,9 @@ Description: Writing 'on' or 'off' to this file makes the kdamond starts or kdamond. Writing 'update_schemes_tried_bytes' to the file updates only '.../tried_regions/total_bytes' files of this kdamond. Writing 'clear_schemes_tried_regions' to the file - removes contents of the 'tried_regions' directory. + removes contents of the 'tried_regions' directory. Writing + 'update_schemes_effective_quotas' to the file updates + '.../quotas/effective_bytes' files of this kdamond. What: /sys/kernel/mm/damon/admin/kdamonds//pid Date: Mar 2022 @@ -208,6 +210,12 @@ Contact: SeongJae Park Description: Writing to and reading from this file sets and gets the size quota of the scheme in bytes. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/effective_bytes +Date: Feb 2024 +Contact: SeongJae Park +Description: Reading from this file gets the effective size quota of the + scheme in bytes, which adjusted for the time quota and goals. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/reset_interval_ms Date: Mar 2022 Contact: SeongJae Park From a6068d6dfa2f53bdee9d48f32d9e39cdeb74b372 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:16 -0800 Subject: [PATCH 275/435] Docs/admin-guide/mm/damon/usage: document effective_bytes file Update DAMON usage document for the effective quota file of the DAMON sysfs interface. Link: https://lkml.kernel.org/r/20240219194431.159606-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 22254997723c..a88cda52b095 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -83,7 +83,7 @@ comma (","). │ │ │ │ │ │ │ │ sz/min,max │ │ │ │ │ │ │ │ nr_accesses/min,max │ │ │ │ │ │ │ │ age/min,max - │ │ │ │ │ │ │ :ref:`quotas `/ms,bytes,reset_interval_ms + │ │ │ │ │ │ │ :ref:`quotas `/ms,bytes,reset_interval_ms,effective_bytes │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil │ │ │ │ │ │ │ │ :ref:`goals `/nr_goals │ │ │ │ │ │ │ │ │ 0/target_value,current_value @@ -153,6 +153,9 @@ Users can write below commands for the kdamond to the ``state`` file. - ``clear_schemes_tried_regions``: Clear the DAMON-based operating scheme action tried regions directory for each DAMON-based operation scheme of the kdamond. +- ``update_schemes_effective_bytes``: Update the contents of + ``effective_bytes`` files for each DAMON-based operation scheme of the + kdamond. For more details, refer to :ref:`quotas directory `. If the state is ``on``, reading ``pid`` shows the pid of the kdamond thread. @@ -320,8 +323,9 @@ schemes//quotas/ The directory for the :ref:`quotas ` of the given DAMON-based operation scheme. -Under ``quotas`` directory, three files (``ms``, ``bytes``, -``reset_interval_ms``) and two directores (``weights`` and ``goals``) exist. +Under ``quotas`` directory, four files (``ms``, ``bytes``, +``reset_interval_ms``, ``effective_bytes``) and two directores (``weights`` and +``goals``) exist. You can set the ``time quota`` in milliseconds, ``size quota`` in bytes, and ``reset interval`` in milliseconds by writing the values to the three files, @@ -332,6 +336,15 @@ apply the action to only up to ``bytes`` bytes of memory regions within the quota limits unless at least one :ref:`goal ` is set. +The time quota is internally transformed to a size quota. Between the +transformed size quota and user-specified size quota, smaller one is applied. +Based on the user-specified :ref:`goal `, the +effective size quota is further adjusted. Reading ``effective_bytes`` returns +the current effective size quota. The file is not updated in real time, so +users should ask DAMON sysfs interface to update the content of the file for +the stats by writing a special keyword, ``update_schemes_effective_bytes`` to +the relevant ``kdamonds//state`` file. + Under ``weights`` directory, three files (``sz_permil``, ``nr_accesses_permil``, and ``age_permil``) exist. You can set the :ref:`prioritization weights From 4d791a0a2ab47d70131909942009f12f61db20ab Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:17 -0800 Subject: [PATCH 276/435] mm/damon: move comments and fields for damos-quota-prioritization to the end The comments and definition of 'struct damos_quota' lists a few fields for effective quota generation first, fields for regions prioritization under the quota, and then remaining fields for effective quota generation. Readers' should unnecesssarily switch their context in the middle. List all the fields for the effective quota first, and then fields for the prioritization for making it easier to read. Link: https://lkml.kernel.org/r/20240219194431.159606-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 93ef45b87b9c..bd17b14828bc 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -128,18 +128,17 @@ enum damos_action { /** * struct damos_quota - Controls the aggressiveness of the given scheme. + * @reset_interval: Charge reset interval in milliseconds. * @ms: Maximum milliseconds that the scheme can use. * @sz: Maximum bytes of memory that the action can be applied. - * @reset_interval: Charge reset interval in milliseconds. + * @get_score: Feedback function for self-tuning quota. + * @get_score_arg: Parameter for @get_score + * @esz: Effective size quota in bytes. * * @weight_sz: Weight of the region's size for prioritization. * @weight_nr_accesses: Weight of the region's nr_accesses for prioritization. * @weight_age: Weight of the region's age for prioritization. * - * @get_score: Feedback function for self-tuning quota. - * @get_score_arg: Parameter for @get_score - * @esz: Effective size quota in bytes. - * * To avoid consuming too much CPU time or IO resources for applying the * &struct damos->action to large memory, DAMON allows users to set time and/or * size quotas. The quotas can be set by writing non-zero values to &ms and @@ -152,12 +151,6 @@ enum damos_action { * throughput of the scheme's action. DAMON then compares it against &sz and * uses smaller one as the effective quota. * - * For selecting regions within the quota, DAMON prioritizes current scheme's - * target memory regions using the &struct damon_operations->get_scheme_score. - * You could customize the prioritization logic by setting &weight_sz, - * &weight_nr_accesses, and &weight_age, because monitoring operations are - * encouraged to respect those. - * * If @get_score function pointer is set, DAMON calls it back with * @get_score_arg and get the return value of it for every @reset_interval. * Then, DAMON adjusts the effective quota using the return value as a feedback @@ -170,20 +163,25 @@ enum damos_action { * set, the mechanism starts from the quota of one byte. * * The resulting effective size quota in bytes is set to @esz. + * + * For selecting regions within the quota, DAMON prioritizes current scheme's + * target memory regions using the &struct damon_operations->get_scheme_score. + * You could customize the prioritization logic by setting &weight_sz, + * &weight_nr_accesses, and &weight_age, because monitoring operations are + * encouraged to respect those. */ struct damos_quota { + unsigned long reset_interval; unsigned long ms; unsigned long sz; - unsigned long reset_interval; + unsigned long (*get_score)(void *arg); + void *get_score_arg; + unsigned long esz; unsigned int weight_sz; unsigned int weight_nr_accesses; unsigned int weight_age; - unsigned long (*get_score)(void *arg); - void *get_score_arg; - unsigned long esz; - /* private: */ /* For throughput estimation */ unsigned long total_charged_sz; From 106e26fc1c4c1a0e3747246e15df2bc3aa9d46b2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:18 -0800 Subject: [PATCH 277/435] mm/damon/core: split out quota goal related fields to a struct 'struct damos_quota' is not small now. Split out fields for quota goal to a separate struct for easier reading. Link: https://lkml.kernel.org/r/20240219194431.159606-8-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 36 ++++++++++++++++++++++-------------- mm/damon/core.c | 13 +++++++------ mm/damon/sysfs-schemes.c | 10 +++++----- 3 files changed, 34 insertions(+), 25 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index bd17b14828bc..2fe345adf6b2 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -126,13 +126,28 @@ enum damos_action { NR_DAMOS_ACTIONS, }; +/** + * struct damos_quota_goal - DAMOS scheme quota auto-tuning goal. + * @get_score: Function for getting current score of the goal. + * @get_score_arg: Parameter for @get_score + * + * Data structure for getting the current score of the quota tuning goal. + * Calling @get_score with @get_score_arg as the parameter should return the + * current score. Then the score is entered to DAMON's internal feedback loop + * mechanism to get the auto-tuned quota. The goal of the tuning is getting + * the feedback score value of 10,000. + */ +struct damos_quota_goal { + unsigned long (*get_score)(void *arg); + void *get_score_arg; +}; + /** * struct damos_quota - Controls the aggressiveness of the given scheme. * @reset_interval: Charge reset interval in milliseconds. * @ms: Maximum milliseconds that the scheme can use. * @sz: Maximum bytes of memory that the action can be applied. - * @get_score: Feedback function for self-tuning quota. - * @get_score_arg: Parameter for @get_score + * @goal: Quota auto-tuning goal. * @esz: Effective size quota in bytes. * * @weight_sz: Weight of the region's size for prioritization. @@ -151,16 +166,10 @@ enum damos_action { * throughput of the scheme's action. DAMON then compares it against &sz and * uses smaller one as the effective quota. * - * If @get_score function pointer is set, DAMON calls it back with - * @get_score_arg and get the return value of it for every @reset_interval. - * Then, DAMON adjusts the effective quota using the return value as a feedback - * score to the current quota, using its internal feedback loop algorithm. - * - * The feedback loop algorithem assumes the quota input and the feedback score - * output are in a positive proportional relationship, and the goal of the - * tuning is getting the feedback screo value of 10,000. If @ms and/or @sz are - * set together, those work as a hard limit quota. If neither @ms nor @sz are - * set, the mechanism starts from the quota of one byte. + * If ->get_score field of @goal is set, DAMON calculates yet another size + * quota based on the goal using its internal feedback loop algorithm, for + * every @reset_interval. Then, if the new size quota is smaller than the + * effective quota, it uses the new size quota as the effective quota. * * The resulting effective size quota in bytes is set to @esz. * @@ -174,8 +183,7 @@ struct damos_quota { unsigned long reset_interval; unsigned long ms; unsigned long sz; - unsigned long (*get_score)(void *arg); - void *get_score_arg; + struct damos_quota_goal goal; unsigned long esz; unsigned int weight_sz; diff --git a/mm/damon/core.c b/mm/damon/core.c index 0656966a6fc4..fe4209672121 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1083,21 +1083,22 @@ static unsigned long damon_feed_loop_next_input(unsigned long last_input, return min_input; } -/* Shouldn't be called if quota->ms, quota->sz, and quota->get_score unset */ +/* Called only if quota->ms, quota->sz, or quota->goal.get_score are set */ static void damos_set_effective_quota(struct damos_quota *quota) { unsigned long throughput; unsigned long esz; - if (!quota->ms && !quota->get_score) { + if (!quota->ms && !quota->goal.get_score) { quota->esz = quota->sz; return; } - if (quota->get_score) { + if (quota->goal.get_score) { quota->esz_bp = damon_feed_loop_next_input( max(quota->esz_bp, 10000UL), - quota->get_score(quota->get_score_arg)); + quota->goal.get_score( + quota->goal.get_score_arg)); esz = quota->esz_bp / 10000; } @@ -1107,7 +1108,7 @@ static void damos_set_effective_quota(struct damos_quota *quota) quota->total_charged_ns; else throughput = PAGE_SIZE * 1024; - if (quota->get_score) + if (quota->goal.get_score) esz = min(throughput * quota->ms, esz); else esz = throughput * quota->ms; @@ -1127,7 +1128,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) unsigned long cumulated_sz; unsigned int score, max_score = 0; - if (!quota->ms && !quota->sz && !quota->get_score) + if (!quota->ms && !quota->sz && !quota->goal.get_score) return; /* New charge window starts */ diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 9d90e7b757b7..85ef58f98a87 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1894,19 +1894,19 @@ static void damos_sysfs_set_quota_score( struct damos_sysfs_quota_goal *sysfs_goal; int i; - quota->get_score = NULL; - quota->get_score_arg = (void *)0; + quota->goal.get_score = NULL; + quota->goal.get_score_arg = (void *)0; for (i = 0; i < sysfs_goals->nr; i++) { sysfs_goal = sysfs_goals->goals_arr[i]; if (!sysfs_goal->target_value) continue; /* Higher score makes scheme less aggressive */ - quota->get_score_arg = (void *)max( - (unsigned long)quota->get_score_arg, + quota->goal.get_score_arg = (void *)max( + (unsigned long)quota->goal.get_score_arg, sysfs_goal->current_value * 10000 / sysfs_goal->target_value); - quota->get_score = damos_sysfs_get_quota_score; + quota->goal.get_score = damos_sysfs_get_quota_score; } } From 91f21216a79d00f7da380ed4ce100e8a7a0d0cff Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:19 -0800 Subject: [PATCH 278/435] mm/damon/core: add multiple goals per damos_quota and helpers for those The feedback-driven DAMOS quota auto-tuning feature allows only single goal to the DAMON kernel API users. The API users could implement multiple goals for the end-users on their level, and that's what DAMON sysfs interface is doing. More DAMON kernel API users such as DAMON_RECLAIM would need to do similar work. To reduce unnecessary future duplciated efforts, support multiple goals from DAMOS core layer. To make the support in minimum non-destructive change, keep the old single goal setup interface, and add multiple goals setup. The single goal will treated as one of the multiple goals, so old API users are not required to make any change. Link: https://lkml.kernel.org/r/20240219194431.159606-9-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 17 ++++++++++ mm/damon/core.c | 78 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 88 insertions(+), 7 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 2fe345adf6b2..4bd898eaf80e 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -130,6 +130,7 @@ enum damos_action { * struct damos_quota_goal - DAMOS scheme quota auto-tuning goal. * @get_score: Function for getting current score of the goal. * @get_score_arg: Parameter for @get_score + * @list: List head for siblings. * * Data structure for getting the current score of the quota tuning goal. * Calling @get_score with @get_score_arg as the parameter should return the @@ -140,6 +141,7 @@ enum damos_action { struct damos_quota_goal { unsigned long (*get_score)(void *arg); void *get_score_arg; + struct list_head list; }; /** @@ -148,6 +150,7 @@ struct damos_quota_goal { * @ms: Maximum milliseconds that the scheme can use. * @sz: Maximum bytes of memory that the action can be applied. * @goal: Quota auto-tuning goal. + * @goals: Head of quota tuning goals (&damos_quota_goal) list. * @esz: Effective size quota in bytes. * * @weight_sz: Weight of the region's size for prioritization. @@ -171,6 +174,8 @@ struct damos_quota_goal { * every @reset_interval. Then, if the new size quota is smaller than the * effective quota, it uses the new size quota as the effective quota. * + * If @goals is not empty, same action is taken for each goal of the list. + * * The resulting effective size quota in bytes is set to @esz. * * For selecting regions within the quota, DAMON prioritizes current scheme's @@ -184,6 +189,7 @@ struct damos_quota { unsigned long ms; unsigned long sz; struct damos_quota_goal goal; + struct list_head goals; unsigned long esz; unsigned int weight_sz; @@ -648,6 +654,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #define damon_for_each_scheme_safe(s, next, ctx) \ list_for_each_entry_safe(s, next, &(ctx)->schemes, list) +#define damos_for_each_quota_goal(goal, quota) \ + list_for_each_entry(goal, "a->goals, list) + +#define damos_for_each_quota_goal_safe(goal, next, quota) \ + list_for_each_entry_safe(goal, next, &(quota)->goals, list) + #define damos_for_each_filter(f, scheme) \ list_for_each_entry(f, &(scheme)->filters, list) @@ -681,6 +693,11 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type, void damos_add_filter(struct damos *s, struct damos_filter *f); void damos_destroy_filter(struct damos_filter *f); +struct damos_quota_goal *damos_new_quota_goal( + unsigned long (*get_score)(void *), void *get_score_arg); +void damos_add_quota_goal(struct damos_quota *q, struct damos_quota_goal *g); +void damos_destroy_quota_goal(struct damos_quota_goal *goal); + struct damos *damon_new_scheme(struct damos_access_pattern *pattern, enum damos_action action, unsigned long apply_interval_us, diff --git a/mm/damon/core.c b/mm/damon/core.c index fe4209672121..b6cd99b64e85 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -299,6 +299,41 @@ void damos_destroy_filter(struct damos_filter *f) damos_free_filter(f); } +struct damos_quota_goal *damos_new_quota_goal( + unsigned long (*get_score)(void *), void *get_score_arg) +{ + struct damos_quota_goal *goal; + + goal = kmalloc(sizeof(*goal), GFP_KERNEL); + if (!goal) + return NULL; + goal->get_score = get_score; + goal->get_score_arg = get_score_arg; + INIT_LIST_HEAD(&goal->list); + return goal; +} + +void damos_add_quota_goal(struct damos_quota *q, struct damos_quota_goal *g) +{ + list_add_tail(&g->list, &q->goals); +} + +static void damos_del_quota_goal(struct damos_quota_goal *g) +{ + list_del(&g->list); +} + +static void damos_free_quota_goal(struct damos_quota_goal *g) +{ + kfree(g); +} + +void damos_destroy_quota_goal(struct damos_quota_goal *g) +{ + damos_del_quota_goal(g); + damos_free_quota_goal(g); +} + /* initialize fields of @quota that normally API users wouldn't set */ static struct damos_quota *damos_quota_init(struct damos_quota *quota) { @@ -337,6 +372,8 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, INIT_LIST_HEAD(&scheme->list); scheme->quota = *(damos_quota_init(quota)); + /* quota.goals should be separately set by caller */ + INIT_LIST_HEAD(&scheme->quota.goals); scheme->wmarks = *wmarks; scheme->wmarks.activated = true; @@ -373,8 +410,12 @@ static void damon_free_scheme(struct damos *s) void damon_destroy_scheme(struct damos *s) { + struct damos_quota_goal *g, *g_next; struct damos_filter *f, *next; + damos_for_each_quota_goal_safe(g, g_next, &s->quota) + damos_destroy_quota_goal(g); + damos_for_each_filter_safe(f, next, s) damos_destroy_filter(f); damon_del_scheme(s); @@ -1083,22 +1124,44 @@ static unsigned long damon_feed_loop_next_input(unsigned long last_input, return min_input; } -/* Called only if quota->ms, quota->sz, or quota->goal.get_score are set */ +/* Return the highest score since it makes schemes least aggressive */ +static unsigned long damos_quota_score(struct damos_quota *quota) +{ + struct damos_quota_goal *goal; + unsigned long highest_score = 0; + + if (quota->goal.get_score) + highest_score = quota->goal.get_score( + quota->goal.get_score_arg); + + damos_for_each_quota_goal(goal, quota) + highest_score = max(highest_score, + goal->get_score(goal->get_score_arg)); + + return highest_score; +} + +/* + * Called only if quota->ms, quota->sz, or quota->goal.get_score are set, or + * quota->goals is not empty + */ static void damos_set_effective_quota(struct damos_quota *quota) { unsigned long throughput; unsigned long esz; - if (!quota->ms && !quota->goal.get_score) { + if (!quota->ms && !quota->goal.get_score && + list_empty("a->goals)) { quota->esz = quota->sz; return; } - if (quota->goal.get_score) { + if (quota->goal.get_score || !list_empty("a->goals)) { + unsigned long score = damos_quota_score(quota); + quota->esz_bp = damon_feed_loop_next_input( max(quota->esz_bp, 10000UL), - quota->goal.get_score( - quota->goal.get_score_arg)); + score); esz = quota->esz_bp / 10000; } @@ -1108,7 +1171,7 @@ static void damos_set_effective_quota(struct damos_quota *quota) quota->total_charged_ns; else throughput = PAGE_SIZE * 1024; - if (quota->goal.get_score) + if (quota->goal.get_score || !list_empty("a->goals)) esz = min(throughput * quota->ms, esz); else esz = throughput * quota->ms; @@ -1128,7 +1191,8 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) unsigned long cumulated_sz; unsigned int score, max_score = 0; - if (!quota->ms && !quota->sz && !quota->goal.get_score) + if (!quota->ms && !quota->sz && !quota->goal.get_score && + list_empty("a->goals)) return; /* New charge window starts */ From 9e736fdffe527b25cdd8c5b019e88681796fdb6e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:20 -0800 Subject: [PATCH 279/435] mm/damon/sysfs: use only quota->goals DAMON sysfs interface implements multiple quota auto-tuning goals on its level since the DAMOS core logic was supporting only single goal. Now the core logic supports multiple goals on its level. Update DAMON sysfs interface to reuse the core logic and drop unnecessary duplicated multiple goals implementation. Link: https://lkml.kernel.org/r/20240219194431.159606-10-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.h | 2 +- mm/damon/sysfs-schemes.c | 49 +++++++++++++++++++++++++++------------- mm/damon/sysfs.c | 3 +-- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 5a1ac15fb2f8..a63f51577cff 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -59,7 +59,7 @@ int damon_sysfs_schemes_clear_regions( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); -void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, +int damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); void damos_sysfs_update_effective_quotas( diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 85ef58f98a87..7bf94b1ed6f7 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1887,30 +1887,34 @@ static unsigned long damos_sysfs_get_quota_score(void *arg) return (unsigned long)arg; } -static void damos_sysfs_set_quota_score( +static int damos_sysfs_set_quota_score( struct damos_sysfs_quota_goals *sysfs_goals, struct damos_quota *quota) { - struct damos_sysfs_quota_goal *sysfs_goal; + struct damos_quota_goal *goal, *next; int i; - quota->goal.get_score = NULL; - quota->goal.get_score_arg = (void *)0; + damos_for_each_quota_goal_safe(goal, next, quota) + damos_destroy_quota_goal(goal); + for (i = 0; i < sysfs_goals->nr; i++) { - sysfs_goal = sysfs_goals->goals_arr[i]; + struct damos_sysfs_quota_goal *sysfs_goal = + sysfs_goals->goals_arr[i]; + if (!sysfs_goal->target_value) continue; - /* Higher score makes scheme less aggressive */ - quota->goal.get_score_arg = (void *)max( - (unsigned long)quota->goal.get_score_arg, - sysfs_goal->current_value * 10000 / - sysfs_goal->target_value); - quota->goal.get_score = damos_sysfs_get_quota_score; + goal = damos_new_quota_goal(damos_sysfs_get_quota_score, + (void *)(sysfs_goal->current_value * 10000 / + sysfs_goal->target_value)); + if (!goal) + return -ENOMEM; + damos_add_quota_goal(quota, goal); } + return 0; } -void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, +int damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx) { struct damos *scheme; @@ -1918,16 +1922,21 @@ void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, damon_for_each_scheme(scheme, ctx) { struct damon_sysfs_scheme *sysfs_scheme; + int err; /* user could have removed the scheme sysfs dir */ if (i >= sysfs_schemes->nr) break; sysfs_scheme = sysfs_schemes->schemes_arr[i]; - damos_sysfs_set_quota_score(sysfs_scheme->quotas->goals, + err = damos_sysfs_set_quota_score(sysfs_scheme->quotas->goals, &scheme->quota); + if (err) + /* kdamond will clean up schemes and terminated */ + return err; i++; } + return 0; } void damos_sysfs_update_effective_quotas( @@ -1987,13 +1996,17 @@ static struct damos *damon_sysfs_mk_scheme( .low = sysfs_wmarks->low, }; - damos_sysfs_set_quota_score(sysfs_quotas->goals, "a); - scheme = damon_new_scheme(&pattern, sysfs_scheme->action, sysfs_scheme->apply_interval_us, "a, &wmarks); if (!scheme) return NULL; + err = damos_sysfs_set_quota_score(sysfs_quotas->goals, &scheme->quota); + if (err) { + damon_destroy_scheme(scheme); + return NULL; + } + err = damon_sysfs_set_scheme_filters(scheme, sysfs_filters); if (err) { damon_destroy_scheme(scheme); @@ -2029,7 +2042,11 @@ static void damon_sysfs_update_scheme(struct damos *scheme, scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses; scheme->quota.weight_age = sysfs_weights->age; - damos_sysfs_set_quota_score(sysfs_quotas->goals, &scheme->quota); + err = damos_sysfs_set_quota_score(sysfs_quotas->goals, &scheme->quota); + if (err) { + damon_destroy_scheme(scheme); + return; + } scheme->wmarks.metric = sysfs_wmarks->metric; scheme->wmarks.interval = sysfs_wmarks->interval_us; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index cc2d88a901f4..6fee383bc0c5 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1377,8 +1377,7 @@ static int damon_sysfs_commit_schemes_quota_goals( ctx = sysfs_kdamond->damon_ctx; sysfs_ctx = sysfs_kdamond->contexts->contexts_arr[0]; - damos_sysfs_set_quota_scores(sysfs_ctx->schemes, ctx); - return 0; + return damos_sysfs_set_quota_scores(sysfs_ctx->schemes, ctx); } /* From 89d347a545a704e0bd4fc61f9aea956d71bc72d2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:21 -0800 Subject: [PATCH 280/435] mm/damon/core: remove ->goal field of damos_quota DAMOS quota auto-tuning feature supports static signle goal and dynamic multiple goals via DAMON kernel API, specifically via ->goal and ->goals fields of damos_quota struct, respectively. All in-tree DAMOS kernel API users are using only the dynamic multiple goals now. Remove the unsued static single goal interface. Link: https://lkml.kernel.org/r/20240219194431.159606-11-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 12 ++++-------- mm/damon/core.c | 17 +++++------------ 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 4bd898eaf80e..76c965c1eea3 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -149,7 +149,6 @@ struct damos_quota_goal { * @reset_interval: Charge reset interval in milliseconds. * @ms: Maximum milliseconds that the scheme can use. * @sz: Maximum bytes of memory that the action can be applied. - * @goal: Quota auto-tuning goal. * @goals: Head of quota tuning goals (&damos_quota_goal) list. * @esz: Effective size quota in bytes. * @@ -169,12 +168,10 @@ struct damos_quota_goal { * throughput of the scheme's action. DAMON then compares it against &sz and * uses smaller one as the effective quota. * - * If ->get_score field of @goal is set, DAMON calculates yet another size - * quota based on the goal using its internal feedback loop algorithm, for - * every @reset_interval. Then, if the new size quota is smaller than the - * effective quota, it uses the new size quota as the effective quota. - * - * If @goals is not empty, same action is taken for each goal of the list. + * If @goals is not empt, DAMON calculates yet another size quota based on the + * goals using its internal feedback loop algorithm, for every @reset_interval. + * Then, if the new size quota is smaller than the effective quota, it uses the + * new size quota as the effective quota. * * The resulting effective size quota in bytes is set to @esz. * @@ -188,7 +185,6 @@ struct damos_quota { unsigned long reset_interval; unsigned long ms; unsigned long sz; - struct damos_quota_goal goal; struct list_head goals; unsigned long esz; diff --git a/mm/damon/core.c b/mm/damon/core.c index b6cd99b64e85..7b06d926c552 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1130,10 +1130,6 @@ static unsigned long damos_quota_score(struct damos_quota *quota) struct damos_quota_goal *goal; unsigned long highest_score = 0; - if (quota->goal.get_score) - highest_score = quota->goal.get_score( - quota->goal.get_score_arg); - damos_for_each_quota_goal(goal, quota) highest_score = max(highest_score, goal->get_score(goal->get_score_arg)); @@ -1142,21 +1138,19 @@ static unsigned long damos_quota_score(struct damos_quota *quota) } /* - * Called only if quota->ms, quota->sz, or quota->goal.get_score are set, or - * quota->goals is not empty + * Called only if quota->ms, or quota->sz are set, or quota->goals is not empty */ static void damos_set_effective_quota(struct damos_quota *quota) { unsigned long throughput; unsigned long esz; - if (!quota->ms && !quota->goal.get_score && - list_empty("a->goals)) { + if (!quota->ms && list_empty("a->goals)) { quota->esz = quota->sz; return; } - if (quota->goal.get_score || !list_empty("a->goals)) { + if (!list_empty("a->goals)) { unsigned long score = damos_quota_score(quota); quota->esz_bp = damon_feed_loop_next_input( @@ -1171,7 +1165,7 @@ static void damos_set_effective_quota(struct damos_quota *quota) quota->total_charged_ns; else throughput = PAGE_SIZE * 1024; - if (quota->goal.get_score || !list_empty("a->goals)) + if (!list_empty("a->goals)) esz = min(throughput * quota->ms, esz); else esz = throughput * quota->ms; @@ -1191,8 +1185,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) unsigned long cumulated_sz; unsigned int score, max_score = 0; - if (!quota->ms && !quota->sz && !quota->goal.get_score && - list_empty("a->goals)) + if (!quota->ms && !quota->sz && list_empty("a->goals)) return; /* New charge window starts */ From 06ba5b309ed870cf1a0fedc611d0e7fbb6425a2d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:22 -0800 Subject: [PATCH 281/435] mm/damon/core: let goal specified with only target and current values DAMOS quota auto-tuning feature let users to set the goal by providing a function for getting the current score of the tuned quota. It allows flexible goal setup, but only simple user-set quota is currently being used. As a result, the only user of the DAMOS quota auto-tuning is using a silly void pointer casting based score value passing function. Simplify the interface and the user code by letting user directly set the target and the current value. Link: https://lkml.kernel.org/r/20240219194431.159606-12-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 19 +++++++++---------- mm/damon/core.c | 9 +++++---- mm/damon/sysfs-schemes.c | 10 ++-------- 3 files changed, 16 insertions(+), 22 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 76c965c1eea3..de0cdc7f96d2 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -128,19 +128,18 @@ enum damos_action { /** * struct damos_quota_goal - DAMOS scheme quota auto-tuning goal. - * @get_score: Function for getting current score of the goal. - * @get_score_arg: Parameter for @get_score + * @target_value: Target value to achieve with the tuning. + * @current_value: Current value that achieving with the tuning. * @list: List head for siblings. * - * Data structure for getting the current score of the quota tuning goal. - * Calling @get_score with @get_score_arg as the parameter should return the - * current score. Then the score is entered to DAMON's internal feedback loop - * mechanism to get the auto-tuned quota. The goal of the tuning is getting - * the feedback score value of 10,000. + * Data structure for getting the current score of the quota tuning goal. The + * score is calculated by how close @current_value and @target_value are. Then + * the score is entered to DAMON's internal feedback loop mechanism to get the + * auto-tuned quota. */ struct damos_quota_goal { - unsigned long (*get_score)(void *arg); - void *get_score_arg; + unsigned long target_value; + unsigned long current_value; struct list_head list; }; @@ -690,7 +689,7 @@ void damos_add_filter(struct damos *s, struct damos_filter *f); void damos_destroy_filter(struct damos_filter *f); struct damos_quota_goal *damos_new_quota_goal( - unsigned long (*get_score)(void *), void *get_score_arg); + unsigned long target_value, unsigned long current_value); void damos_add_quota_goal(struct damos_quota *q, struct damos_quota_goal *g); void damos_destroy_quota_goal(struct damos_quota_goal *goal); diff --git a/mm/damon/core.c b/mm/damon/core.c index 7b06d926c552..907f467fc8c0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -300,15 +300,15 @@ void damos_destroy_filter(struct damos_filter *f) } struct damos_quota_goal *damos_new_quota_goal( - unsigned long (*get_score)(void *), void *get_score_arg) + unsigned long target_value, unsigned long current_value) { struct damos_quota_goal *goal; goal = kmalloc(sizeof(*goal), GFP_KERNEL); if (!goal) return NULL; - goal->get_score = get_score; - goal->get_score_arg = get_score_arg; + goal->target_value = target_value; + goal->current_value = current_value; INIT_LIST_HEAD(&goal->list); return goal; } @@ -1132,7 +1132,8 @@ static unsigned long damos_quota_score(struct damos_quota *quota) damos_for_each_quota_goal(goal, quota) highest_score = max(highest_score, - goal->get_score(goal->get_score_arg)); + goal->current_value * 10000 / + goal->target_value); return highest_score; } diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 7bf94b1ed6f7..50218a7bfa0a 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1882,11 +1882,6 @@ static int damon_sysfs_set_scheme_filters(struct damos *scheme, return 0; } -static unsigned long damos_sysfs_get_quota_score(void *arg) -{ - return (unsigned long)arg; -} - static int damos_sysfs_set_quota_score( struct damos_sysfs_quota_goals *sysfs_goals, struct damos_quota *quota) @@ -1904,9 +1899,8 @@ static int damos_sysfs_set_quota_score( if (!sysfs_goal->target_value) continue; - goal = damos_new_quota_goal(damos_sysfs_get_quota_score, - (void *)(sysfs_goal->current_value * 10000 / - sysfs_goal->target_value)); + goal = damos_new_quota_goal(sysfs_goal->target_value, + sysfs_goal->current_value); if (!goal) return -ENOMEM; damos_add_quota_goal(quota, goal); From bcce9bc16f56fbc254857fcb31674ab868b824d7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:23 -0800 Subject: [PATCH 282/435] mm/damon/core: support multiple metrics for quota goal DAMOS quota auto-tuning asks users to assess the current tuned quota and provide the feedback in a manual and repeated way. It allows users generate the feedback from a source that the kernel cannot access, and writing a script or a function for doing the manual and repeated feeding is not a big deal. However, additional works are additional works, and it could be more efficient if DAMOS could do the fetch itself, especially in case of DAMON sysfs interface use case, since it can avoid the context switches between the user-space and the kernel-space, though the overhead would be only trivial in most cases. Also in many cases, feedbacks could be made from kernel-accessible sources, such as PSI, CPU usage, etc. Make the quota goal to support multiple types of metrics including such ones. Link: https://lkml.kernel.org/r/20240219194431.159606-13-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 26 +++++++++++++++++++++++--- mm/damon/core.c | 22 +++++++++++++++++++--- mm/damon/sysfs-schemes.c | 5 +++-- 3 files changed, 45 insertions(+), 8 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index de0cdc7f96d2..5a06993d8479 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -126,18 +126,37 @@ enum damos_action { NR_DAMOS_ACTIONS, }; +/** + * enum damos_quota_goal_metric - Represents the metric to be used as the goal + * + * @DAMOS_QUOTA_USER_INPUT: User-input value. + * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. + * + * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. + */ +enum damos_quota_goal_metric { + DAMOS_QUOTA_USER_INPUT, + NR_DAMOS_QUOTA_GOAL_METRICS, +}; + /** * struct damos_quota_goal - DAMOS scheme quota auto-tuning goal. - * @target_value: Target value to achieve with the tuning. - * @current_value: Current value that achieving with the tuning. + * @metric: Metric to be used for representing the goal. + * @target_value: Target value of @metric to achieve with the tuning. + * @current_value: Current value of @metric. * @list: List head for siblings. * * Data structure for getting the current score of the quota tuning goal. The * score is calculated by how close @current_value and @target_value are. Then * the score is entered to DAMON's internal feedback loop mechanism to get the * auto-tuned quota. + * + * If @metric is DAMOS_QUOTA_USER_INPUT, @current_value should be manually + * entered by the user, probably inside the kdamond callbacks. Otherwise, + * DAMON sets @current_value with self-measured value of @metric. */ struct damos_quota_goal { + enum damos_quota_goal_metric metric; unsigned long target_value; unsigned long current_value; struct list_head list; @@ -689,7 +708,8 @@ void damos_add_filter(struct damos *s, struct damos_filter *f); void damos_destroy_filter(struct damos_filter *f); struct damos_quota_goal *damos_new_quota_goal( - unsigned long target_value, unsigned long current_value); + enum damos_quota_goal_metric metric, + unsigned long target_value); void damos_add_quota_goal(struct damos_quota *q, struct damos_quota_goal *g); void damos_destroy_quota_goal(struct damos_quota_goal *goal); diff --git a/mm/damon/core.c b/mm/damon/core.c index 907f467fc8c0..973423166ee2 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -300,15 +300,16 @@ void damos_destroy_filter(struct damos_filter *f) } struct damos_quota_goal *damos_new_quota_goal( - unsigned long target_value, unsigned long current_value) + enum damos_quota_goal_metric metric, + unsigned long target_value) { struct damos_quota_goal *goal; goal = kmalloc(sizeof(*goal), GFP_KERNEL); if (!goal) return NULL; + goal->metric = metric; goal->target_value = target_value; - goal->current_value = current_value; INIT_LIST_HEAD(&goal->list); return goal; } @@ -1124,16 +1125,31 @@ static unsigned long damon_feed_loop_next_input(unsigned long last_input, return min_input; } +static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) +{ + u64 now_psi_total; + + switch (goal->metric) { + case DAMOS_QUOTA_USER_INPUT: + /* User should already set goal->current_value */ + break; + default: + break; + } +} + /* Return the highest score since it makes schemes least aggressive */ static unsigned long damos_quota_score(struct damos_quota *quota) { struct damos_quota_goal *goal; unsigned long highest_score = 0; - damos_for_each_quota_goal(goal, quota) + damos_for_each_quota_goal(goal, quota) { + damos_set_quota_goal_current_value(goal); highest_score = max(highest_score, goal->current_value * 10000 / goal->target_value); + } return highest_score; } diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 50218a7bfa0a..7a8a39f2679b 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1899,10 +1899,11 @@ static int damos_sysfs_set_quota_score( if (!sysfs_goal->target_value) continue; - goal = damos_new_quota_goal(sysfs_goal->target_value, - sysfs_goal->current_value); + goal = damos_new_quota_goal(DAMOS_QUOTA_USER_INPUT, + sysfs_goal->target_value); if (!goal) return -ENOMEM; + goal->current_value = sysfs_goal->current_value; damos_add_quota_goal(quota, goal); } return 0; From 2dbb60f789cbb5c0000a4664f40f9358b3a62ba2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:24 -0800 Subject: [PATCH 283/435] mm/damon/core: implement PSI metric DAMOS quota goal Extend DAMOS quota goal metric with system wide memory pressure stall time. Specifically, the system level 'some' PSI for memory is used. The target value can be set in microseconds. DAMOS measures the increased amount of the PSI metric in last quota_reset_interval and use the ratio of it versus the user-specified target PSI value as the score for the auto-tuning feedback loop. Link: https://lkml.kernel.org/r/20240219194431.159606-14-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 7 +++++++ mm/damon/core.c | 25 +++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 5a06993d8479..886d07294f4e 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -130,12 +130,14 @@ enum damos_action { * enum damos_quota_goal_metric - Represents the metric to be used as the goal * * @DAMOS_QUOTA_USER_INPUT: User-input value. + * @DAMOS_QUOTA_SOME_MEM_PSI_US: System level some memory PSI in us. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. */ enum damos_quota_goal_metric { DAMOS_QUOTA_USER_INPUT, + DAMOS_QUOTA_SOME_MEM_PSI_US, NR_DAMOS_QUOTA_GOAL_METRICS, }; @@ -144,6 +146,7 @@ enum damos_quota_goal_metric { * @metric: Metric to be used for representing the goal. * @target_value: Target value of @metric to achieve with the tuning. * @current_value: Current value of @metric. + * @last_psi_total: Last measured total PSI * @list: List head for siblings. * * Data structure for getting the current score of the quota tuning goal. The @@ -159,6 +162,10 @@ struct damos_quota_goal { enum damos_quota_goal_metric metric; unsigned long target_value; unsigned long current_value; + /* metric-dependent fields */ + union { + u64 last_psi_total; + }; struct list_head list; }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 973423166ee2..6d503c1c125e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -1125,6 +1126,25 @@ static unsigned long damon_feed_loop_next_input(unsigned long last_input, return min_input; } +#ifdef CONFIG_PSI + +static u64 damos_get_some_mem_psi_total(void) +{ + if (static_branch_likely(&psi_disabled)) + return 0; + return div_u64(psi_system.total[PSI_AVGS][PSI_MEM * 2], + NSEC_PER_USEC); +} + +#else /* CONFIG_PSI */ + +static inline u64 damos_get_some_mem_psi_total(void) +{ + return 0; +}; + +#endif /* CONFIG_PSI */ + static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) { u64 now_psi_total; @@ -1133,6 +1153,11 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) case DAMOS_QUOTA_USER_INPUT: /* User should already set goal->current_value */ break; + case DAMOS_QUOTA_SOME_MEM_PSI_US: + now_psi_total = damos_get_some_mem_psi_total(); + goal->current_value = now_psi_total - goal->last_psi_total; + goal->last_psi_total = now_psi_total; + break; default: break; } From 4daacfe8f99f4b4cef562649d56c48642981f46e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:25 -0800 Subject: [PATCH 284/435] mm/damon/sysfs-schemes: support PSI-based quota auto-tune Extend DAMON sysfs interface to support the PSI-based quota auto-tuning by adding a new file, 'target_metric' under the quota goal directory. Old users don't get any behavioral changes since the default value of the metric is 'user input'. Link: https://lkml.kernel.org/r/20240219194431.159606-15-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 42 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 7a8a39f2679b..53a90ac678fb 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -826,15 +826,48 @@ static const struct kobj_type damon_sysfs_watermarks_ktype = { struct damos_sysfs_quota_goal { struct kobject kobj; + enum damos_quota_goal_metric metric; unsigned long target_value; unsigned long current_value; }; +/* This should match with enum damos_action */ +static const char * const damos_sysfs_quota_goal_metric_strs[] = { + "user_input", + "some_mem_psi_us", +}; + static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void) { return kzalloc(sizeof(struct damos_sysfs_quota_goal), GFP_KERNEL); } +static ssize_t target_metric_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damos_sysfs_quota_goal *goal = container_of(kobj, + struct damos_sysfs_quota_goal, kobj); + + return sysfs_emit(buf, "%s\n", + damos_sysfs_quota_goal_metric_strs[goal->metric]); +} + +static ssize_t target_metric_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damos_sysfs_quota_goal *goal = container_of(kobj, + struct damos_sysfs_quota_goal, kobj); + enum damos_quota_goal_metric m; + + for (m = 0; m < NR_DAMOS_QUOTA_GOAL_METRICS; m++) { + if (sysfs_streq(buf, damos_sysfs_quota_goal_metric_strs[m])) { + goal->metric = m; + return count; + } + } + return -EINVAL; +} + static ssize_t target_value_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -880,6 +913,9 @@ static void damos_sysfs_quota_goal_release(struct kobject *kobj) kfree(container_of(kobj, struct damos_sysfs_quota_goal, kobj)); } +static struct kobj_attribute damos_sysfs_quota_goal_target_metric_attr = + __ATTR_RW_MODE(target_metric, 0600); + static struct kobj_attribute damos_sysfs_quota_goal_target_value_attr = __ATTR_RW_MODE(target_value, 0600); @@ -887,6 +923,7 @@ static struct kobj_attribute damos_sysfs_quota_goal_current_value_attr = __ATTR_RW_MODE(current_value, 0600); static struct attribute *damos_sysfs_quota_goal_attrs[] = { + &damos_sysfs_quota_goal_target_metric_attr.attr, &damos_sysfs_quota_goal_target_value_attr.attr, &damos_sysfs_quota_goal_current_value_attr.attr, NULL, @@ -1899,11 +1936,12 @@ static int damos_sysfs_set_quota_score( if (!sysfs_goal->target_value) continue; - goal = damos_new_quota_goal(DAMOS_QUOTA_USER_INPUT, + goal = damos_new_quota_goal(sysfs_goal->metric, sysfs_goal->target_value); if (!goal) return -ENOMEM; - goal->current_value = sysfs_goal->current_value; + if (sysfs_goal->metric == DAMOS_QUOTA_USER_INPUT) + goal->current_value = sysfs_goal->current_value; damos_add_quota_goal(quota, goal); } return 0; From 3c17174f64fe4900a3c5eadaa6e9116b11a9bd33 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:26 -0800 Subject: [PATCH 285/435] Docs/mm/damon/design: document quota goal self-tuning Update DAMON design doc to explain the quota goal self-tuning, which can be used by setting the goal's metric to metrics that kernel can self-retrieve. Link: https://lkml.kernel.org/r/20240219194431.159606-16-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 2bd0c203dcfb..8c89d26f0baa 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -398,12 +398,28 @@ Aim-oriented Feedback-driven Auto-tuning ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Automatic feedback-driven quota tuning. Instead of setting the absolute quota -value, users can repeatedly provide numbers representing how much of their goal -for the scheme is achieved as feedback. DAMOS then automatically tunes the +value, users can specify the metric of their interest, and what target value +they want the metric value to be. DAMOS then automatically tunes the aggressiveness (the quota) of the corresponding scheme. For example, if DAMOS is under achieving the goal, DAMOS automatically increases the quota. If DAMOS is over achieving the goal, it decreases the quota. +The goal can be specified with three parameters, namely ``target_metric``, +``target_value``, and ``current_value``. The auto-tuning mechanism tries to +make ``current_value`` of ``target_metric`` be same to ``target_value``. +Currently, two ``target_metric`` are provided. + +- ``user_input``: User-provided value. Users could use any metric that they + has interest in for the value. Use space main workload's latency or + throughput, system metrics like free memory ratio or memory pressure stall + time (PSI) could be examples. Note that users should explicitly set + ``current_value`` on their own in this case. In other words, users should + repeatedly provide the feedback. +- ``some_mem_psi_us``: System-wide ``some`` memory pressure stall information + in microseconds that measured from last quota reset to next quota reset. + DAMOS does the measurement on its own, so only ``target_value`` need to be + set by users at the initial time. In other words, DAMOS does self-feedback. + .. _damon_design_damos_watermarks: From adc3908b3ccfd1ee5282e5ba75a24d9b536777c6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:27 -0800 Subject: [PATCH 286/435] Docs/ABI/damon: document quota goal metric file Update DAMON ABI document for the quota goal target_metric file. Link: https://lkml.kernel.org/r/20240219194431.159606-17-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index a1e4fdb04f95..dad4d5ffd786 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -229,6 +229,12 @@ Description: Writing a number 'N' to this file creates the number of directories for setting automatic tuning of the scheme's aggressiveness named '0' to 'N-1' under the goals/ directory. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/goals//target_metric +Date: Feb 2024 +Contact: SeongJae Park +Description: Writing to and reading from this file sets and gets the quota + auto-tuning goal metric. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//quotas/goals//target_value Date: Nov 2023 Contact: SeongJae Park From 57e88e86a167698978621a09719d716771430c48 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:28 -0800 Subject: [PATCH 287/435] Docs/admin-guide/mm/damon/usage: document quota goal metric file Update DAMON usage document for the quota goal target_metric file. [sj@kernel.org: fix a typo on the auto-tuning design reference link] Link: https://lkml.kernel.org/r/20240221170852.55529-3-sj@kernel.org Link: https://lkml.kernel.org/r/20240219194431.159606-18-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index a88cda52b095..6fce035fdbf5 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -86,7 +86,7 @@ comma (","). │ │ │ │ │ │ │ :ref:`quotas `/ms,bytes,reset_interval_ms,effective_bytes │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil │ │ │ │ │ │ │ │ :ref:`goals `/nr_goals - │ │ │ │ │ │ │ │ │ 0/target_value,current_value + │ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value │ │ │ │ │ │ │ :ref:`watermarks `/metric,interval_us,high,mid,low │ │ │ │ │ │ │ :ref:`filters `/nr_filters │ │ │ │ │ │ │ │ 0/type,matching,memcg_id @@ -366,11 +366,11 @@ number (``N``) to the file creates the number of child directories named ``0`` to ``N-1``. Each directory represents each goal and current achievement. Among the multiple feedback, the best one is used. -Each goal directory contains two files, namely ``target_value`` and -``current_value``. Users can set and get any number to those files to set the -feedback. User space main workload's latency or throughput, system metrics -like free memory ratio or memory pressure stall time (PSI) could be example -metrics for the values. Note that users should write +Each goal directory contains three files, namely ``target_metric``, +``target_value`` and ``current_value``. Users can set and get the three +parameters for the quota auto-tuning goals that specified on the :ref:`design +doc ` by writing to and reading from each +of the files. Note that users should further write ``commit_schemes_quota_goals`` to the ``state`` file of the :ref:`kdamond directory ` to pass the feedback to DAMON. From 58dea17d7a0f8e5dfd36d4ae26e85a26e866b0d4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:29 -0800 Subject: [PATCH 288/435] mm/damon/reclaim: implement user-feedback driven quota auto-tuning DAMOS supports user-feedback driven quota auto-tuning, but only DAMON sysfs interface is using it. Add support of the feature on DAMON_RECLAIM by adding one more input parameter, namely 'quota_autotune_feedback', for providing the user feedback to DAMON_RECLAIM. It assumes the target value of the feedback is 10,000. Link: https://lkml.kernel.org/r/20240219194431.159606-19-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 66e190f0374a..9df6b8819998 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -62,6 +62,21 @@ static struct damos_quota damon_reclaim_quota = { }; DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota); +/* + * User-specifiable feedback for auto-tuning of the effective quota. + * + * While keeping the caps that set by other quotas, DAMON_RECLAIM automatically + * increases and decreases the effective level of the quota aiming receiving this + * feedback of value ``10,000`` from the user. DAMON_RECLAIM assumes the feedback + * value and the quota are positively proportional. Value zero means disabling + * this auto-tuning feature. + * + * Disabled by default. + * + */ +static unsigned long quota_autotune_feedback __read_mostly; +module_param(quota_autotune_feedback, ulong, 0600); + static struct damos_watermarks damon_reclaim_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = 5000000, /* 5 seconds */ @@ -159,11 +174,13 @@ static void damon_reclaim_copy_quota_status(struct damos_quota *dst, dst->charged_from = src->charged_from; dst->charge_target_from = src->charge_target_from; dst->charge_addr_from = src->charge_addr_from; + dst->esz_bp = src->esz_bp; } static int damon_reclaim_apply_parameters(void) { struct damos *scheme, *old_scheme; + struct damos_quota_goal *goal; struct damos_filter *filter; int err = 0; @@ -180,6 +197,17 @@ static int damon_reclaim_apply_parameters(void) damon_reclaim_copy_quota_status(&scheme->quota, &old_scheme->quota); } + + if (quota_autotune_feedback) { + goal = damos_new_quota_goal(DAMOS_QUOTA_USER_INPUT, 10000); + if (!goal) { + damon_destroy_scheme(scheme); + return -ENOMEM; + } + goal->current_value = quota_autotune_feedback; + damos_add_quota_goal(&scheme->quota, goal); + } + if (skip_anon) { filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); if (!filter) { From 7ce55f8ffdedc0e59d53f7003b7bd912b19918a7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:30 -0800 Subject: [PATCH 289/435] mm/damon/reclaim: implement memory PSI-driven quota self-tuning Support the PSI-driven quota self-tuning from DAMON_RECLAIM by introducing yet another parameter, 'quota_mem_pressure_us'. Users can set the desired amount of memory pressure stall time per each quota reset interval using the parameter. Then DAMON_RECLAIM monitor the memory pressure stall time, specifically system-wide memory 'some' PSI value that increased during the given time interval, and self-tune the quota using the DAMOS core logic. Link: https://lkml.kernel.org/r/20240219194431.159606-20-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 9df6b8819998..9bd341d62b4c 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -62,6 +62,21 @@ static struct damos_quota damon_reclaim_quota = { }; DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota); +/* + * Desired level of memory pressure-stall time in microseconds. + * + * While keeping the caps that set by other quotas, DAMON_RECLAIM automatically + * increases and decreases the effective level of the quota aiming this level of + * memory pressure is incurred. System-wide ``some`` memory PSI in microseconds + * per quota reset interval (``quota_reset_interval_ms``) is collected and + * compared to this value to see if the aim is satisfied. Value zero means + * disabling this auto-tuning feature. + * + * Disabled by default. + */ +static unsigned long quota_mem_pressure_us __read_mostly; +module_param(quota_mem_pressure_us, ulong, 0600); + /* * User-specifiable feedback for auto-tuning of the effective quota. * @@ -198,6 +213,16 @@ static int damon_reclaim_apply_parameters(void) &old_scheme->quota); } + if (quota_mem_pressure_us) { + goal = damos_new_quota_goal(DAMOS_QUOTA_SOME_MEM_PSI_US, + quota_mem_pressure_us); + if (!goal) { + damon_destroy_scheme(scheme); + return -ENOMEM; + } + damos_add_quota_goal(&scheme->quota, goal); + } + if (quota_autotune_feedback) { goal = damos_new_quota_goal(DAMOS_QUOTA_USER_INPUT, 10000); if (!goal) { From 75c40c2509e797830dd90d92568262ba69a89c9c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:31 -0800 Subject: [PATCH 290/435] Docs/admin-guide/mm/damon/reclaim: document auto-tuning parameters Update DAMON_RECLAIM usage document for the user/self feedback based auto-tuning of the quota. Link: https://lkml.kernel.org/r/20240219194431.159606-21-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- .../admin-guide/mm/damon/reclaim.rst | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index 343e25b252f4..af05ae617018 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -117,6 +117,33 @@ milliseconds. 1 second by default. +quota_mem_pressure_us +--------------------- + +Desired level of memory pressure-stall time in microseconds. + +While keeping the caps that set by other quotas, DAMON_RECLAIM automatically +increases and decreases the effective level of the quota aiming this level of +memory pressure is incurred. System-wide ``some`` memory PSI in microseconds +per quota reset interval (``quota_reset_interval_ms``) is collected and +compared to this value to see if the aim is satisfied. Value zero means +disabling this auto-tuning feature. + +Disabled by default. + +quota_autotune_feedback +----------------------- + +User-specifiable feedback for auto-tuning of the effective quota. + +While keeping the caps that set by other quotas, DAMON_RECLAIM automatically +increases and decreases the effective level of the quota aiming receiving this +feedback of value ``10,000`` from the user. DAMON_RECLAIM assumes the feedback +value and the quota are positively proportional. Value zero means disabling +this auto-tuning feature. + +Disabled by default. + wmarks_interval --------------- From 568b567f78acbe64d2e91b7e58a85feb6846434c Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 19 Feb 2024 13:33:51 +0000 Subject: [PATCH 291/435] mm/zsmalloc: fix migrate_write_lock() when !CONFIG_COMPACTION Patch series "mm/zsmalloc: fix and optimize objects/page migration". This series is to fix and optimize the zsmalloc objects/page migration. This patch (of 3): migrate_write_lock() is a empty function when !CONFIG_COMPACTION, in which case zs_compact() can be triggered from shrinker reclaim context. (Maybe it's better to rename it to zs_shrink()?) And zspage map object users rely on this migrate_read_lock() so object won't be migrated elsewhere. Fix it by always implementing the migrate_write_lock() related functions. Link: https://lkml.kernel.org/r/20240219-b4-szmalloc-migrate-v1-0-34cd49c6545b@bytedance.com Link: https://lkml.kernel.org/r/20240219-b4-szmalloc-migrate-v1-1-34cd49c6545b@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Sergey Senozhatsky Cc: Johannes Weiner Cc: Minchan Kim Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c937635e0ad1..64d5533fa5d8 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -278,18 +278,15 @@ static bool ZsHugePage(struct zspage *zspage) static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); - -#ifdef CONFIG_COMPACTION static void migrate_write_lock(struct zspage *zspage); static void migrate_write_lock_nested(struct zspage *zspage); static void migrate_write_unlock(struct zspage *zspage); + +#ifdef CONFIG_COMPACTION static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); #else -static void migrate_write_lock(struct zspage *zspage) {} -static void migrate_write_lock_nested(struct zspage *zspage) {} -static void migrate_write_unlock(struct zspage *zspage) {} static void kick_deferred_free(struct zs_pool *pool) {} static void init_deferred_free(struct zs_pool *pool) {} static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} @@ -1725,7 +1722,6 @@ static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock) read_unlock(&zspage->lock); } -#ifdef CONFIG_COMPACTION static void migrate_write_lock(struct zspage *zspage) { write_lock(&zspage->lock); @@ -1741,6 +1737,7 @@ static void migrate_write_unlock(struct zspage *zspage) write_unlock(&zspage->lock); } +#ifdef CONFIG_COMPACTION /* Number of isolated subpage for *page migration* in this zspage */ static void inc_zspage_isolation(struct zspage *zspage) { From 59def443c9942545a9bc702d8a42babc615a8d97 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 19 Feb 2024 13:33:52 +0000 Subject: [PATCH 292/435] mm/zsmalloc: remove migrate_write_lock_nested() The migrate write lock is to protect the race between zspage migration and zspage objects' map users. We only need to lock out the map users of src zspage, not dst zspage, which is safe to map by users concurrently, since we only need to do obj_malloc() from dst zspage. So we can remove the migrate_write_lock_nested() use case. As we are here, cleanup the __zs_compact() by moving putback_zspage() outside of migrate_write_unlock since we hold pool lock, no malloc or free users can come in. Link: https://lkml.kernel.org/r/20240219-b4-szmalloc-migrate-v1-2-34cd49c6545b@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Sergey Senozhatsky Cc: Johannes Weiner Cc: Minchan Kim Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 64d5533fa5d8..f2ae7d4c6f21 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -279,7 +279,6 @@ static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); static void migrate_write_lock(struct zspage *zspage); -static void migrate_write_lock_nested(struct zspage *zspage); static void migrate_write_unlock(struct zspage *zspage); #ifdef CONFIG_COMPACTION @@ -1727,11 +1726,6 @@ static void migrate_write_lock(struct zspage *zspage) write_lock(&zspage->lock); } -static void migrate_write_lock_nested(struct zspage *zspage) -{ - write_lock_nested(&zspage->lock, SINGLE_DEPTH_NESTING); -} - static void migrate_write_unlock(struct zspage *zspage) { write_unlock(&zspage->lock); @@ -2003,19 +1997,17 @@ static unsigned long __zs_compact(struct zs_pool *pool, dst_zspage = isolate_dst_zspage(class); if (!dst_zspage) break; - migrate_write_lock(dst_zspage); } src_zspage = isolate_src_zspage(class); if (!src_zspage) break; - migrate_write_lock_nested(src_zspage); - + migrate_write_lock(src_zspage); migrate_zspage(pool, src_zspage, dst_zspage); - fg = putback_zspage(class, src_zspage); migrate_write_unlock(src_zspage); + fg = putback_zspage(class, src_zspage); if (fg == ZS_INUSE_RATIO_0) { free_zspage(pool, class, src_zspage); pages_freed += class->pages_per_zspage; @@ -2025,7 +2017,6 @@ static unsigned long __zs_compact(struct zs_pool *pool, if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100 || spin_is_contended(&pool->lock)) { putback_zspage(class, dst_zspage); - migrate_write_unlock(dst_zspage); dst_zspage = NULL; spin_unlock(&pool->lock); @@ -2034,15 +2025,12 @@ static unsigned long __zs_compact(struct zs_pool *pool, } } - if (src_zspage) { + if (src_zspage) putback_zspage(class, src_zspage); - migrate_write_unlock(src_zspage); - } - if (dst_zspage) { + if (dst_zspage) putback_zspage(class, dst_zspage); - migrate_write_unlock(dst_zspage); - } + spin_unlock(&pool->lock); return pages_freed; From 4ad63e163264063b8cac8f681c20a303933842f2 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 19 Feb 2024 13:33:53 +0000 Subject: [PATCH 293/435] mm/zsmalloc: remove unused zspage->isolated The zspage->isolated is not used anywhere, we don't need to maintain it, which needs to hold the heavy pool lock to update it, so just remove it. Link: https://lkml.kernel.org/r/20240219-b4-szmalloc-migrate-v1-3-34cd49c6545b@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Sergey Senozhatsky Cc: Johannes Weiner Cc: Minchan Kim Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index f2ae7d4c6f21..a48f4651d143 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -116,7 +116,6 @@ #define HUGE_BITS 1 #define FULLNESS_BITS 4 #define CLASS_BITS 8 -#define ISOLATED_BITS 5 #define MAGIC_VAL_BITS 8 #define MAX(a, b) ((a) >= (b) ? (a) : (b)) @@ -246,7 +245,6 @@ struct zspage { unsigned int huge:HUGE_BITS; unsigned int fullness:FULLNESS_BITS; unsigned int class:CLASS_BITS + 1; - unsigned int isolated:ISOLATED_BITS; unsigned int magic:MAGIC_VAL_BITS; }; unsigned int inuse; @@ -1732,17 +1730,6 @@ static void migrate_write_unlock(struct zspage *zspage) } #ifdef CONFIG_COMPACTION -/* Number of isolated subpage for *page migration* in this zspage */ -static void inc_zspage_isolation(struct zspage *zspage) -{ - zspage->isolated++; -} - -static void dec_zspage_isolation(struct zspage *zspage) -{ - VM_BUG_ON(zspage->isolated == 0); - zspage->isolated--; -} static const struct movable_operations zsmalloc_mops; @@ -1771,21 +1758,12 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, static bool zs_page_isolate(struct page *page, isolate_mode_t mode) { - struct zs_pool *pool; - struct zspage *zspage; - /* * Page is locked so zspage couldn't be destroyed. For detail, look at * lock_zspage in free_zspage. */ VM_BUG_ON_PAGE(PageIsolated(page), page); - zspage = get_zspage(page); - pool = zspage->pool; - spin_lock(&pool->lock); - inc_zspage_isolation(zspage); - spin_unlock(&pool->lock); - return true; } @@ -1850,7 +1828,6 @@ static int zs_page_migrate(struct page *newpage, struct page *page, kunmap_atomic(s_addr); replace_sub_page(class, zspage, newpage, page); - dec_zspage_isolation(zspage); /* * Since we complete the data copy and set up new zspage structure, * it's okay to release the pool's lock. @@ -1872,16 +1849,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, static void zs_page_putback(struct page *page) { - struct zs_pool *pool; - struct zspage *zspage; - VM_BUG_ON_PAGE(!PageIsolated(page), page); - - zspage = get_zspage(page); - pool = zspage->pool; - spin_lock(&pool->lock); - dec_zspage_isolation(zspage); - spin_unlock(&pool->lock); } static const struct movable_operations zsmalloc_mops = { From 929e4c3534ecdc6c11921dff5a54597c5d61ca6a Mon Sep 17 00:00:00 2001 From: Zhongkun He Date: Mon, 19 Feb 2024 10:44:53 +0800 Subject: [PATCH 294/435] mm/z3fold: fix the comment for __encode_handle() The comment is confusing that Pool lock should be held as this function accesses first_num above the __encode_handle() because first_num is the element of z3fold_header which is protected by z3fold_header->page_lock. I found the same comment for encode_handle() in zbud.c by accident ,Pool lock should be held as this function accesses first|last_chunks, which is the element of zbud_header and it does not have any lock, so pool lock should be held. Z3fold is based on zbud, maybe the comment come from zbud, but it was wrong, so fix it. Link: https://lkml.kernel.org/r/20240219024453.2240147-1-hezhongkun.hzk@bytedance.com Signed-off-by: Zhongkun He Cc: Johannes Weiner Cc: Miaohe Lin Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 7c76b396b74c..7ab05621052d 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -364,8 +364,9 @@ static inline int __idx(struct z3fold_header *zhdr, enum buddy bud) } /* - * Encodes the handle of a particular buddy within a z3fold page - * Pool lock should be held as this function accesses first_num + * Encodes the handle of a particular buddy within a z3fold page. + * Zhdr->page_lock should be held as this function accesses first_num + * if bud != HEADLESS. */ static unsigned long __encode_handle(struct z3fold_header *zhdr, struct z3fold_buddy_slots *slots, From 55e78c933d747ed44b2b22879e6a55e5af34b9f7 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 20 Feb 2024 10:19:35 +1300 Subject: [PATCH 295/435] mm: zswap: increase reject_compress_poor but not reject_compress_fail if compression returns ENOSPC We used to rely on the returned -ENOSPC of zpool_malloc() to increase reject_compress_poor. But the code wouldn't get to there after commit 744e1885922a ("crypto: scomp - fix req->dst buffer overflow") as the new code will goto out immediately after the special compression case happens. So there might be no longer a chance to execute zpool_malloc now. We are incorrectly increasing zswap_reject_compress_fail instead. Thus, we need to fix the counters handling right after compressions return ENOSPC. This patch also centralizes the counters handling for all of compress_poor, compress_fail and alloc_fail. Link: https://lkml.kernel.org/r/20240219211935.72394-1-21cnbao@gmail.com Fixes: 744e1885922a ("crypto: scomp - fix req->dst buffer overflow") Signed-off-by: Barry Song Cc: Sergey Senozhatsky Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 62fe307521c9..51de79aa8659 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1021,12 +1021,12 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) { struct crypto_acomp_ctx *acomp_ctx; struct scatterlist input, output; + int comp_ret = 0, alloc_ret = 0; unsigned int dlen = PAGE_SIZE; unsigned long handle; struct zpool *zpool; char *buf; gfp_t gfp; - int ret; u8 *dst; acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); @@ -1057,26 +1057,18 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) * but in different threads running on different cpu, we have different * acomp instance, so multiple threads can do (de)compression in parallel. */ - ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); + comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; - if (ret) { - zswap_reject_compress_fail++; + if (comp_ret) goto unlock; - } zpool = zswap_find_zpool(entry); gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; if (zpool_malloc_support_movable(zpool)) gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - ret = zpool_malloc(zpool, dlen, gfp, &handle); - if (ret == -ENOSPC) { - zswap_reject_compress_poor++; + alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle); + if (alloc_ret) goto unlock; - } - if (ret) { - zswap_reject_alloc_fail++; - goto unlock; - } buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); memcpy(buf, dst, dlen); @@ -1086,8 +1078,15 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) entry->length = dlen; unlock: + if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC) + zswap_reject_compress_poor++; + else if (comp_ret) + zswap_reject_compress_fail++; + else if (alloc_ret) + zswap_reject_alloc_fail++; + mutex_unlock(&acomp_ctx->mutex); - return ret == 0; + return comp_ret == 0 && alloc_ret == 0; } static void zswap_decompress(struct zswap_entry *entry, struct page *page) From f6f3f27597864d38bce5b4c346494970ecc19544 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 8 Feb 2024 10:25:08 +0800 Subject: [PATCH 296/435] mm: compaction: early termination in compact_nodes() No need to continue try compact memory if pending fatal signal, allow loop termination earlier in compact_nodes(). The existing fatal_signal_pending() check does make compact_zone() break out of the while loop, but it still enters the next zone/next nid, and some unnecessary functions(eg, lru_add_drain) are called. There was no observable benefit from the new test, it is just found from code inspection when refactoring compact_node(). Link: https://lkml.kernel.org/r/20240208022508.1771534-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/compaction.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index f146478b01bc..52ff6b9344c7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2808,7 +2808,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, * reaching score targets due to various back-off conditions, such as, * contention on per-node or per-zone locks. */ -static void compact_node(pg_data_t *pgdat, bool proactive) +static int compact_node(pg_data_t *pgdat, bool proactive) { int zoneid; struct zone *zone; @@ -2826,6 +2826,9 @@ static void compact_node(pg_data_t *pgdat, bool proactive) if (!populated_zone(zone)) continue; + if (fatal_signal_pending(current)) + return -EINTR; + cc.zone = zone; compact_zone(&cc, NULL); @@ -2837,18 +2840,25 @@ static void compact_node(pg_data_t *pgdat, bool proactive) cc.total_free_scanned); } } + + return 0; } /* Compact all zones of all nodes in the system */ -static void compact_nodes(void) +static int compact_nodes(void) { - int nid; + int ret, nid; /* Flush pending updates to the LRU lists */ lru_add_drain_all(); - for_each_online_node(nid) - compact_node(NODE_DATA(nid), false); + for_each_online_node(nid) { + ret = compact_node(NODE_DATA(nid), false); + if (ret) + return ret; + } + + return 0; } static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, @@ -2894,9 +2904,9 @@ static int sysctl_compaction_handler(struct ctl_table *table, int write, return -EINVAL; if (write) - compact_nodes(); + ret = compact_nodes(); - return 0; + return ret; } #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) From a6a8cdfdde43b6231dd1ed61b8a7a6ee55956376 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 20 Feb 2024 06:53:00 +0000 Subject: [PATCH 297/435] mm/zsmalloc: remove set_zspage_mapping() Patch series "mm/zsmalloc: some cleanup for get/set_zspage_mapping()". The discussion[1] with Sergey shows there are some cleanup works to do in get/set_zspage_mapping(): - the fullness returned from get_zspage_mapping() is not stable outside pool->lock, this usage pattern is confusing, but should be ok in this free_zspage path. - we seldom use the class_idx returned from get_zspage_mapping(), only free_zspage path use to get its class. - set_zspage_mapping() always set the zspage->class, but it's never changed after zspage allocated. [1] https://lore.kernel.org/all/a6c22e30-cf10-4122-91bc-ceb9fb57a5d6@bytedance.com/ This patch (of 3): We only need to update zspage->fullness when insert_zspage(), since zspage->class is never changed after allocated. Link: https://lkml.kernel.org/r/20240220-b4-zsmalloc-cleanup-v1-0-5c5ee4ccdd87@bytedance.com Link: https://lkml.kernel.org/r/20240220-b4-zsmalloc-cleanup-v1-1-5c5ee4ccdd87@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Sergey Senozhatsky Cc: Johannes Weiner Cc: Minchan Kim Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index a48f4651d143..a6653915bf17 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -486,14 +486,6 @@ static struct size_class *zspage_class(struct zs_pool *pool, return pool->size_class[zspage->class]; } -static void set_zspage_mapping(struct zspage *zspage, - unsigned int class_idx, - int fullness) -{ - zspage->class = class_idx; - zspage->fullness = fullness; -} - /* * zsmalloc divides the pool into various size classes where each * class maintains a list of zspages where each zspage is divided @@ -688,6 +680,7 @@ static void insert_zspage(struct size_class *class, { class_stat_inc(class, fullness, 1); list_add(&zspage->list, &class->fullness_list[fullness]); + zspage->fullness = fullness; } /* @@ -725,7 +718,6 @@ static int fix_fullness_group(struct size_class *class, struct zspage *zspage) remove_zspage(class, zspage, currfg); insert_zspage(class, zspage, newfg); - set_zspage_mapping(zspage, class_idx, newfg); out: return newfg; } @@ -1005,6 +997,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, create_page_chain(class, zspage, pages); init_zspage(class, zspage); zspage->pool = pool; + zspage->class = class->index; return zspage; } @@ -1397,7 +1390,6 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) obj = obj_malloc(pool, zspage, handle); newfg = get_fullness_group(class, zspage); insert_zspage(class, zspage, newfg); - set_zspage_mapping(zspage, class->index, newfg); record_obj(handle, obj); atomic_long_add(class->pages_per_zspage, &pool->pages_allocated); class_stat_inc(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage); @@ -1655,7 +1647,6 @@ static int putback_zspage(struct size_class *class, struct zspage *zspage) fullness = get_fullness_group(class, zspage); insert_zspage(class, zspage, fullness); - set_zspage_mapping(zspage, class->index, fullness); return fullness; } From 67eaedc1c52f0ac3ea99e4fc0bc3d7940344b94b Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 20 Feb 2024 06:53:01 +0000 Subject: [PATCH 298/435] mm/zsmalloc: remove_zspage() don't need fullness parameter We must remove_zspage() from its current fullness list, then use insert_zspage() to update its fullness and insert to new fullness list. Obviously, remove_zspage() doesn't need the fullness parameter. Link: https://lkml.kernel.org/r/20240220-b4-zsmalloc-cleanup-v1-2-5c5ee4ccdd87@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Sergey Senozhatsky Cc: Johannes Weiner Cc: Minchan Kim Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index a6653915bf17..c39fac9361d7 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -687,10 +687,10 @@ static void insert_zspage(struct size_class *class, * This function removes the given zspage from the freelist identified * by . */ -static void remove_zspage(struct size_class *class, - struct zspage *zspage, - int fullness) +static void remove_zspage(struct size_class *class, struct zspage *zspage) { + int fullness = zspage->fullness; + VM_BUG_ON(list_empty(&class->fullness_list[fullness])); list_del_init(&zspage->list); @@ -716,7 +716,7 @@ static int fix_fullness_group(struct size_class *class, struct zspage *zspage) if (newfg == currfg) goto out; - remove_zspage(class, zspage, currfg); + remove_zspage(class, zspage); insert_zspage(class, zspage, newfg); out: return newfg; @@ -878,7 +878,7 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class, return; } - remove_zspage(class, zspage, ZS_INUSE_RATIO_0); + remove_zspage(class, zspage); __free_zspage(pool, class, zspage); } @@ -1609,7 +1609,7 @@ static struct zspage *isolate_src_zspage(struct size_class *class) zspage = list_first_entry_or_null(&class->fullness_list[fg], struct zspage, list); if (zspage) { - remove_zspage(class, zspage, fg); + remove_zspage(class, zspage); return zspage; } } @@ -1626,7 +1626,7 @@ static struct zspage *isolate_dst_zspage(struct size_class *class) zspage = list_first_entry_or_null(&class->fullness_list[fg], struct zspage, list); if (zspage) { - remove_zspage(class, zspage, fg); + remove_zspage(class, zspage); return zspage; } } From ce335e0723470203ca3498d70de2eb97e6869a06 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 20 Feb 2024 06:53:02 +0000 Subject: [PATCH 299/435] mm/zsmalloc: remove get_zspage_mapping() Actually we seldom use the class_idx returned from get_zspage_mapping(), only the zspage->fullness is useful, just use zspage->fullness to remove this helper. Note zspage->fullness is not stable outside pool->lock, remove redundant "VM_BUG_ON(fullness != ZS_INUSE_RATIO_0)" in async_free_zspage() since we already have the same VM_BUG_ON() in __free_zspage(), which is safe to access zspage->fullness with pool->lock held. Link: https://lkml.kernel.org/r/20240220-b4-zsmalloc-cleanup-v1-3-5c5ee4ccdd87@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Sergey Senozhatsky Cc: Johannes Weiner Cc: Minchan Kim Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c39fac9361d7..63ec385cd670 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -470,16 +470,6 @@ static inline void set_freeobj(struct zspage *zspage, unsigned int obj) zspage->freeobj = obj; } -static void get_zspage_mapping(struct zspage *zspage, - unsigned int *class_idx, - int *fullness) -{ - BUG_ON(zspage->magic != ZSPAGE_MAGIC); - - *fullness = zspage->fullness; - *class_idx = zspage->class; -} - static struct size_class *zspage_class(struct zs_pool *pool, struct zspage *zspage) { @@ -708,12 +698,10 @@ static void remove_zspage(struct size_class *class, struct zspage *zspage) */ static int fix_fullness_group(struct size_class *class, struct zspage *zspage) { - int class_idx; - int currfg, newfg; + int newfg; - get_zspage_mapping(zspage, &class_idx, &currfg); newfg = get_fullness_group(class, zspage); - if (newfg == currfg) + if (newfg == zspage->fullness) goto out; remove_zspage(class, zspage); @@ -835,15 +823,11 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { struct page *page, *next; - int fg; - unsigned int class_idx; - - get_zspage_mapping(zspage, &class_idx, &fg); assert_spin_locked(&pool->lock); VM_BUG_ON(get_zspage_inuse(zspage)); - VM_BUG_ON(fg != ZS_INUSE_RATIO_0); + VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0); next = page = get_first_page(zspage); do { @@ -1857,8 +1841,6 @@ static void async_free_zspage(struct work_struct *work) { int i; struct size_class *class; - unsigned int class_idx; - int fullness; struct zspage *zspage, *tmp; LIST_HEAD(free_pages); struct zs_pool *pool = container_of(work, struct zs_pool, @@ -1879,10 +1861,8 @@ static void async_free_zspage(struct work_struct *work) list_del(&zspage->list); lock_zspage(zspage); - get_zspage_mapping(zspage, &class_idx, &fullness); - VM_BUG_ON(fullness != ZS_INUSE_RATIO_0); - class = pool->size_class[class_idx]; spin_lock(&pool->lock); + class = zspage_class(pool, zspage); __free_zspage(pool, class, zspage); spin_unlock(&pool->lock); } From fa4b759212ac1c6461f14335e4639e4184f7815c Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 20 Feb 2024 07:38:51 +0000 Subject: [PATCH 300/435] MAINTAINERS: add Chengming Zhou as a zswap reviewer I have been actively contributing to zswap and reviewing zswap patches for a while, and I am already getting CC'd on most of them. So add myself as a reviewer, will continue to work on it and help with the review process. Link: https://lkml.kernel.org/r/20240220073851.865113-1-chengming.zhou@linux.dev Signed-off-by: Chengming Zhou Cc: Johannes Weiner Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 129a237b7880..f4ddbcdfb29a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24426,6 +24426,7 @@ ZSWAP COMPRESSED SWAP CACHING M: Johannes Weiner M: Yosry Ahmed M: Nhat Pham +R: Chengming Zhou L: linux-mm@kvack.org S: Maintained F: Documentation/admin-guide/mm/zswap.rst From 5267fe5d092e80a83740e5a1f6d5638d88ac7309 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 20 Feb 2024 13:32:17 -0500 Subject: [PATCH 301/435] mm/page_alloc: remove unused fpi_flags in free_pages_prepare() Patch series "Enable >0 order folio memory compaction", v7. This patchset enables >0 order folio memory compaction, which is one of the prerequisitions for large folio support[1]. I am aware of that split free pages is necessary for folio migration in compaction, since if >0 order free pages are never split and no order-0 free page is scanned, compaction will end prematurely due to migration returns -ENOMEM. Free page split becomes a must instead of an optimization. lkp ncompare results (on a 8-CPU (Intel Xeon E5-2650 v4 @2.20GHz) 16G VM) for default LRU (-no-mglru) and CONFIG_LRU_GEN are shown at the bottom, copied from V3[4]. In sum, most of vm-scalability applications do not see performance change, and the others see ~4% to ~26% performance boost under default LRU and ~2% to ~6% performance boost under CONFIG_LRU_GEN. Overview === To support >0 order folio compaction, the patchset changes how free pages used for migration are kept during compaction. Free pages used to be split into order-0 pages that are post allocation processed (i.e., PageBuddy flag cleared, page order stored in page->private is zeroed, and page reference is set to 1). Now all free pages are kept in a NR_PAGE_ORDER array of page lists based on their order without post allocation process. When migrate_pages() asks for a new page, one of the free pages, based on the requested page order, is then processed and given out. And THP <2MB would need this feature. [1] https://lore.kernel.org/linux-mm/f8d47176-03a8-99bf-a813-b5942830fd73@arm.com/ [2] https://lore.kernel.org/linux-mm/20231113170157.280181-1-zi.yan@sent.com/ [3] https://lore.kernel.org/linux-mm/20240123034636.1095672-1-zi.yan@sent.com/ [4] https://lore.kernel.org/linux-mm/20240202161554.565023-1-zi.yan@sent.com/ [5] https://lore.kernel.org/linux-mm/20240212163510.859822-1-zi.yan@sent.com/ [6] https://lore.kernel.org/linux-mm/20240214220420.1229173-1-zi.yan@sent.com/ [7] https://lore.kernel.org/linux-mm/20240216170432.1268753-1-zi.yan@sent.com/ This patch (of 4): Commit 0a54864f8dfb ("kasan: remove PG_skip_kasan_poison flag") removes the use of fpi_flags in should_skip_kasan_poison() and fpi_flags is only passed to should_skip_kasan_poison() in free_pages_prepare(). Remove the unused parameter. Link: https://lkml.kernel.org/r/20240220183220.1451315-1-zi.yan@sent.com Link: https://lkml.kernel.org/r/20240220183220.1451315-2-zi.yan@sent.com Signed-off-by: Zi Yan Reviewed-by: Vlastimil Babka Reviewed-by: David Hildenbrand Cc: Adam Manzanares Cc: Baolin Wang Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Kemeng Shi Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Cc: Yin Fengwei Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/page_alloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9faca05d124e..dc59fb225cbf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1061,7 +1061,7 @@ out: * on-demand allocation and then freed again before the deferred pages * initialization is done, but this is not likely to happen. */ -static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +static inline bool should_skip_kasan_poison(struct page *page) { if (IS_ENABLED(CONFIG_KASAN_GENERIC)) return deferred_pages_enabled(); @@ -1081,10 +1081,10 @@ static void kernel_init_pages(struct page *page, int numpages) } static __always_inline bool free_pages_prepare(struct page *page, - unsigned int order, fpi_t fpi_flags) + unsigned int order) { int bad = 0; - bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); + bool skip_kasan_poison = should_skip_kasan_poison(page); bool init = want_init_on_free(); bool compound = PageCompound(page); @@ -1266,7 +1266,7 @@ static void __free_pages_ok(struct page *page, unsigned int order, unsigned long pfn = page_to_pfn(page); struct zone *zone = page_zone(page); - if (!free_pages_prepare(page, order, fpi_flags)) + if (!free_pages_prepare(page, order)) return; /* @@ -2343,7 +2343,7 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn, { int migratetype; - if (!free_pages_prepare(page, order, FPI_NONE)) + if (!free_pages_prepare(page, order)) return false; migratetype = get_pfnblock_migratetype(page, pfn); From ee6f62fd34f0bb99ef93f799bcf5fc6a6b24945b Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 20 Feb 2024 13:32:18 -0500 Subject: [PATCH 302/435] mm/compaction: enable compacting >0 order folios. migrate_pages() supports >0 order folio migration and during compaction, even if compaction_alloc() cannot provide >0 order free pages, migrate_pages() can split the source page and try to migrate the base pages from the split. It can be a baseline and start point for adding support for compacting >0 order folios. Link: https://lkml.kernel.org/r/20240220183220.1451315-3-zi.yan@sent.com Signed-off-by: Zi Yan Suggested-by: Huang Ying Reviewed-by: Baolin Wang Reviewed-by: Vlastimil Babka Tested-by: Baolin Wang Tested-by: Yu Zhao Cc: Adam Manzanares Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kemeng Shi Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/compaction.c | 101 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 76 insertions(+), 25 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 52ff6b9344c7..2ba9ba49b0e9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -40,9 +40,22 @@ static inline void count_compact_events(enum vm_event_item item, long delta) { count_vm_events(item, delta); } + +/* + * order == -1 is expected when compacting proactively via + * 1. /proc/sys/vm/compact_memory + * 2. /sys/devices/system/node/nodex/compact + * 3. /proc/sys/vm/compaction_proactiveness + */ +static inline bool is_via_compact_memory(int order) +{ + return order == -1; +} + #else #define count_compact_event(item) do { } while (0) #define count_compact_events(item, delta) do { } while (0) +static inline bool is_via_compact_memory(int order) { return false; } #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA @@ -816,6 +829,32 @@ static bool too_many_isolated(struct compact_control *cc) return too_many; } +/** + * skip_isolation_on_order() - determine when to skip folio isolation based on + * folio order and compaction target order + * @order: to-be-isolated folio order + * @target_order: compaction target order + * + * This avoids unnecessary folio isolations during compaction. + */ +static bool skip_isolation_on_order(int order, int target_order) +{ + /* + * Unless we are performing global compaction (i.e., + * is_via_compact_memory), skip any folios that are larger than the + * target order: we wouldn't be here if we'd have a free folio with + * the desired target_order, so migrating this folio would likely fail + * later. + */ + if (!is_via_compact_memory(target_order) && order >= target_order) + return true; + /* + * We limit memory compaction to pageblocks and won't try + * creating free blocks of memory that are larger than that. + */ + return order >= pageblock_order; +} + /** * isolate_migratepages_block() - isolate all migrate-able pages within * a single pageblock @@ -947,7 +986,22 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, valid_page = page; } - if (PageHuge(page) && cc->alloc_contig) { + if (PageHuge(page)) { + /* + * skip hugetlbfs if we are not compacting for pages + * bigger than its order. THPs and other compound pages + * are handled below. + */ + if (!cc->alloc_contig) { + const unsigned int order = compound_order(page); + + if (order <= MAX_PAGE_ORDER) { + low_pfn += (1UL << order) - 1; + nr_scanned += (1UL << order) - 1; + } + goto isolate_fail; + } + /* for alloc_contig case */ if (locked) { unlock_page_lruvec_irqrestore(locked, flags); locked = NULL; @@ -1008,21 +1062,24 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* - * Regardless of being on LRU, compound pages such as THP and - * hugetlbfs are not to be compacted unless we are attempting - * an allocation much larger than the huge page size (eg CMA). - * We can potentially save a lot of iterations if we skip them - * at once. The check is racy, but we can consider only valid - * values and the only danger is skipping too much. + * Regardless of being on LRU, compound pages such as THP + * (hugetlbfs is handled above) are not to be compacted unless + * we are attempting an allocation larger than the compound + * page size. We can potentially save a lot of iterations if we + * skip them at once. The check is racy, but we can consider + * only valid values and the only danger is skipping too much. */ if (PageCompound(page) && !cc->alloc_contig) { const unsigned int order = compound_order(page); - if (likely(order <= MAX_PAGE_ORDER)) { - low_pfn += (1UL << order) - 1; - nr_scanned += (1UL << order) - 1; + /* Skip based on page order and compaction target order. */ + if (skip_isolation_on_order(order, cc->order)) { + if (order <= MAX_PAGE_ORDER) { + low_pfn += (1UL << order) - 1; + nr_scanned += (1UL << order) - 1; + } + goto isolate_fail; } - goto isolate_fail; } /* @@ -1165,10 +1222,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* - * folio become large since the non-locked check, - * and it's on LRU. + * Check LRU folio order under the lock */ - if (unlikely(folio_test_large(folio) && !cc->alloc_contig)) { + if (unlikely(skip_isolation_on_order(folio_order(folio), + cc->order) && + !cc->alloc_contig)) { low_pfn += folio_nr_pages(folio) - 1; nr_scanned += folio_nr_pages(folio) - 1; folio_set_lru(folio); @@ -1788,6 +1846,10 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data) struct compact_control *cc = (struct compact_control *)data; struct folio *dst; + /* this makes migrate_pages() split the source page and retry */ + if (folio_test_large(src)) + return NULL; + if (list_empty(&cc->freepages)) { isolate_freepages(cc); @@ -2090,17 +2152,6 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; } -/* - * order == -1 is expected when compacting proactively via - * 1. /proc/sys/vm/compact_memory - * 2. /sys/devices/system/node/nodex/compact - * 3. /proc/sys/vm/compaction_proactiveness - */ -static inline bool is_via_compact_memory(int order) -{ - return order == -1; -} - /* * Determine whether kswapd is (or recently was!) running on this node. * From 733aea0b3a7bba0451dfc19322665de13a5b7af4 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 20 Feb 2024 13:32:19 -0500 Subject: [PATCH 303/435] mm/compaction: add support for >0 order folio memory compaction. Before last commit, memory compaction only migrates order-0 folios and skips >0 order folios. Last commit splits all >0 order folios during compaction. This commit migrates >0 order folios during compaction by keeping isolated free pages at their original size without splitting them into order-0 pages and using them directly during migration process. What is different from the prior implementation: 1. All isolated free pages are kept in a NR_PAGE_ORDERS array of page lists, where each page list stores free pages in the same order. 2. All free pages are not post_alloc_hook() processed nor buddy pages, although their orders are stored in first page's private like buddy pages. 3. During migration, in new page allocation time (i.e., in compaction_alloc()), free pages are then processed by post_alloc_hook(). When migration fails and a new page is returned (i.e., in compaction_free()), free pages are restored by reversing the post_alloc_hook() operations using newly added free_pages_prepare_fpi_none(). Step 3 is done for a latter optimization that splitting and/or merging free pages during compaction becomes easier. Note: without splitting free pages, compaction can end prematurely due to migration will return -ENOMEM even if there is free pages. This happens when no order-0 free page exist and compaction_alloc() return NULL. Link: https://lkml.kernel.org/r/20240220183220.1451315-4-zi.yan@sent.com Signed-off-by: Zi Yan Reviewed-by: Baolin Wang Reviewed-by: Vlastimil Babka Tested-by: Baolin Wang Tested-by: Yu Zhao Cc: Adam Manzanares Cc: David Hildenbrand Cc: Huang Ying Cc: Johannes Weiner Cc: Kemeng Shi Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/compaction.c | 146 +++++++++++++++++++++++++++--------------------- mm/internal.h | 4 +- mm/page_alloc.c | 2 +- 3 files changed, 86 insertions(+), 66 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 2ba9ba49b0e9..61b2c731c9db 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -79,45 +79,56 @@ static inline bool is_via_compact_memory(int order) { return false; } #define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT) #endif -static unsigned long release_freepages(struct list_head *freelist) +static void split_map_pages(struct list_head *freepages) { - struct page *page, *next; - unsigned long high_pfn = 0; - - list_for_each_entry_safe(page, next, freelist, lru) { - unsigned long pfn = page_to_pfn(page); - list_del(&page->lru); - __free_page(page); - if (pfn > high_pfn) - high_pfn = pfn; - } - - return high_pfn; -} - -static void split_map_pages(struct list_head *list) -{ - unsigned int i, order, nr_pages; + unsigned int i, order; struct page *page, *next; LIST_HEAD(tmp_list); - list_for_each_entry_safe(page, next, list, lru) { - list_del(&page->lru); + for (order = 0; order < NR_PAGE_ORDERS; order++) { + list_for_each_entry_safe(page, next, &freepages[order], lru) { + unsigned int nr_pages; - order = page_private(page); - nr_pages = 1 << order; + list_del(&page->lru); - post_alloc_hook(page, order, __GFP_MOVABLE); - if (order) - split_page(page, order); + nr_pages = 1 << order; - for (i = 0; i < nr_pages; i++) { - list_add(&page->lru, &tmp_list); - page++; + post_alloc_hook(page, order, __GFP_MOVABLE); + if (order) + split_page(page, order); + + for (i = 0; i < nr_pages; i++) { + list_add(&page->lru, &tmp_list); + page++; + } + } + list_splice_init(&tmp_list, &freepages[0]); + } +} + +static unsigned long release_free_list(struct list_head *freepages) +{ + int order; + unsigned long high_pfn = 0; + + for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct page *page, *next; + + list_for_each_entry_safe(page, next, &freepages[order], lru) { + unsigned long pfn = page_to_pfn(page); + + list_del(&page->lru); + /* + * Convert free pages into post allocation pages, so + * that we can free them via __free_page. + */ + post_alloc_hook(page, order, __GFP_MOVABLE); + __free_pages(page, order); + if (pfn > high_pfn) + high_pfn = pfn; } } - - list_splice(&tmp_list, list); + return high_pfn; } #ifdef CONFIG_COMPACTION @@ -670,7 +681,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, nr_scanned += isolated - 1; total_isolated += isolated; cc->nr_freepages += isolated; - list_add_tail(&page->lru, freelist); + list_add_tail(&page->lru, &freelist[order]); if (!strict && cc->nr_migratepages <= cc->nr_freepages) { blockpfn += isolated; @@ -735,7 +746,11 @@ isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { unsigned long isolated, pfn, block_start_pfn, block_end_pfn; - LIST_HEAD(freelist); + int order; + struct list_head tmp_freepages[NR_PAGE_ORDERS]; + + for (order = 0; order < NR_PAGE_ORDERS; order++) + INIT_LIST_HEAD(&tmp_freepages[order]); pfn = start_pfn; block_start_pfn = pageblock_start_pfn(pfn); @@ -766,7 +781,7 @@ isolate_freepages_range(struct compact_control *cc, break; isolated = isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, &freelist, 0, true); + block_end_pfn, tmp_freepages, 0, true); /* * In strict mode, isolate_freepages_block() returns 0 if @@ -783,15 +798,15 @@ isolate_freepages_range(struct compact_control *cc, */ } - /* __isolate_free_page() does not map the pages */ - split_map_pages(&freelist); - if (pfn < end_pfn) { /* Loop terminated early, cleanup. */ - release_freepages(&freelist); + release_free_list(tmp_freepages); return 0; } + /* __isolate_free_page() does not map the pages */ + split_map_pages(tmp_freepages); + /* We don't use freelists for anything. */ return pfn; } @@ -1518,7 +1533,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn) if (!page) return; - isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); + isolate_freepages_block(cc, &start_pfn, end_pfn, cc->freepages, 1, false); /* Skip this pageblock in the future as it's full or nearly full */ if (start_pfn == end_pfn && !cc->no_set_skip_hint) @@ -1647,7 +1662,7 @@ static void fast_isolate_freepages(struct compact_control *cc) nr_scanned += nr_isolated - 1; total_isolated += nr_isolated; cc->nr_freepages += nr_isolated; - list_add_tail(&page->lru, &cc->freepages); + list_add_tail(&page->lru, &cc->freepages[order]); count_compact_events(COMPACTISOLATED, nr_isolated); } else { /* If isolation fails, abort the search */ @@ -1724,13 +1739,12 @@ static void isolate_freepages(struct compact_control *cc) unsigned long isolate_start_pfn; /* exact pfn we start at */ unsigned long block_end_pfn; /* end of current pageblock */ unsigned long low_pfn; /* lowest pfn scanner is able to scan */ - struct list_head *freelist = &cc->freepages; unsigned int stride; /* Try a small search of the free lists for a candidate */ fast_isolate_freepages(cc); if (cc->nr_freepages) - goto splitmap; + return; /* * Initialise the free scanner. The starting point is where we last @@ -1790,7 +1804,7 @@ static void isolate_freepages(struct compact_control *cc) /* Found a block suitable for isolating free pages from. */ nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, freelist, stride, false); + block_end_pfn, cc->freepages, stride, false); /* Update the skip hint if the full pageblock was scanned */ if (isolate_start_pfn == block_end_pfn) @@ -1831,10 +1845,6 @@ static void isolate_freepages(struct compact_control *cc) * and the loop terminated due to isolate_start_pfn < low_pfn */ cc->free_pfn = isolate_start_pfn; - -splitmap: - /* __isolate_free_page() does not map the pages */ - split_map_pages(freelist); } /* @@ -1845,24 +1855,22 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data) { struct compact_control *cc = (struct compact_control *)data; struct folio *dst; + int order = folio_order(src); - /* this makes migrate_pages() split the source page and retry */ - if (folio_test_large(src)) - return NULL; - - if (list_empty(&cc->freepages)) { + if (list_empty(&cc->freepages[order])) { isolate_freepages(cc); - - if (list_empty(&cc->freepages)) + if (list_empty(&cc->freepages[order])) return NULL; } - dst = list_entry(cc->freepages.next, struct folio, lru); + dst = list_first_entry(&cc->freepages[order], struct folio, lru); list_del(&dst->lru); - cc->nr_freepages--; - cc->nr_migratepages--; - - return dst; + post_alloc_hook(&dst->page, order, __GFP_MOVABLE); + if (order) + prep_compound_page(&dst->page, order); + cc->nr_freepages -= 1 << order; + cc->nr_migratepages -= 1 << order; + return page_rmappable_folio(&dst->page); } /* @@ -1873,10 +1881,19 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data) static void compaction_free(struct folio *dst, unsigned long data) { struct compact_control *cc = (struct compact_control *)data; + int order = folio_order(dst); + struct page *page = &dst->page; - list_add(&dst->lru, &cc->freepages); - cc->nr_freepages++; - cc->nr_migratepages++; + if (folio_put_testzero(dst)) { + free_pages_prepare(page, order); + list_add(&dst->lru, &cc->freepages[order]); + cc->nr_freepages += 1 << order; + } + cc->nr_migratepages += 1 << order; + /* + * someone else has referenced the page, we cannot take it back to our + * free list. + */ } /* possible outcome of isolate_migratepages */ @@ -2489,6 +2506,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) const bool sync = cc->mode != MIGRATE_ASYNC; bool update_cached; unsigned int nr_succeeded = 0, nr_migratepages; + int order; /* * These counters track activities during zone compaction. Initialize @@ -2498,7 +2516,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) cc->total_free_scanned = 0; cc->nr_migratepages = 0; cc->nr_freepages = 0; - INIT_LIST_HEAD(&cc->freepages); + for (order = 0; order < NR_PAGE_ORDERS; order++) + INIT_LIST_HEAD(&cc->freepages[order]); INIT_LIST_HEAD(&cc->migratepages); cc->migratetype = gfp_migratetype(cc->gfp_mask); @@ -2690,7 +2709,7 @@ out: * so we don't leave any returned pages behind in the next attempt. */ if (cc->nr_freepages > 0) { - unsigned long free_pfn = release_freepages(&cc->freepages); + unsigned long free_pfn = release_free_list(cc->freepages); cc->nr_freepages = 0; VM_BUG_ON(free_pfn == 0); @@ -2709,7 +2728,6 @@ out: trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret); - VM_BUG_ON(!list_empty(&cc->freepages)); VM_BUG_ON(!list_empty(&cc->migratepages)); return ret; diff --git a/mm/internal.h b/mm/internal.h index 1e29c5821a1d..93e229112045 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -447,6 +447,8 @@ extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); +extern bool free_pages_prepare(struct page *page, unsigned int order); + extern int user_min_free_kbytes; extern void free_unref_page(struct page *page, unsigned int order); @@ -481,7 +483,7 @@ int split_free_page(struct page *free_page, * completes when free_pfn <= migrate_pfn */ struct compact_control { - struct list_head freepages; /* List of free pages to migrate to */ + struct list_head freepages[NR_PAGE_ORDERS]; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ unsigned int nr_freepages; /* Number of isolated free pages */ unsigned int nr_migratepages; /* Number of pages to migrate */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dc59fb225cbf..51e13aa605ec 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1080,7 +1080,7 @@ static void kernel_init_pages(struct page *page, int numpages) kasan_enable_current(); } -static __always_inline bool free_pages_prepare(struct page *page, +__always_inline bool free_pages_prepare(struct page *page, unsigned int order) { int bad = 0; From 73318e2cafe53e8b7c8899d990cf8eaca32184d0 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 20 Feb 2024 13:32:20 -0500 Subject: [PATCH 304/435] mm/compaction: optimize >0 order folio compaction with free page split. During migration in a memory compaction, free pages are placed in an array of page lists based on their order. But the desired free page order (i.e., the order of a source page) might not be always present, thus leading to migration failures and premature compaction termination. Split a high order free pages when source migration page has a lower order to increase migration successful rate. Note: merging free pages when a migration fails and a lower order free page is returned via compaction_free() is possible, but there is too much work. Since the free pages are not buddy pages, it is hard to identify these free pages using existing PFN-based page merging algorithm. Link: https://lkml.kernel.org/r/20240220183220.1451315-5-zi.yan@sent.com Signed-off-by: Zi Yan Reviewed-by: Baolin Wang Reviewed-by: Vlastimil Babka Tested-by: Baolin Wang Tested-by: Yu Zhao Cc: Adam Manzanares Cc: David Hildenbrand Cc: Huang Ying Cc: Johannes Weiner Cc: Kemeng Shi Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/compaction.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 61b2c731c9db..1faeffb28720 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1856,15 +1856,40 @@ static struct folio *compaction_alloc(struct folio *src, unsigned long data) struct compact_control *cc = (struct compact_control *)data; struct folio *dst; int order = folio_order(src); + bool has_isolated_pages = false; + int start_order; + struct page *freepage; + unsigned long size; - if (list_empty(&cc->freepages[order])) { - isolate_freepages(cc); - if (list_empty(&cc->freepages[order])) +again: + for (start_order = order; start_order < NR_PAGE_ORDERS; start_order++) + if (!list_empty(&cc->freepages[start_order])) + break; + + /* no free pages in the list */ + if (start_order == NR_PAGE_ORDERS) { + if (has_isolated_pages) return NULL; + isolate_freepages(cc); + has_isolated_pages = true; + goto again; } - dst = list_first_entry(&cc->freepages[order], struct folio, lru); - list_del(&dst->lru); + freepage = list_first_entry(&cc->freepages[start_order], struct page, + lru); + size = 1 << start_order; + + list_del(&freepage->lru); + + while (start_order > order) { + start_order--; + size >>= 1; + + list_add(&freepage[size].lru, &cc->freepages[start_order]); + set_page_private(&freepage[size], start_order); + } + dst = (struct folio *)freepage; + post_alloc_hook(&dst->page, order, __GFP_MOVABLE); if (order) prep_compound_page(&dst->page, order); From b4d3de57cab266fcb536007bf51f42249af364c3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 Jan 2024 13:01:31 +0100 Subject: [PATCH 305/435] shmem: properly report quota mount options Report quota options among the set of mount options. This allows proper user visibility into whether quotas are enabled or not. Link: https://lkml.kernel.org/r/20240129120131.21145-1-jack@suse.cz Fixes: e09764cff44b ("shmem: quota support") Signed-off-by: Jan Kara Reviewed-by: Carlos Maiolino Acked-by: Hugh Dickins Signed-off-by: Andrew Morton --- mm/shmem.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index d7c84ff62186..30c9dc862505 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4265,6 +4265,24 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) mpol_put(mpol); if (sbinfo->noswap) seq_printf(seq, ",noswap"); +#ifdef CONFIG_TMPFS_QUOTA + if (sb_has_quota_active(root->d_sb, USRQUOTA)) + seq_printf(seq, ",usrquota"); + if (sb_has_quota_active(root->d_sb, GRPQUOTA)) + seq_printf(seq, ",grpquota"); + if (sbinfo->qlimits.usrquota_bhardlimit) + seq_printf(seq, ",usrquota_block_hardlimit=%lld", + sbinfo->qlimits.usrquota_bhardlimit); + if (sbinfo->qlimits.grpquota_bhardlimit) + seq_printf(seq, ",grpquota_block_hardlimit=%lld", + sbinfo->qlimits.grpquota_bhardlimit); + if (sbinfo->qlimits.usrquota_ihardlimit) + seq_printf(seq, ",usrquota_inode_hardlimit=%lld", + sbinfo->qlimits.usrquota_ihardlimit); + if (sbinfo->qlimits.grpquota_ihardlimit) + seq_printf(seq, ",grpquota_inode_hardlimit=%lld", + sbinfo->qlimits.grpquota_ihardlimit); +#endif return 0; } From e26f0b939df49d17bda8d5faa4813a255734e8c8 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Wed, 21 Feb 2024 22:10:28 +1300 Subject: [PATCH 306/435] mm/swapfile:__swap_duplicate: drop redundant WRITE_ONCE on swap_map for err cases The code is quite hard to read, we are still writing swap_map after errors happen. Though the written value is as before, has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; [snipped] WRITE_ONCE(p->swap_map[offset], count | has_cache); It would be better to entirely drop the WRITE_ONCE for both performance and readability. [akpm@linux-foundation.org: avoid using goto] Link: https://lkml.kernel.org/r/20240221091028.123122-1-21cnbao@gmail.com Signed-off-by: Barry Song Signed-off-by: Andrew Morton --- mm/swapfile.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index d1bd8d1e17bd..2b3a2d85e350 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3335,7 +3335,8 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) } else err = -ENOENT; /* unused swap entry */ - WRITE_ONCE(p->swap_map[offset], count | has_cache); + if (!err) + WRITE_ONCE(p->swap_map[offset], count | has_cache); unlock_out: unlock_cluster_or_swap_info(p, ci); From cc864ebba5f612ce2960e7e09322a193e8fda0d7 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Wed, 21 Feb 2024 21:50:36 +1300 Subject: [PATCH 307/435] madvise:madvise_cold_or_pageout_pte_range(): allow split while folio_estimated_sharers = 0 The purpose is stopping splitting large folios whose mapcount are 2 or above. Folios whose estimated_shares = 0 should be still perfect and even better candidates than estimated_shares = 1. Consider a pte-mapped large folio with 16 subpages, if we unmap 1-15, the current code will split folios and reclaim them while madvise goes on this folio; but if we unmap subpage 0, we will keep this folio and break. This is weird. For pmd-mapped large folios, we can still use "= 1" as the condition as anyway we have the entire map for it. So this patch doesn't change the condition for pmd-mapped large folios. This also explains why we had been using "= 1" for both pmd-mapped and pte-mapped large folios before commit 07e8c82b5eff ("madvise: convert madvise_cold_or_pageout_pte_range() to use folios"), because in the past, we used the mapcount of the specific subpage, since the subpage had pte present, its mapcount wouldn't be 0. The problem can be quite easily reproduced by writing a small program, unmapping the first subpage of a pte-mapped large folio vs. unmapping anyone other than the first subpage. Link: https://lkml.kernel.org/r/20240221085036.105621-1-21cnbao@gmail.com Fixes: 2f406263e3e9 ("madvise:madvise_cold_or_pageout_pte_range(): don't use mapcount() against large folio for sharing check") Signed-off-by: Barry Song Reviewed-by: David Hildenbrand Reviewed-by: Vishal Moola (Oracle) Cc: Yin Fengwei Cc: Yu Zhao Cc: Ryan Roberts Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Minchan Kim Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/madvise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index cfa5e7288261..abde3edb04f0 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -453,7 +453,7 @@ restart: if (folio_test_large(folio)) { int err; - if (folio_estimated_sharers(folio) != 1) + if (folio_estimated_sharers(folio) > 1) break; if (pageout_anon_only_filter && !folio_test_anon(folio)) break; From 5bb1421422fa8955e741097e6371a1d3d0a9e54e Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Wed, 21 Feb 2024 15:32:27 +0800 Subject: [PATCH 308/435] mm/page_alloc: make bad_range() return bool bad_range() can return bool, so let us change it. Link: https://lkml.kernel.org/r/20240221073227.276234-1-gehao@kylinos.cn Signed-off-by: Hao Ge Signed-off-by: Andrew Morton --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 51e13aa605ec..a9581cdf9649 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -464,19 +464,19 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) /* * Temporary debugging check for pages not lying within a given zone. */ -static int __maybe_unused bad_range(struct zone *zone, struct page *page) +static bool __maybe_unused bad_range(struct zone *zone, struct page *page) { if (page_outside_zone_boundaries(zone, page)) - return 1; + return true; if (zone != page_zone(page)) - return 1; + return true; - return 0; + return false; } #else -static inline int __maybe_unused bad_range(struct zone *zone, struct page *page) +static inline bool __maybe_unused bad_range(struct zone *zone, struct page *page) { - return 0; + return false; } #endif From 6768907eb2823e0e6759df30c9a0e22f5326feb5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Feb 2024 07:36:36 +0100 Subject: [PATCH 309/435] writeback: don't call mapping_set_error on AOP_WRITEPAGE_ACTIVATE Patch series "convert write_cache_pages() to an iterator", v8. This is an evolution of the series Matthew Wilcox originally sent in June 2023, which has changed quite a bit since and now has a while based iterator. This patch (of 14): mapping_set_error should only be called on 0 returns (which it ignores) or a negative error code. writepage_cb ends up being able to call writepage_cb on the magic AOP_WRITEPAGE_ACTIVATE return value from ->writepage which means success but the caller needs to unlock the page. Ignore that and just call mapping_set_error on negative errors. (no fixes tag as this goes back more than 20 years over various renames and refactors so I've given up chasing down the original introduction) Link: https://lkml.kernel.org/r/20240215063649.2164017-1-hch@lst.de Link: https://lkml.kernel.org/r/20240215063649.2164017-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Brian Foster Cc: Christian Brauner Cc: David Howells Cc: Dave Chinner Cc: "Matthew Wilcox (Oracle)" Signed-off-by: Andrew Morton --- mm/page-writeback.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3f255534986a..703e83c69ffe 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2535,7 +2535,9 @@ static int writepage_cb(struct folio *folio, struct writeback_control *wbc, { struct address_space *mapping = data; int ret = mapping->a_ops->writepage(&folio->page, wbc); - mapping_set_error(mapping, ret); + + if (ret < 0) + mapping_set_error(mapping, ret); return ret; } From 2a6e1a8f4cf39cd3d10c52fca639a7d6f30b7004 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Feb 2024 07:36:37 +0100 Subject: [PATCH 310/435] writeback: remove a duplicate prototype for tag_pages_for_writeback [hch@lst.de: split from a larger patch] Link: https://lkml.kernel.org/r/20240215063649.2164017-3-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Brian Foster Reviewed-by: Jan Kara Acked-by: Dave Chinner Cc: Christian Brauner Cc: David Howells Signed-off-by: Andrew Morton --- include/linux/writeback.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 453736fd1d23..4b8cf9e4810b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -363,8 +363,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb); typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc, void *data); -void tag_pages_for_writeback(struct address_space *mapping, - pgoff_t start, pgoff_t end); int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data); From a02829f011b64e6c102929ed55da52e38391e970 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Feb 2024 07:36:38 +0100 Subject: [PATCH 311/435] writeback: fix done_index when hitting the wbc->nr_to_write When write_cache_pages finishes writing out a folio, it fails to update done_index to account for the number of pages in the folio just written. That means when range_cyclic writeback is restarted, it will be restarted at this folio instead of after it as it should. Fix that by updating done_index before breaking out of the loop. Link: https://lkml.kernel.org/r/20240215063649.2164017-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Jan Kara Acked-by: Dave Chinner Cc: Christian Brauner Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Signed-off-by: Andrew Morton --- mm/page-writeback.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 703e83c69ffe..2679176da316 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2505,6 +2505,7 @@ continue_unlock: * keep going until we have written all the pages * we tagged for writeback prior to entering this loop. */ + done_index = folio->index + nr; wbc->nr_to_write -= nr; if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { From 9810325854a31cdba9917674201616d748178161 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Feb 2024 07:36:39 +0100 Subject: [PATCH 312/435] writeback: also update wbc->nr_to_write on writeback failure When exiting write_cache_pages early due to a non-integrity write failure, wbc->nr_to_write currently doesn't account for the folio we just failed to write. This doesn't matter because the callers always ingore the value on a failure, but moving the update to common code will allow to simplify the code, so do it. Link: https://lkml.kernel.org/r/20240215063649.2164017-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Jan Kara Acked-by: Dave Chinner Cc: Christian Brauner Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Signed-off-by: Andrew Morton --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2679176da316..abbee369405a 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2473,6 +2473,7 @@ continue_unlock: trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); error = writepage(folio, wbc, data); nr = folio_nr_pages(folio); + wbc->nr_to_write -= nr; if (unlikely(error)) { /* * Handle errors according to the type of @@ -2506,7 +2507,6 @@ continue_unlock: * we tagged for writeback prior to entering this loop. */ done_index = folio->index + nr; - wbc->nr_to_write -= nr; if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { done = 1; From 5d899d43ed293b854fa4f6adc3138fd6fa12aecf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Feb 2024 07:36:40 +0100 Subject: [PATCH 313/435] writeback: only update ->writeback_index for range_cyclic writeback mapping->writeback_index is only [1] used as the starting point for range_cyclic writeback, so there is no point in updating it for other types of writeback. [1] except for btrfs_defrag_file which does really odd things with mapping->writeback_index. But btrfs doesn't use write_cache_pages at all, so this isn't relevant here. Link: https://lkml.kernel.org/r/20240215063649.2164017-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Jan Kara Acked-by: Dave Chinner Cc: Christian Brauner Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Signed-off-by: Andrew Morton --- mm/page-writeback.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index abbee369405a..f02014007b57 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2403,7 +2403,6 @@ int write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; - int range_whole = 0; xa_mark_t tag; folio_batch_init(&fbatch); @@ -2413,8 +2412,6 @@ int write_cache_pages(struct address_space *mapping, } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { tag_pages_for_writeback(mapping, index, end); @@ -2518,14 +2515,21 @@ continue_unlock: } /* - * If we hit the last page and there is more work to be done: wrap - * back the index back to the start of the file for the next - * time we are called. + * For range cyclic writeback we need to remember where we stopped so + * that we can continue there next time we are called. If we hit the + * last page and there is more work to be done, wrap back to the start + * of the file. + * + * For non-cyclic writeback we always start looking up at the beginning + * of the file if we are called again, which can only happen due to + * -ENOMEM from the file system. */ - if (wbc->range_cyclic && !done) - done_index = 0; - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = done_index; + if (wbc->range_cyclic) { + if (done) + mapping->writeback_index = done_index; + else + mapping->writeback_index = 0; + } return ret; } From f946e0d22e22813ef78af000486d1d7332cf14e4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Feb 2024 07:36:41 +0100 Subject: [PATCH 314/435] writeback: rework the loop termination condition in write_cache_pages Rework the way we deal with the cleanup after the writepage call. First handle the magic AOP_WRITEPAGE_ACTIVATE separately from real error returns to get it out of the way of the actual error handling path. The split the handling on intgrity vs non-integrity branches first, and return early using a goto for the non-ingegrity early loop condition to remove the need for the done and done_index local variables, and for assigning the error to ret when we can just return error directly. Link: https://lkml.kernel.org/r/20240215063649.2164017-7-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Jan Kara Cc: Christian Brauner Cc: Dave Chinner Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Signed-off-by: Andrew Morton --- mm/page-writeback.c | 84 ++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 51 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index f02014007b57..3a1e23bed405 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2396,13 +2396,12 @@ int write_cache_pages(struct address_space *mapping, void *data) { int ret = 0; - int done = 0; int error; struct folio_batch fbatch; + struct folio *folio; int nr_folios; pgoff_t index; pgoff_t end; /* Inclusive */ - pgoff_t done_index; xa_mark_t tag; folio_batch_init(&fbatch); @@ -2419,8 +2418,7 @@ int write_cache_pages(struct address_space *mapping, } else { tag = PAGECACHE_TAG_DIRTY; } - done_index = index; - while (!done && (index <= end)) { + while (index <= end) { int i; nr_folios = filemap_get_folios_tag(mapping, &index, end, @@ -2430,11 +2428,7 @@ int write_cache_pages(struct address_space *mapping, break; for (i = 0; i < nr_folios; i++) { - struct folio *folio = fbatch.folios[i]; - unsigned long nr; - - done_index = folio->index; - + folio = fbatch.folios[i]; folio_lock(folio); /* @@ -2469,45 +2463,32 @@ continue_unlock: trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); error = writepage(folio, wbc, data); - nr = folio_nr_pages(folio); - wbc->nr_to_write -= nr; - if (unlikely(error)) { - /* - * Handle errors according to the type of - * writeback. There's no need to continue for - * background writeback. Just push done_index - * past this page so media errors won't choke - * writeout for the entire file. For integrity - * writeback, we must process the entire dirty - * set regardless of errors because the fs may - * still have state to clear for each page. In - * that case we continue processing and return - * the first error. - */ - if (error == AOP_WRITEPAGE_ACTIVATE) { - folio_unlock(folio); - error = 0; - } else if (wbc->sync_mode != WB_SYNC_ALL) { - ret = error; - done_index = folio->index + nr; - done = 1; - break; - } - if (!ret) - ret = error; + wbc->nr_to_write -= folio_nr_pages(folio); + + if (error == AOP_WRITEPAGE_ACTIVATE) { + folio_unlock(folio); + error = 0; } /* - * We stop writing back only if we are not doing - * integrity sync. In case of integrity sync we have to - * keep going until we have written all the pages - * we tagged for writeback prior to entering this loop. + * For integrity writeback we have to keep going until + * we have written all the folios we tagged for + * writeback above, even if we run past wbc->nr_to_write + * or encounter errors. + * We stash away the first error we encounter in + * wbc->saved_err so that it can be retrieved when we're + * done. This is because the file system may still have + * state to clear for each folio. + * + * For background writeback we exit as soon as we run + * past wbc->nr_to_write or encounter the first error. */ - done_index = folio->index + nr; - if (wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) { - done = 1; - break; + if (wbc->sync_mode == WB_SYNC_ALL) { + if (error && !ret) + ret = error; + } else { + if (error || wbc->nr_to_write <= 0) + goto done; } } folio_batch_release(&fbatch); @@ -2524,14 +2505,15 @@ continue_unlock: * of the file if we are called again, which can only happen due to * -ENOMEM from the file system. */ - if (wbc->range_cyclic) { - if (done) - mapping->writeback_index = done_index; - else - mapping->writeback_index = 0; - } - + if (wbc->range_cyclic) + mapping->writeback_index = 0; return ret; + +done: + if (wbc->range_cyclic) + mapping->writeback_index = folio->index + folio_nr_pages(folio); + folio_batch_release(&fbatch); + return error; } EXPORT_SYMBOL(write_cache_pages); From b1793929b7dc17842a5e4377796b005d3efd61bc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Feb 2024 07:36:42 +0100 Subject: [PATCH 315/435] writeback: factor folio_prepare_writeback() out of write_cache_pages() Reduce write_cache_pages() by about 30 lines; much of it is commentary, but it all bundles nicely into an obvious function. [hch@lst.de: rename should_writeback_folio to folio_prepare_writeback per Jan] Link: https://lkml.kernel.org/r/20240215063649.2164017-8-hch@lst.de Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Jan Kara Acked-by: Dave Chinner Cc: Christian Brauner Cc: David Howells Signed-off-by: Andrew Morton --- mm/page-writeback.c | 61 +++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3a1e23bed405..4bc55ce9597d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2360,6 +2360,38 @@ void tag_pages_for_writeback(struct address_space *mapping, } EXPORT_SYMBOL(tag_pages_for_writeback); +static bool folio_prepare_writeback(struct address_space *mapping, + struct writeback_control *wbc, struct folio *folio) +{ + /* + * Folio truncated or invalidated. We can freely skip it then, + * even for data integrity operations: the folio has disappeared + * concurrently, so there could be no real expectation of this + * data integrity operation even if there is now a new, dirty + * folio at the same pagecache index. + */ + if (unlikely(folio->mapping != mapping)) + return false; + + /* + * Did somebody else write it for us? + */ + if (!folio_test_dirty(folio)) + return false; + + if (folio_test_writeback(folio)) { + if (wbc->sync_mode == WB_SYNC_NONE) + return false; + folio_wait_writeback(folio); + } + BUG_ON(folio_test_writeback(folio)); + + if (!folio_clear_dirty_for_io(folio)) + return false; + + return true; +} + /** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write @@ -2430,38 +2462,13 @@ int write_cache_pages(struct address_space *mapping, for (i = 0; i < nr_folios; i++) { folio = fbatch.folios[i]; folio_lock(folio); - - /* - * Page truncated or invalidated. We can freely skip it - * then, even for data integrity operations: the page - * has disappeared concurrently, so there could be no - * real expectation of this data integrity operation - * even if there is now a new, dirty page at the same - * pagecache address. - */ - if (unlikely(folio->mapping != mapping)) { -continue_unlock: + if (!folio_prepare_writeback(mapping, wbc, folio)) { folio_unlock(folio); continue; } - if (!folio_test_dirty(folio)) { - /* someone wrote it for us */ - goto continue_unlock; - } - - if (folio_test_writeback(folio)) { - if (wbc->sync_mode != WB_SYNC_NONE) - folio_wait_writeback(folio); - else - goto continue_unlock; - } - - BUG_ON(folio_test_writeback(folio)); - if (!folio_clear_dirty_for_io(folio)) - goto continue_unlock; - trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); + error = writepage(folio, wbc, data); wbc->nr_to_write -= folio_nr_pages(folio); From 751e0d559c62a87dc828af22c3c58dc078c734e3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Feb 2024 07:36:43 +0100 Subject: [PATCH 316/435] writeback: factor writeback_get_batch() out of write_cache_pages() This simple helper will be the basis of the writeback iterator. To make this work, we need to remember the current index and end positions in writeback_control. [hch@lst.de: heavily rebased, add helpers to get the tag and end index, don't keep the end index in struct writeback_control] Link: https://lkml.kernel.org/r/20240215063649.2164017-9-hch@lst.de Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Jan Kara Acked-by: Dave Chinner Cc: Christian Brauner Cc: David Howells Signed-off-by: Andrew Morton --- include/linux/writeback.h | 6 ++++ mm/page-writeback.c | 60 +++++++++++++++++++++++++-------------- 2 files changed, 44 insertions(+), 22 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 4b8cf9e4810b..f67b3ea866a0 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -11,6 +11,7 @@ #include #include #include +#include struct bio; @@ -40,6 +41,7 @@ enum writeback_sync_modes { * in a manner such that unspecified fields are set to zero. */ struct writeback_control { + /* public fields that can be set and/or consumed by the caller: */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ @@ -77,6 +79,10 @@ struct writeback_control { */ struct swap_iocb **swap_plug; + /* internal fields used by the ->writepages implementation: */ + struct folio_batch fbatch; + pgoff_t index; + #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ struct inode *inode; /* inode being written out */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4bc55ce9597d..5b8dbbef7137 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2392,6 +2392,29 @@ static bool folio_prepare_writeback(struct address_space *mapping, return true; } +static xa_mark_t wbc_to_tag(struct writeback_control *wbc) +{ + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + return PAGECACHE_TAG_TOWRITE; + return PAGECACHE_TAG_DIRTY; +} + +static pgoff_t wbc_end(struct writeback_control *wbc) +{ + if (wbc->range_cyclic) + return -1; + return wbc->range_end >> PAGE_SHIFT; +} + +static void writeback_get_batch(struct address_space *mapping, + struct writeback_control *wbc) +{ + folio_batch_release(&wbc->fbatch); + cond_resched(); + filemap_get_folios_tag(mapping, &wbc->index, wbc_end(wbc), + wbc_to_tag(wbc), &wbc->fbatch); +} + /** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write @@ -2429,38 +2452,32 @@ int write_cache_pages(struct address_space *mapping, { int ret = 0; int error; - struct folio_batch fbatch; struct folio *folio; - int nr_folios; - pgoff_t index; pgoff_t end; /* Inclusive */ - xa_mark_t tag; - folio_batch_init(&fbatch); if (wbc->range_cyclic) { - index = mapping->writeback_index; /* prev offset */ + wbc->index = mapping->writeback_index; /* prev offset */ end = -1; } else { - index = wbc->range_start >> PAGE_SHIFT; + wbc->index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { - tag_pages_for_writeback(mapping, index, end); - tag = PAGECACHE_TAG_TOWRITE; - } else { - tag = PAGECACHE_TAG_DIRTY; - } - while (index <= end) { + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, wbc->index, end); + + folio_batch_init(&wbc->fbatch); + + while (wbc->index <= end) { int i; - nr_folios = filemap_get_folios_tag(mapping, &index, end, - tag, &fbatch); + writeback_get_batch(mapping, wbc); - if (nr_folios == 0) + if (wbc->fbatch.nr == 0) break; - for (i = 0; i < nr_folios; i++) { - folio = fbatch.folios[i]; + for (i = 0; i < wbc->fbatch.nr; i++) { + folio = wbc->fbatch.folios[i]; + folio_lock(folio); if (!folio_prepare_writeback(mapping, wbc, folio)) { folio_unlock(folio); @@ -2498,8 +2515,6 @@ int write_cache_pages(struct address_space *mapping, goto done; } } - folio_batch_release(&fbatch); - cond_resched(); } /* @@ -2512,6 +2527,7 @@ int write_cache_pages(struct address_space *mapping, * of the file if we are called again, which can only happen due to * -ENOMEM from the file system. */ + folio_batch_release(&wbc->fbatch); if (wbc->range_cyclic) mapping->writeback_index = 0; return ret; @@ -2519,7 +2535,7 @@ int write_cache_pages(struct address_space *mapping, done: if (wbc->range_cyclic) mapping->writeback_index = folio->index + folio_nr_pages(folio); - folio_batch_release(&fbatch); + folio_batch_release(&wbc->fbatch); return error; } EXPORT_SYMBOL(write_cache_pages); From 807d1fe36077decf6f4ef8d4b63a084760a5c5fc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Feb 2024 07:36:44 +0100 Subject: [PATCH 317/435] writeback: simplify the loops in write_cache_pages() Collapse the two nested loops into one. This is needed as a step towards turning this into an iterator. Note that this drops the "index <= end" check in the previous outer loop and just relies on filemap_get_folios_tag() to return 0 entries when index > end. This actually has a subtle implication when end == -1 because then the returned index will be -1 as well and thus if there is page present on index -1, we could be looping indefinitely. But as the comment in filemap_get_folios_tag documents this as already broken anyway we should not worry about it here either. The fix for that would probably a change to the filemap_get_folios_tag() calling convention. [hch@lst.de: update the commit log per Jan] Link: https://lkml.kernel.org/r/20240215063649.2164017-10-hch@lst.de Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Jan Kara Acked-by: Dave Chinner Cc: Christian Brauner Cc: David Howells Signed-off-by: Andrew Morton --- mm/page-writeback.c | 75 ++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5b8dbbef7137..358ce3ade9ad 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2454,6 +2454,7 @@ int write_cache_pages(struct address_space *mapping, int error; struct folio *folio; pgoff_t end; /* Inclusive */ + int i = 0; if (wbc->range_cyclic) { wbc->index = mapping->writeback_index; /* prev offset */ @@ -2467,53 +2468,49 @@ int write_cache_pages(struct address_space *mapping, folio_batch_init(&wbc->fbatch); - while (wbc->index <= end) { - int i; - - writeback_get_batch(mapping, wbc); - + for (;;) { + if (i == wbc->fbatch.nr) { + writeback_get_batch(mapping, wbc); + i = 0; + } if (wbc->fbatch.nr == 0) break; - for (i = 0; i < wbc->fbatch.nr; i++) { - folio = wbc->fbatch.folios[i]; + folio = wbc->fbatch.folios[i++]; - folio_lock(folio); - if (!folio_prepare_writeback(mapping, wbc, folio)) { - folio_unlock(folio); - continue; - } + folio_lock(folio); + if (!folio_prepare_writeback(mapping, wbc, folio)) { + folio_unlock(folio); + continue; + } - trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); + trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); - error = writepage(folio, wbc, data); - wbc->nr_to_write -= folio_nr_pages(folio); + error = writepage(folio, wbc, data); + wbc->nr_to_write -= folio_nr_pages(folio); - if (error == AOP_WRITEPAGE_ACTIVATE) { - folio_unlock(folio); - error = 0; - } + if (error == AOP_WRITEPAGE_ACTIVATE) { + folio_unlock(folio); + error = 0; + } - /* - * For integrity writeback we have to keep going until - * we have written all the folios we tagged for - * writeback above, even if we run past wbc->nr_to_write - * or encounter errors. - * We stash away the first error we encounter in - * wbc->saved_err so that it can be retrieved when we're - * done. This is because the file system may still have - * state to clear for each folio. - * - * For background writeback we exit as soon as we run - * past wbc->nr_to_write or encounter the first error. - */ - if (wbc->sync_mode == WB_SYNC_ALL) { - if (error && !ret) - ret = error; - } else { - if (error || wbc->nr_to_write <= 0) - goto done; - } + /* + * For integrity writeback we have to keep going until we have + * written all the folios we tagged for writeback above, even if + * we run past wbc->nr_to_write or encounter errors. + * We stash away the first error we encounter in wbc->saved_err + * so that it can be retrieved when we're done. This is because + * the file system may still have state to clear for each folio. + * + * For background writeback we exit as soon as we run past + * wbc->nr_to_write or encounter the first error. + */ + if (wbc->sync_mode == WB_SYNC_ALL) { + if (error && !ret) + ret = error; + } else { + if (error || wbc->nr_to_write <= 0) + goto done; } } From 535c5d9dadb327bb35f6d780fa037e0d3dfcb568 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Feb 2024 07:36:45 +0100 Subject: [PATCH 318/435] pagevec: add ability to iterate a queue Add a loop counter inside the folio_batch to let us iterate from 0-nr instead of decrementing nr and treating the batch as a stack. It would generate some very weird and suboptimal I/O patterns for page writeback to iterate over the batch as a stack. Link: https://lkml.kernel.org/r/20240215063649.2164017-11-hch@lst.de Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Jan Kara Acked-by: Dave Chinner Cc: Christian Brauner Cc: David Howells Signed-off-by: Andrew Morton --- include/linux/pagevec.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 87cc678adc85..fcc06c300a72 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -27,6 +27,7 @@ struct folio; */ struct folio_batch { unsigned char nr; + unsigned char i; bool percpu_pvec_drained; struct folio *folios[PAGEVEC_SIZE]; }; @@ -40,12 +41,14 @@ struct folio_batch { static inline void folio_batch_init(struct folio_batch *fbatch) { fbatch->nr = 0; + fbatch->i = 0; fbatch->percpu_pvec_drained = false; } static inline void folio_batch_reinit(struct folio_batch *fbatch) { fbatch->nr = 0; + fbatch->i = 0; } static inline unsigned int folio_batch_count(struct folio_batch *fbatch) @@ -75,6 +78,21 @@ static inline unsigned folio_batch_add(struct folio_batch *fbatch, return folio_batch_space(fbatch); } +/** + * folio_batch_next - Return the next folio to process. + * @fbatch: The folio batch being processed. + * + * Use this function to implement a queue of folios. + * + * Return: The next folio in the queue, or NULL if the queue is empty. + */ +static inline struct folio *folio_batch_next(struct folio_batch *fbatch) +{ + if (fbatch->i == fbatch->nr) + return NULL; + return fbatch->folios[fbatch->i++]; +} + void __folio_batch_release(struct folio_batch *pvec); static inline void folio_batch_release(struct folio_batch *fbatch) From e6d0ab87c8efe9305024d197bc866122d39be306 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Feb 2024 07:36:46 +0100 Subject: [PATCH 319/435] writeback: use the folio_batch queue iterator Instead of keeping our own local iterator variable, use the one just added to folio_batch. Link: https://lkml.kernel.org/r/20240215063649.2164017-12-hch@lst.de Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Jan Kara Acked-by: Dave Chinner Cc: Christian Brauner Cc: David Howells Signed-off-by: Andrew Morton --- mm/page-writeback.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 358ce3ade9ad..3cbe4a7daa35 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2406,13 +2406,21 @@ static pgoff_t wbc_end(struct writeback_control *wbc) return wbc->range_end >> PAGE_SHIFT; } -static void writeback_get_batch(struct address_space *mapping, +static struct folio *writeback_get_folio(struct address_space *mapping, struct writeback_control *wbc) { - folio_batch_release(&wbc->fbatch); - cond_resched(); - filemap_get_folios_tag(mapping, &wbc->index, wbc_end(wbc), - wbc_to_tag(wbc), &wbc->fbatch); + struct folio *folio; + + folio = folio_batch_next(&wbc->fbatch); + if (!folio) { + folio_batch_release(&wbc->fbatch); + cond_resched(); + filemap_get_folios_tag(mapping, &wbc->index, wbc_end(wbc), + wbc_to_tag(wbc), &wbc->fbatch); + folio = folio_batch_next(&wbc->fbatch); + } + + return folio; } /** @@ -2454,7 +2462,6 @@ int write_cache_pages(struct address_space *mapping, int error; struct folio *folio; pgoff_t end; /* Inclusive */ - int i = 0; if (wbc->range_cyclic) { wbc->index = mapping->writeback_index; /* prev offset */ @@ -2469,15 +2476,10 @@ int write_cache_pages(struct address_space *mapping, folio_batch_init(&wbc->fbatch); for (;;) { - if (i == wbc->fbatch.nr) { - writeback_get_batch(mapping, wbc); - i = 0; - } - if (wbc->fbatch.nr == 0) + folio = writeback_get_folio(mapping, wbc); + if (!folio) break; - folio = wbc->fbatch.folios[i++]; - folio_lock(folio); if (!folio_prepare_writeback(mapping, wbc, folio)) { folio_unlock(folio); From a2cbc13638d909e61495f8f4c5968105fff23d9d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Feb 2024 07:36:47 +0100 Subject: [PATCH 320/435] writeback: move the folio_prepare_writeback loop out of write_cache_pages() Move the loop for should-we-write-this-folio to writeback_get_folio. [hch@lst.de: fold loop into existing helper instead of a separate one per Jan] Link: https://lkml.kernel.org/r/20240215063649.2164017-13-hch@lst.de Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Jan Kara Acked-by: Dave Chinner Cc: Christian Brauner Cc: David Howells Signed-off-by: Andrew Morton --- mm/page-writeback.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3cbe4a7daa35..fc421402f818 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2411,6 +2411,7 @@ static struct folio *writeback_get_folio(struct address_space *mapping, { struct folio *folio; +retry: folio = folio_batch_next(&wbc->fbatch); if (!folio) { folio_batch_release(&wbc->fbatch); @@ -2418,8 +2419,17 @@ static struct folio *writeback_get_folio(struct address_space *mapping, filemap_get_folios_tag(mapping, &wbc->index, wbc_end(wbc), wbc_to_tag(wbc), &wbc->fbatch); folio = folio_batch_next(&wbc->fbatch); + if (!folio) + return NULL; } + folio_lock(folio); + if (unlikely(!folio_prepare_writeback(mapping, wbc, folio))) { + folio_unlock(folio); + goto retry; + } + + trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); return folio; } @@ -2480,14 +2490,6 @@ int write_cache_pages(struct address_space *mapping, if (!folio) break; - folio_lock(folio); - if (!folio_prepare_writeback(mapping, wbc, folio)) { - folio_unlock(folio); - continue; - } - - trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); - error = writepage(folio, wbc, data); wbc->nr_to_write -= folio_nr_pages(folio); From cdc150b575cf1176472791cfbe7738708812ea0d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Feb 2024 07:36:48 +0100 Subject: [PATCH 321/435] writeback: add a writeback iterator Refactor the code left in write_cache_pages into an iterator that the file system can call to get the next folio for a writeback operation: struct folio *folio = NULL; while ((folio = writeback_iter(mapping, wbc, folio, &error))) { error = ; } The twist here is that the error value is passed by reference, so that the iterator can restore it when breaking out of the loop. Handling of the magic AOP_WRITEPAGE_ACTIVATE value stays outside the iterator and needs is just kept in the write_cache_pages legacy wrapper. in preparation for eventually killing it off. Heavily based on a for_each* based iterator from Matthew Wilcox. Link: https://lkml.kernel.org/r/20240215063649.2164017-14-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Jan Kara Cc: Christian Brauner Cc: Dave Chinner Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Signed-off-by: Andrew Morton --- include/linux/writeback.h | 4 + mm/page-writeback.c | 188 +++++++++++++++++++++++--------------- 2 files changed, 116 insertions(+), 76 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index f67b3ea866a0..9845cb62e40b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -82,6 +82,7 @@ struct writeback_control { /* internal fields used by the ->writepages implementation: */ struct folio_batch fbatch; pgoff_t index; + int saved_err; #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ @@ -366,6 +367,9 @@ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, bool wb_over_bg_thresh(struct bdi_writeback *wb); +struct folio *writeback_iter(struct address_space *mapping, + struct writeback_control *wbc, struct folio *folio, int *error); + typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc, void *data); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index fc421402f818..3da4345f08a3 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2325,18 +2325,18 @@ void __init page_writeback_init(void) } /** - * tag_pages_for_writeback - tag pages to be written by write_cache_pages + * tag_pages_for_writeback - tag pages to be written by writeback * @mapping: address space structure to write * @start: starting page index * @end: ending page index (inclusive) * * This function scans the page range from @start to @end (inclusive) and tags - * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is - * that write_cache_pages (or whoever calls this function) will then use - * TOWRITE tag to identify pages eligible for writeback. This mechanism is - * used to avoid livelocking of writeback by a process steadily creating new - * dirty pages in the file (thus it is important for this function to be quick - * so that it can tag pages faster than a dirtying process can create them). + * all pages that have DIRTY tag set with a special TOWRITE tag. The caller + * can then use the TOWRITE tag to identify pages eligible for writeback. + * This mechanism is used to avoid livelocking of writeback by a process + * steadily creating new dirty pages in the file (thus it is important for this + * function to be quick so that it can tag pages faster than a dirtying process + * can create them). */ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -2434,69 +2434,68 @@ retry: } /** - * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * writeback_iter - iterate folio of a mapping for writeback * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @writepage: function called for each page - * @data: data passed to writepage function + * @wbc: writeback context + * @folio: previously iterated folio (%NULL to start) + * @error: in-out pointer for writeback errors (see below) * - * If a page is already under I/O, write_cache_pages() skips it, even - * if it's dirty. This is desirable behaviour for memory-cleaning writeback, - * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() - * and msync() need to guarantee that all the data which was dirty at the time - * the call was made get new I/O started against them. If wbc->sync_mode is - * WB_SYNC_ALL then we were called for data integrity and we must wait for - * existing IO to complete. + * This function returns the next folio for the writeback operation described by + * @wbc on @mapping and should be called in a while loop in the ->writepages + * implementation. * - * To avoid livelocks (when other process dirties new pages), we first tag - * pages which should be written back with TOWRITE tag and only then start - * writing them. For data-integrity sync we have to be careful so that we do - * not miss some pages (e.g., because some other process has cleared TOWRITE - * tag we set). The rule we follow is that TOWRITE tag can be cleared only - * by the process clearing the DIRTY tag (and submitting the page for IO). + * To start the writeback operation, %NULL is passed in the @folio argument, and + * for every subsequent iteration the folio returned previously should be passed + * back in. * - * To avoid deadlocks between range_cyclic writeback and callers that hold - * pages in PageWriteback to aggregate IO until write_cache_pages() returns, - * we do not loop back to the start of the file. Doing so causes a page - * lock/page writeback access order inversion - we should only ever lock - * multiple pages in ascending page->index order, and looping back to the start - * of the file violates that rule and causes deadlocks. + * If there was an error in the per-folio writeback inside the writeback_iter() + * loop, @error should be set to the error value. * - * Return: %0 on success, negative error code otherwise + * Once the writeback described in @wbc has finished, this function will return + * %NULL and if there was an error in any iteration restore it to @error. + * + * Note: callers should not manually break out of the loop using break or goto + * but must keep calling writeback_iter() until it returns %NULL. + * + * Return: the folio to write or %NULL if the loop is done. */ -int write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) +struct folio *writeback_iter(struct address_space *mapping, + struct writeback_control *wbc, struct folio *folio, int *error) { - int ret = 0; - int error; - struct folio *folio; - pgoff_t end; /* Inclusive */ + if (!folio) { + folio_batch_init(&wbc->fbatch); + wbc->saved_err = *error = 0; - if (wbc->range_cyclic) { - wbc->index = mapping->writeback_index; /* prev offset */ - end = -1; + /* + * For range cyclic writeback we remember where we stopped so + * that we can continue where we stopped. + * + * For non-cyclic writeback we always start at the beginning of + * the passed in range. + */ + if (wbc->range_cyclic) + wbc->index = mapping->writeback_index; + else + wbc->index = wbc->range_start >> PAGE_SHIFT; + + /* + * To avoid livelocks when other processes dirty new pages, we + * first tag pages which should be written back and only then + * start writing them. + * + * For data-integrity writeback we have to be careful so that we + * do not miss some pages (e.g., because some other process has + * cleared the TOWRITE tag we set). The rule we follow is that + * TOWRITE tag can be cleared only by the process clearing the + * DIRTY tag (and submitting the page for I/O). + */ + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, wbc->index, + wbc_end(wbc)); } else { - wbc->index = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; - } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, wbc->index, end); - - folio_batch_init(&wbc->fbatch); - - for (;;) { - folio = writeback_get_folio(mapping, wbc); - if (!folio) - break; - - error = writepage(folio, wbc, data); wbc->nr_to_write -= folio_nr_pages(folio); - if (error == AOP_WRITEPAGE_ACTIVATE) { - folio_unlock(folio); - error = 0; - } + WARN_ON_ONCE(*error > 0); /* * For integrity writeback we have to keep going until we have @@ -2510,33 +2509,70 @@ int write_cache_pages(struct address_space *mapping, * wbc->nr_to_write or encounter the first error. */ if (wbc->sync_mode == WB_SYNC_ALL) { - if (error && !ret) - ret = error; + if (*error && !wbc->saved_err) + wbc->saved_err = *error; } else { - if (error || wbc->nr_to_write <= 0) + if (*error || wbc->nr_to_write <= 0) goto done; } } - /* - * For range cyclic writeback we need to remember where we stopped so - * that we can continue there next time we are called. If we hit the - * last page and there is more work to be done, wrap back to the start - * of the file. - * - * For non-cyclic writeback we always start looking up at the beginning - * of the file if we are called again, which can only happen due to - * -ENOMEM from the file system. - */ - folio_batch_release(&wbc->fbatch); - if (wbc->range_cyclic) - mapping->writeback_index = 0; - return ret; + folio = writeback_get_folio(mapping, wbc); + if (!folio) { + /* + * To avoid deadlocks between range_cyclic writeback and callers + * that hold pages in PageWriteback to aggregate I/O until + * the writeback iteration finishes, we do not loop back to the + * start of the file. Doing so causes a page lock/page + * writeback access order inversion - we should only ever lock + * multiple pages in ascending page->index order, and looping + * back to the start of the file violates that rule and causes + * deadlocks. + */ + if (wbc->range_cyclic) + mapping->writeback_index = 0; + + /* + * Return the first error we encountered (if there was any) to + * the caller. + */ + *error = wbc->saved_err; + } + return folio; done: if (wbc->range_cyclic) mapping->writeback_index = folio->index + folio_nr_pages(folio); folio_batch_release(&wbc->fbatch); + return NULL; +} + +/** + * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @writepage: function called for each page + * @data: data passed to writepage function + * + * Return: %0 on success, negative error code otherwise + * + * Note: please use writeback_iter() instead. + */ +int write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, writepage_t writepage, + void *data) +{ + struct folio *folio = NULL; + int error; + + while ((folio = writeback_iter(mapping, wbc, folio, &error))) { + error = writepage(folio, wbc, data); + if (error == AOP_WRITEPAGE_ACTIVATE) { + folio_unlock(folio); + error = 0; + } + } + return error; } EXPORT_SYMBOL(write_cache_pages); From c44ed5b7596f0b0421803f16c819ee9c02c50240 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Feb 2024 07:36:49 +0100 Subject: [PATCH 322/435] writeback: remove a use of write_cache_pages() from do_writepages() Use the new writeback_iter() directly instead of indirecting through a callback. [hch@lst.de: ported to the while based iter style] Link: https://lkml.kernel.org/r/20240215063649.2164017-15-hch@lst.de Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Jan Kara Cc: Christian Brauner Cc: Dave Chinner Cc: David Howells Signed-off-by: Andrew Morton --- mm/page-writeback.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3da4345f08a3..3e19b87049db 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2577,15 +2577,25 @@ int write_cache_pages(struct address_space *mapping, } EXPORT_SYMBOL(write_cache_pages); -static int writepage_cb(struct folio *folio, struct writeback_control *wbc, - void *data) +static int writeback_use_writepage(struct address_space *mapping, + struct writeback_control *wbc) { - struct address_space *mapping = data; - int ret = mapping->a_ops->writepage(&folio->page, wbc); + struct folio *folio = NULL; + struct blk_plug plug; + int err; - if (ret < 0) - mapping_set_error(mapping, ret); - return ret; + blk_start_plug(&plug); + while ((folio = writeback_iter(mapping, wbc, folio, &err))) { + err = mapping->a_ops->writepage(&folio->page, wbc); + if (err == AOP_WRITEPAGE_ACTIVATE) { + folio_unlock(folio); + err = 0; + } + mapping_set_error(mapping, err); + } + blk_finish_plug(&plug); + + return err; } int do_writepages(struct address_space *mapping, struct writeback_control *wbc) @@ -2601,12 +2611,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) if (mapping->a_ops->writepages) { ret = mapping->a_ops->writepages(mapping, wbc); } else if (mapping->a_ops->writepage) { - struct blk_plug plug; - - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, writepage_cb, - mapping); - blk_finish_plug(&plug); + ret = writeback_use_writepage(mapping, wbc); } else { /* deal with chardevs and other special files */ ret = 0; From bf9b7df23cb3c03d8cdbcaa58817e60ce06105ac Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 16 Feb 2024 08:55:04 +0000 Subject: [PATCH 323/435] mm/zswap: global lru and shrinker shared by all zswap_pools Patch series "mm/zswap: optimize for dynamic zswap_pools", v3. Dynamic pool creation has been supported for a long time, which maybe not used so much in practice. But with the per-memcg lru merged, the current structure of zswap_pool's lru and shrinker become less optimal. In the current structure, each zswap_pool has its own lru, shrinker and shrink_work, but only the latest zswap_pool will be the current used. 1. When memory has pressure, all shrinkers of zswap_pools will try to shrink its lru list, there is no order between them. 2. When zswap limit hit, only the last zswap_pool's shrink_work will try to shrink its own lru, which is inefficient. A more natural way is to have a global zswap lru shared between all zswap_pools, and so is the shrinker. The code becomes much simpler too. Another optimization is changing zswap_pool kref to percpu_ref, which will be taken reference by every zswap entry. So the scalability is better. Testing kernel build (32 threads) in tmpfs with memory.max=2GB. (zswap shrinker and writeback enabled with one 50GB swapfile, on a 128 CPUs x86-64 machine, below is the average of 5 runs) mm-unstable zswap-global-lru real 63.20 63.12 user 1061.75 1062.95 sys 268.74 264.44 This patch (of 3): Dynamic zswap_pool creation may create/reuse to have multiple zswap_pools in a list, only the first will be current used. Each zswap_pool has its own lru and shrinker, which is not necessary and has its problem: 1. When memory has pressure, all shrinker of zswap_pools will try to shrink its own lru, there is no order between them. 2. When zswap limit hit, only the last zswap_pool's shrink_work will try to shrink its lru list. The rationale here was to try and empty the old pool first so that we can completely drop it. However, since we only support exclusive loads now, the LRU ordering should be entirely decided by the order of stores, so the oldest entries on the LRU will naturally be from the oldest pool. Anyway, having a global lru and shrinker shared by all zswap_pools is better and efficient. Link: https://lkml.kernel.org/r/20240210-zswap-global-lru-v3-0-200495333595@bytedance.com Link: https://lkml.kernel.org/r/20240210-zswap-global-lru-v3-1-200495333595@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Yosry Ahmed Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/zswap.c | 175 +++++++++++++++++++++-------------------------------- 1 file changed, 68 insertions(+), 107 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 51de79aa8659..0141c45a5a6f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -176,15 +176,20 @@ struct zswap_pool { struct kref kref; struct list_head list; struct work_struct release_work; - struct work_struct shrink_work; struct hlist_node node; char tfm_name[CRYPTO_MAX_ALG_NAME]; - struct list_lru list_lru; - struct mem_cgroup *next_shrink; - struct shrinker *shrinker; - atomic_t nr_stored; }; +static struct { + struct list_lru list_lru; + atomic_t nr_stored; + struct shrinker *shrinker; + struct work_struct shrink_work; + struct mem_cgroup *next_shrink; + /* The lock protects next_shrink. */ + spinlock_t shrink_lock; +} zswap; + /* * struct zswap_entry * @@ -301,9 +306,6 @@ static void zswap_update_total_size(void) * pool functions **********************************/ -static void zswap_alloc_shrinker(struct zswap_pool *pool); -static void shrink_worker(struct work_struct *w); - static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { int i; @@ -353,30 +355,16 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) if (ret) goto error; - zswap_alloc_shrinker(pool); - if (!pool->shrinker) - goto error; - - pr_debug("using %s compressor\n", pool->tfm_name); - /* being the current pool takes 1 ref; this func expects the * caller to always add the new pool as the current pool */ kref_init(&pool->kref); INIT_LIST_HEAD(&pool->list); - if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) - goto lru_fail; - shrinker_register(pool->shrinker); - INIT_WORK(&pool->shrink_work, shrink_worker); - atomic_set(&pool->nr_stored, 0); zswap_pool_debug("created", pool); return pool; -lru_fail: - list_lru_destroy(&pool->list_lru); - shrinker_free(pool->shrinker); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); @@ -434,15 +422,8 @@ static void zswap_pool_destroy(struct zswap_pool *pool) zswap_pool_debug("destroying", pool); - shrinker_free(pool->shrinker); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); - list_lru_destroy(&pool->list_lru); - - spin_lock(&zswap_pools_lock); - mem_cgroup_iter_break(NULL, pool->next_shrink); - pool->next_shrink = NULL; - spin_unlock(&zswap_pools_lock); for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) zpool_destroy_pool(pool->zpools[i]); @@ -529,24 +510,6 @@ static struct zswap_pool *zswap_pool_current_get(void) return pool; } -static struct zswap_pool *zswap_pool_last_get(void) -{ - struct zswap_pool *pool, *last = NULL; - - rcu_read_lock(); - - list_for_each_entry_rcu(pool, &zswap_pools, list) - last = pool; - WARN_ONCE(!last && zswap_has_pool, - "%s: no page storage pool!\n", __func__); - if (!zswap_pool_get(last)) - last = NULL; - - rcu_read_unlock(); - - return last; -} - /* type and compressor must be null-terminated */ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) { @@ -816,15 +779,11 @@ void zswap_folio_swapin(struct folio *folio) void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) { - struct zswap_pool *pool; - - /* lock out zswap pools list modification */ - spin_lock(&zswap_pools_lock); - list_for_each_entry(pool, &zswap_pools, list) { - if (pool->next_shrink == memcg) - pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); - } - spin_unlock(&zswap_pools_lock); + /* lock out zswap shrinker walking memcg tree */ + spin_lock(&zswap.shrink_lock); + if (zswap.next_shrink == memcg) + zswap.next_shrink = mem_cgroup_iter(NULL, zswap.next_shrink, NULL); + spin_unlock(&zswap.shrink_lock); } /********************************* @@ -923,9 +882,9 @@ static void zswap_entry_free(struct zswap_entry *entry) if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { - zswap_lru_del(&entry->pool->list_lru, entry); + zswap_lru_del(&zswap.list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); - atomic_dec(&entry->pool->nr_stored); + atomic_dec(&zswap.nr_stored); zswap_pool_put(entry->pool); } if (entry->objcg) { @@ -1287,7 +1246,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, { struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); unsigned long shrink_ret, nr_protected, lru_size; - struct zswap_pool *pool = shrinker->private_data; bool encountered_page_in_swapcache = false; if (!zswap_shrinker_enabled || @@ -1298,7 +1256,7 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, nr_protected = atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - lru_size = list_lru_shrink_count(&pool->list_lru, sc); + lru_size = list_lru_shrink_count(&zswap.list_lru, sc); /* * Abort if we are shrinking into the protected region. @@ -1315,7 +1273,7 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, return SHRINK_STOP; } - shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb, + shrink_ret = list_lru_shrink_walk(&zswap.list_lru, sc, &shrink_memcg_cb, &encountered_page_in_swapcache); if (encountered_page_in_swapcache) @@ -1327,7 +1285,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, static unsigned long zswap_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { - struct zswap_pool *pool = shrinker->private_data; struct mem_cgroup *memcg = sc->memcg; struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; @@ -1341,8 +1298,8 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); #else /* use pool stats instead of memcg stats */ - nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT; - nr_stored = atomic_read(&pool->nr_stored); + nr_backing = zswap_pool_total_size >> PAGE_SHIFT; + nr_stored = atomic_read(&zswap.nr_stored); #endif if (!nr_stored) @@ -1350,7 +1307,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, nr_protected = atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - nr_freeable = list_lru_shrink_count(&pool->list_lru, sc); + nr_freeable = list_lru_shrink_count(&zswap.list_lru, sc); /* * Subtract the lru size by an estimate of the number of pages * that should be protected. @@ -1366,23 +1323,24 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, return mult_frac(nr_freeable, nr_backing, nr_stored); } -static void zswap_alloc_shrinker(struct zswap_pool *pool) +static struct shrinker *zswap_alloc_shrinker(void) { - pool->shrinker = - shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); - if (!pool->shrinker) - return; + struct shrinker *shrinker; - pool->shrinker->private_data = pool; - pool->shrinker->scan_objects = zswap_shrinker_scan; - pool->shrinker->count_objects = zswap_shrinker_count; - pool->shrinker->batch = 0; - pool->shrinker->seeks = DEFAULT_SEEKS; + shrinker = + shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); + if (!shrinker) + return NULL; + + shrinker->scan_objects = zswap_shrinker_scan; + shrinker->count_objects = zswap_shrinker_count; + shrinker->batch = 0; + shrinker->seeks = DEFAULT_SEEKS; + return shrinker; } static int shrink_memcg(struct mem_cgroup *memcg) { - struct zswap_pool *pool; int nid, shrunk = 0; if (!mem_cgroup_zswap_writeback_enabled(memcg)) @@ -1395,32 +1353,25 @@ static int shrink_memcg(struct mem_cgroup *memcg) if (memcg && !mem_cgroup_online(memcg)) return -ENOENT; - pool = zswap_pool_current_get(); - if (!pool) - return -EINVAL; - for_each_node_state(nid, N_NORMAL_MEMORY) { unsigned long nr_to_walk = 1; - shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg, + shrunk += list_lru_walk_one(&zswap.list_lru, nid, memcg, &shrink_memcg_cb, NULL, &nr_to_walk); } - zswap_pool_put(pool); return shrunk ? 0 : -EAGAIN; } static void shrink_worker(struct work_struct *w) { - struct zswap_pool *pool = container_of(w, typeof(*pool), - shrink_work); struct mem_cgroup *memcg; int ret, failures = 0; /* global reclaim will select cgroup in a round-robin fashion. */ do { - spin_lock(&zswap_pools_lock); - pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); - memcg = pool->next_shrink; + spin_lock(&zswap.shrink_lock); + zswap.next_shrink = mem_cgroup_iter(NULL, zswap.next_shrink, NULL); + memcg = zswap.next_shrink; /* * We need to retry if we have gone through a full round trip, or if we @@ -1434,7 +1385,7 @@ static void shrink_worker(struct work_struct *w) * memcg is not killed when we are reclaiming. */ if (!memcg) { - spin_unlock(&zswap_pools_lock); + spin_unlock(&zswap.shrink_lock); if (++failures == MAX_RECLAIM_RETRIES) break; @@ -1444,15 +1395,15 @@ static void shrink_worker(struct work_struct *w) if (!mem_cgroup_tryget_online(memcg)) { /* drop the reference from mem_cgroup_iter() */ mem_cgroup_iter_break(NULL, memcg); - pool->next_shrink = NULL; - spin_unlock(&zswap_pools_lock); + zswap.next_shrink = NULL; + spin_unlock(&zswap.shrink_lock); if (++failures == MAX_RECLAIM_RETRIES) break; goto resched; } - spin_unlock(&zswap_pools_lock); + spin_unlock(&zswap.shrink_lock); ret = shrink_memcg(memcg); /* drop the extra reference */ @@ -1466,7 +1417,6 @@ static void shrink_worker(struct work_struct *w) resched: cond_resched(); } while (!zswap_can_accept()); - zswap_pool_put(pool); } static int zswap_is_page_same_filled(void *ptr, unsigned long *value) @@ -1507,7 +1457,6 @@ bool zswap_store(struct folio *folio) struct zswap_entry *entry, *dupentry; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; - struct zswap_pool *shrink_pool; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1575,7 +1524,7 @@ bool zswap_store(struct folio *folio) if (objcg) { memcg = get_mem_cgroup_from_objcg(objcg); - if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) { + if (memcg_list_lru_alloc(memcg, &zswap.list_lru, GFP_KERNEL)) { mem_cgroup_put(memcg); goto put_pool; } @@ -1606,8 +1555,8 @@ insert_entry: } if (entry->length) { INIT_LIST_HEAD(&entry->lru); - zswap_lru_add(&entry->pool->list_lru, entry); - atomic_inc(&entry->pool->nr_stored); + zswap_lru_add(&zswap.list_lru, entry); + atomic_inc(&zswap.nr_stored); } spin_unlock(&tree->lock); @@ -1639,9 +1588,7 @@ check_old: return false; shrink: - shrink_pool = zswap_pool_last_get(); - if (shrink_pool && !queue_work(shrink_wq, &shrink_pool->shrink_work)) - zswap_pool_put(shrink_pool); + queue_work(shrink_wq, &zswap.shrink_work); goto reject; } @@ -1803,6 +1750,22 @@ static int zswap_setup(void) if (ret) goto hp_fail; + shrink_wq = alloc_workqueue("zswap-shrink", + WQ_UNBOUND|WQ_MEM_RECLAIM, 1); + if (!shrink_wq) + goto shrink_wq_fail; + + zswap.shrinker = zswap_alloc_shrinker(); + if (!zswap.shrinker) + goto shrinker_fail; + if (list_lru_init_memcg(&zswap.list_lru, zswap.shrinker)) + goto lru_fail; + shrinker_register(zswap.shrinker); + + INIT_WORK(&zswap.shrink_work, shrink_worker); + atomic_set(&zswap.nr_stored, 0); + spin_lock_init(&zswap.shrink_lock); + pool = __zswap_pool_create_fallback(); if (pool) { pr_info("loaded using pool %s/%s\n", pool->tfm_name, @@ -1814,19 +1777,17 @@ static int zswap_setup(void) zswap_enabled = false; } - shrink_wq = alloc_workqueue("zswap-shrink", - WQ_UNBOUND|WQ_MEM_RECLAIM, 1); - if (!shrink_wq) - goto fallback_fail; - if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); zswap_init_state = ZSWAP_INIT_SUCCEED; return 0; -fallback_fail: - if (pool) - zswap_pool_destroy(pool); +lru_fail: + shrinker_free(zswap.shrinker); +shrinker_fail: + destroy_workqueue(shrink_wq); +shrink_wq_fail: + cpuhp_remove_multi_state(CPUHP_MM_ZSWP_POOL_PREPARE); hp_fail: kmem_cache_destroy(zswap_entry_cache); cache_fail: From 94ace3fec8477f8a46d08fc57cd1dd5efbd0a32b Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 16 Feb 2024 08:55:05 +0000 Subject: [PATCH 324/435] mm/zswap: change zswap_pool kref to percpu_ref All zswap entries will take a reference of zswap_pool when zswap_store(), and drop it when free. Change it to use the percpu_ref is better for scalability performance. Although percpu_ref use a bit more memory which should be ok for our use case, since we almost have only one zswap_pool to be using. The performance gain is for zswap_store/load hotpath. Testing kernel build (32 threads) in tmpfs with memory.max=2GB. (zswap shrinker and writeback enabled with one 50GB swapfile, on a 128 CPUs x86-64 machine, below is the average of 5 runs) mm-unstable zswap-global-lru real 63.20 63.12 user 1061.75 1062.95 sys 268.74 264.44 [chengming.zhou@linux.dev: fix zswap_pools_lock usages after changing to percpu_ref] Link: https://lkml.kernel.org/r/20240228154954.3028626-1-chengming.zhou@linux.dev Link: https://lkml.kernel.org/r/20240210-zswap-global-lru-v3-2-200495333595@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Nhat Pham Cc: Johannes Weiner Cc: Yosry Ahmed Cc: Chengming Zhou Signed-off-by: Andrew Morton --- mm/zswap.c | 48 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 0141c45a5a6f..da90933c6d20 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -173,7 +173,7 @@ struct crypto_acomp_ctx { struct zswap_pool { struct zpool *zpools[ZSWAP_NR_ZPOOLS]; struct crypto_acomp_ctx __percpu *acomp_ctx; - struct kref kref; + struct percpu_ref ref; struct list_head list; struct work_struct release_work; struct hlist_node node; @@ -305,6 +305,7 @@ static void zswap_update_total_size(void) /********************************* * pool functions **********************************/ +static void __zswap_pool_empty(struct percpu_ref *ref); static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { @@ -358,13 +359,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) /* being the current pool takes 1 ref; this func expects the * caller to always add the new pool as the current pool */ - kref_init(&pool->kref); + ret = percpu_ref_init(&pool->ref, __zswap_pool_empty, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); + if (ret) + goto ref_fail; INIT_LIST_HEAD(&pool->list); zswap_pool_debug("created", pool); return pool; +ref_fail: + cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); @@ -437,8 +443,9 @@ static void __zswap_pool_release(struct work_struct *work) synchronize_rcu(); - /* nobody should have been able to get a kref... */ - WARN_ON(kref_get_unless_zero(&pool->kref)); + /* nobody should have been able to get a ref... */ + WARN_ON(!percpu_ref_is_zero(&pool->ref)); + percpu_ref_exit(&pool->ref); /* pool is now off zswap_pools list and has no references. */ zswap_pool_destroy(pool); @@ -446,13 +453,13 @@ static void __zswap_pool_release(struct work_struct *work) static struct zswap_pool *zswap_pool_current(void); -static void __zswap_pool_empty(struct kref *kref) +static void __zswap_pool_empty(struct percpu_ref *ref) { struct zswap_pool *pool; - pool = container_of(kref, typeof(*pool), kref); + pool = container_of(ref, typeof(*pool), ref); - spin_lock(&zswap_pools_lock); + spin_lock_bh(&zswap_pools_lock); WARN_ON(pool == zswap_pool_current()); @@ -461,7 +468,7 @@ static void __zswap_pool_empty(struct kref *kref) INIT_WORK(&pool->release_work, __zswap_pool_release); schedule_work(&pool->release_work); - spin_unlock(&zswap_pools_lock); + spin_unlock_bh(&zswap_pools_lock); } static int __must_check zswap_pool_get(struct zswap_pool *pool) @@ -469,12 +476,12 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool) if (!pool) return 0; - return kref_get_unless_zero(&pool->kref); + return percpu_ref_tryget(&pool->ref); } static void zswap_pool_put(struct zswap_pool *pool) { - kref_put(&pool->kref, __zswap_pool_empty); + percpu_ref_put(&pool->ref); } static struct zswap_pool *__zswap_pool_current(void) @@ -591,7 +598,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, return -EINVAL; } - spin_lock(&zswap_pools_lock); + spin_lock_bh(&zswap_pools_lock); pool = zswap_pool_find_get(type, compressor); if (pool) { @@ -600,17 +607,28 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, list_del_rcu(&pool->list); } - spin_unlock(&zswap_pools_lock); + spin_unlock_bh(&zswap_pools_lock); if (!pool) pool = zswap_pool_create(type, compressor); + else { + /* + * Restore the initial ref dropped by percpu_ref_kill() + * when the pool was decommissioned and switch it again + * to percpu mode. + */ + percpu_ref_resurrect(&pool->ref); + + /* Drop the ref from zswap_pool_find_get(). */ + zswap_pool_put(pool); + } if (pool) ret = param_set_charp(s, kp); else ret = -EINVAL; - spin_lock(&zswap_pools_lock); + spin_lock_bh(&zswap_pools_lock); if (!ret) { put_pool = zswap_pool_current(); @@ -625,7 +643,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, put_pool = pool; } - spin_unlock(&zswap_pools_lock); + spin_unlock_bh(&zswap_pools_lock); if (!zswap_has_pool && !pool) { /* if initial pool creation failed, and this pool creation also @@ -642,7 +660,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, * or the new pool we failed to add */ if (put_pool) - zswap_pool_put(put_pool); + percpu_ref_kill(&put_pool->ref); return ret; } From 3fb43636876d98ed995e331782e4b06a00d24aee Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 19 Feb 2024 13:10:47 +0900 Subject: [PATCH 325/435] sched/numa, mm: do not try to migrate memory to memoryless nodes Memoryless nodes do not have any memory to migrate to, so, as an optimization, stop trying it. Link: https://lkml.kernel.org/r/20240219041920.1183-1-byungchul@sk.com Link: https://lkml.kernel.org/r/20240216111502.79759-1-byungchul@sk.com Fixes: c574bbe91703 ("NUMA balancing: optimize page placement for memory tiering system") Signed-off-by: Byungchul Park Reviewed-by: Oscar Salvador Reviewed-by: "Huang, Ying" Reviewed-by: Phil Auld Reviewed-by: Davidlohr Bueso Acked-by: David Hildenbrand Cc: Benjamin Segall Cc: Daniel Bristot de Oliveira Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Mel Gorman Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 533547e3c90a..d8d71ad7f9f8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1830,6 +1830,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, int dst_nid = cpu_to_node(dst_cpu); int last_cpupid, this_cpupid; + /* + * Cannot migrate to memoryless nodes. + */ + if (!node_state(dst_nid, N_MEMORY)) + return false; + /* * The pages in slow memory node should be migrated according * to hot/cold instead of private/shared. From f5eec03611d9352c8b19134ecd4742a6ebc45282 Mon Sep 17 00:00:00 2001 From: Matthew Cassell Date: Thu, 22 Feb 2024 19:46:17 +0000 Subject: [PATCH 326/435] mm/util.c: add byte count to __vm_enough_memory failure warning Commit 44b414c8715c5dcf53288 ("mm/util.c: add warning if __vm_enough_memory fails") adds debug information which gives the process id and executable name should __vm_enough_memory() fail. Adding the number of pages to the failure message would benefit application developers and system administrators in debugging overambitious memory requests by providing a point of reference to the amount of memory causing __vm_enough_memory() to fail. 1. Set appropriate kernel tunable to reach code path for failure message: # echo 2 > /proc/sys/vm/overcommit_memory 2. Test program to generate failure - requests 1 gibibyte per iteration: #include #include int main(int argc, char **argv) { for(;;) { if(malloc(1<<30) == NULL) break; printf("allocated 1 GiB\n"); } return 0; } 3. Output: Before: __vm_enough_memory: pid: 1218, comm: a.out, not enough memory for the allocation After: __vm_enough_memory: pid: 1137, comm: a.out, bytes: 1073741824, not enough memory for the allocation Link: https://lkml.kernel.org/r/20240222194617.1255-1-mcassell411@gmail.com Signed-off-by: Matthew Cassell Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/util.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/util.c b/mm/util.c index 5a6a9802583b..6710efaf1802 100644 --- a/mm/util.c +++ b/mm/util.c @@ -942,6 +942,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { long allowed; + unsigned long bytes_failed; vm_acct_memory(pages); @@ -976,8 +977,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: - pr_warn_ratelimited("%s: pid: %d, comm: %s, not enough memory for the allocation\n", - __func__, current->pid, current->comm); + bytes_failed = pages << PAGE_SHIFT; + pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n", + __func__, current->pid, current->comm, bytes_failed); vm_unacct_memory(pages); return -ENOMEM; From cd87d9f58439a114f38cec276f3ec1600729c8bf Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 22 Feb 2024 19:09:10 +0000 Subject: [PATCH 327/435] x86/mm: further clarify switch_mm_irqs_off() documentation Commit accf6b23d1e5a ("x86/mm: clarify "prev" usage in switch_mm_irqs_off()") attempted to clarify x86's usage of the arguments passed by generic code, specifically the "prev" argument the is unused by x86. However, it could have done a better job with the comment above switch_mm_irqs_off(). Rewrite this comment according to Dave Hansen's suggestion. Link: https://lkml.kernel.org/r/20240222190911.1903054-1-yosryahmed@google.com Fixes: 3cfd6625a6cf ("x86/mm: clarify "prev" usage in switch_mm_irqs_off()") Signed-off-by: Yosry Ahmed Suggested-by: Dave Hansen Acked-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov (AMD) Cc: Ingo Molnar Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/mm/tlb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index bf9605caf24f..b67545baf697 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -493,10 +493,10 @@ static inline void cr4_update_pce_mm(struct mm_struct *mm) { } #endif /* - * The "prev" argument passed by the caller does not always match CR3. For - * example, the scheduler passes in active_mm when switching from lazy TLB mode - * to normal mode, but switch_mm_irqs_off() can be called from x86 code without - * updating active_mm. Use cpu_tlbstate.loaded_mm instead. + * This optimizes when not actually switching mm's. Some architectures use the + * 'unused' argument for this optimization, but x86 must use + * 'cpu_tlbstate.loaded_mm' instead because it does not always keep + * 'current->active_mm' up to date. */ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, struct task_struct *tsk) From 15d1ec74b5d7054bba97e8be0b4407cd74976cbd Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 22 Feb 2024 19:09:11 +0000 Subject: [PATCH 328/435] x86/mm: always pass NULL as the first argument of switch_mm_irqs_off() The first argument of switch_mm_irqs_off() is unused by the x86 implementation. Make sure that x86 code never passes a non-NULL value to make this clear. Update the only non violating caller, switch_mm(). Link: https://lkml.kernel.org/r/20240222190911.1903054-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Suggested-by: Dave Hansen Acked-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov (AMD) Cc: Ingo Molnar Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/mm/tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index b67545baf697..51f9f5694105 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -327,7 +327,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, unsigned long flags; local_irq_save(flags); - switch_mm_irqs_off(prev, next, tsk); + switch_mm_irqs_off(NULL, next, tsk); local_irq_restore(flags); } From 77c7a095644efb388ac412d1affcbde75302e52c Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Thu, 22 Feb 2024 17:19:32 +0800 Subject: [PATCH 329/435] mm/page_alloc: make check_new_page() return bool Make check_new_page() return bool like check_new_pages() Link: https://lkml.kernel.org/r/20240222091932.54799-1-gehao@kylinos.cn Signed-off-by: Hao Ge Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a9581cdf9649..5ab921e3d7d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1422,14 +1422,14 @@ static void check_new_page_bad(struct page *page) /* * This page is about to be returned from the page allocator */ -static int check_new_page(struct page *page) +static bool check_new_page(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) - return 0; + return false; check_new_page_bad(page); - return 1; + return true; } static inline bool check_new_pages(struct page *page, unsigned int order) From 997f0ecb11da15602a3d34e10f9ca8418db794d0 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 21 Feb 2024 15:47:28 -0800 Subject: [PATCH 330/435] mm/memory: change vmf_anon_prepare() to be non-static Patch series "Handle hugetlb faults under the VMA lock", v2. It is generally safe to handle hugetlb faults under the VMA lock. The only time this is unsafe is when no anon_vma has been allocated to this vma yet, so we can use vmf_anon_prepare() instead of anon_vma_prepare() to bailout if necessary. This should only happen for the first hugetlb page in the vma. Additionally, this patchset begins to use struct vm_fault within hugetlb_fault(). This works towards cleaning up hugetlb code, and should significantly reduce the number of arguments passed to functions. The last patch in this series may cause ltp hugemmap10 to "fail". This is because vmf_anon_prepare() may bailout with no anon_vma under the VMA lock after allocating a folio for the hugepage. In free_huge_folio(), this folio is completely freed on bailout iff there is a surplus of hugetlb pages. This will remove a folio off the freelist and decrement the number of hugepages while ltp expects these counters to remain unchanged on failure. The rest of the ltp testcases pass. This patch (of 2): In order to handle hugetlb faults under the VMA lock, hugetlb can use vmf_anon_prepare() to ensure we can safely prepare an anon_vma. Change it to be a non-static function so it can be used within hugetlb as well. Link: https://lkml.kernel.org/r/20240221234732.187629-6-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20240221234732.187629-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/internal.h | 1 + mm/memory.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/internal.h b/mm/internal.h index 93e229112045..2b7efffbe4d7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -103,6 +103,7 @@ static inline void wake_throttle_isolated(pg_data_t *pgdat) wake_up(wqh); } +vm_fault_t vmf_anon_prepare(struct vm_fault *vmf); vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); diff --git a/mm/memory.c b/mm/memory.c index 642b4f2be523..1c45b6a42a1b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3280,7 +3280,7 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) return VM_FAULT_RETRY; } -static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) +vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; From 0ca22723e3ffe0d539c5d72603dded8fe6924a89 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 21 Feb 2024 15:47:29 -0800 Subject: [PATCH 331/435] hugetlb: move vm_fault declaration to the top of hugetlb_fault() hugetlb_fault() currently defines a vm_fault to pass to the generic handle_userfault() function. We can move this definition to the top of hugetlb_fault() so that it can be used throughout the rest of the hugetlb fault path. This will help cleanup a number of excess variables and function arguments throughout the stack. Also, since vm_fault already has space to store the page offset, use that instead and get rid of idx. Link: https://lkml.kernel.org/r/20240221234732.187629-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c53a41d07cd3..02673926b3bb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6378,13 +6378,25 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; vm_fault_t ret; u32 hash; - pgoff_t idx; struct folio *folio = NULL; struct folio *pagecache_folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; unsigned long haddr = address & huge_page_mask(h); + struct vm_fault vmf = { + .vma = vma, + .address = haddr, + .real_address = address, + .flags = flags, + .pgoff = vma_hugecache_offset(h, vma, haddr), + /* TODO: Track hugetlb faults using vm_fault */ + + /* + * Some fields may not be initialized, be careful as it may + * be hard to debug if called functions make assumptions + */ + }; /* TODO: Handle faults under the VMA lock */ if (flags & FAULT_FLAG_VMA_LOCK) { @@ -6398,8 +6410,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * the same page in the page cache. */ mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, haddr); - hash = hugetlb_fault_mutex_hash(mapping, idx); + hash = hugetlb_fault_mutex_hash(mapping, vmf.pgoff); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* @@ -6433,8 +6444,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. */ - return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, - entry, flags); + + return hugetlb_no_page(mm, vma, mapping, vmf.pgoff, address, + ptep, entry, flags); } ret = 0; @@ -6480,7 +6492,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, haddr); - pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx); + pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, + vmf.pgoff); if (IS_ERR(pagecache_folio)) pagecache_folio = NULL; } @@ -6495,13 +6508,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { if (!userfaultfd_wp_async(vma)) { - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .real_address = address, - .flags = flags, - }; - spin_unlock(ptl); if (pagecache_folio) { folio_unlock(pagecache_folio); From 7dac0ec8fa3f4977d04974e94806dfa8bdac7ed2 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 21 Feb 2024 15:47:30 -0800 Subject: [PATCH 332/435] hugetlb: pass struct vm_fault through to hugetlb_handle_userfault() Now that hugetlb_fault() has a struct vm_fault, have hugetlb_handle_userfault() use it instead of creating one of its own. This lets us reduce the number of arguments passed to hugetlb_handle_userfault() from 7 to 3, cleaning up the code and stack. Link: https://lkml.kernel.org/r/20240221234732.187629-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 38 +++++++++----------------------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 02673926b3bb..fa695e71152c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6085,39 +6085,21 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping return 0; } -static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, +static inline vm_fault_t hugetlb_handle_userfault(struct vm_fault *vmf, struct address_space *mapping, - pgoff_t idx, - unsigned int flags, - unsigned long haddr, - unsigned long addr, unsigned long reason) { u32 hash; - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .real_address = addr, - .flags = flags, - - /* - * Hard to debug if it ends up being - * used by a callee that assumes - * something about the other - * uninitialized fields... same as in - * memory.c - */ - }; /* * vma_lock and hugetlb_fault_mutex must be dropped before handling * userfault. Also mmap_lock could be dropped due to handling * userfault, any vma operation should be careful from here. */ - hugetlb_vma_unlock_read(vma); - hash = hugetlb_fault_mutex_hash(mapping, idx); + hugetlb_vma_unlock_read(vmf->vma); + hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - return handle_userfault(&vmf, reason); + return handle_userfault(vmf, reason); } /* @@ -6141,7 +6123,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, unsigned long address, pte_t *ptep, - pte_t old_pte, unsigned int flags) + pte_t old_pte, unsigned int flags, + struct vm_fault *vmf) { struct hstate *h = hstate_vma(vma); vm_fault_t ret = VM_FAULT_SIGBUS; @@ -6200,8 +6183,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, goto out; } - return hugetlb_handle_userfault(vma, mapping, idx, flags, - haddr, address, + return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MISSING); } @@ -6273,8 +6255,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, ret = 0; goto out; } - return hugetlb_handle_userfault(vma, mapping, idx, flags, - haddr, address, + return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MINOR); } } @@ -6444,9 +6425,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. */ - return hugetlb_no_page(mm, vma, mapping, vmf.pgoff, address, - ptep, entry, flags); + ptep, entry, flags, &vmf); } ret = 0; From 9acad7ba3e25d11f4c96df1b7312ae89e6faca5c Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 21 Feb 2024 15:47:31 -0800 Subject: [PATCH 333/435] hugetlb: use vmf_anon_prepare() instead of anon_vma_prepare() hugetlb_no_page() and hugetlb_wp() call anon_vma_prepare(). In preparation for hugetlb to safely handle faults under the VMA lock, use vmf_anon_prepare() here instead. Additionally, passing hugetlb_wp() the vm_fault struct from hugetlb_fault() works toward cleaning up the hugetlb code and function stack. Link: https://lkml.kernel.org/r/20240221234732.187629-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fa695e71152c..5e2e4612d458 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5851,7 +5851,8 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int flags, - struct folio *pagecache_folio, spinlock_t *ptl) + struct folio *pagecache_folio, spinlock_t *ptl, + struct vm_fault *vmf) { const bool unshare = flags & FAULT_FLAG_UNSHARE; pte_t pte = huge_ptep_get(ptep); @@ -5985,10 +5986,9 @@ retry_avoidcopy: * When the original hugepage is shared one, it does not have * anon_vma prepared. */ - if (unlikely(anon_vma_prepare(vma))) { - ret = VM_FAULT_OOM; + ret = vmf_anon_prepare(vmf); + if (unlikely(ret)) goto out_release_all; - } if (copy_user_large_folio(new_folio, old_folio, address, vma)) { ret = VM_FAULT_HWPOISON_LARGE; @@ -6228,10 +6228,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, new_pagecache_folio = true; } else { folio_lock(folio); - if (unlikely(anon_vma_prepare(vma))) { - ret = VM_FAULT_OOM; + + ret = vmf_anon_prepare(vmf); + if (unlikely(ret)) goto backout_unlocked; - } anon_rmap = 1; } } else { @@ -6298,7 +6298,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl); + ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl, vmf); } spin_unlock(ptl); @@ -6521,7 +6521,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(entry)) { ret = hugetlb_wp(mm, vma, address, ptep, flags, - pagecache_folio, ptl); + pagecache_folio, ptl, &vmf); goto out_put_page; } else if (likely(flags & FAULT_FLAG_WRITE)) { entry = huge_pte_mkdirty(entry); From 7c43a553792a1701affeef20959dfb2ccb26dcee Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 21 Feb 2024 15:47:32 -0800 Subject: [PATCH 334/435] hugetlb: allow faults to be handled under the VMA lock Hugetlb can now safely handle faults under the VMA lock, so allow it to do so. This patch may cause ltp hugemmap10 to "fail". Hugemmap10 tests hugetlb counters, and expects the counters to remain unchanged on failure to handle a fault. In hugetlb_no_page(), vmf_anon_prepare() may bailout with no anon_vma under the VMA lock after allocating a folio for the hugepage. In free_huge_folio(), this folio is completely freed on bailout iff there is a surplus of hugetlb pages. This will remove a folio off the freelist and decrement the number of hugepages while ltp expects these counters to remain unchanged on failure. Originally this could only happen due to OOM failures, but now it may also occur after we allocate a hugetlb folio without a suitable anon_vma under the VMA lock. This should only happen for the first freshly allocated hugepage in this vma. Link: https://lkml.kernel.org/r/20240221234732.187629-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5e2e4612d458..c3d68671cbae 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6379,12 +6379,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ }; - /* TODO: Handle faults under the VMA lock */ - if (flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } - /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate From fd2f556c4f3bcad46f1ddb461a0eb85c11033a13 Mon Sep 17 00:00:00 2001 From: Vincenzo Mezzela Date: Wed, 21 Feb 2024 13:11:47 -0800 Subject: [PATCH 335/435] selftest: damon: fix minor typos in test logs Patch series "selftests/damon: misc fixes". Misc fixes for DAMON selftets on behalf of the original authors. This patch (of 2): This patch resolves a spelling error in the test log, preventing potential confusion. It is submitted as part of my application to the "Linux Kernel Bug Fixing Spring Unpaid 2024" mentorship program of the Linux Foundation. Link: https://lore.kernel.org/r/20240204122523.14160-1-vincenzo.mezzela@gmail.com Link: https://lkml.kernel.org/r/20240221211148.46522-2-sj@kernel.org Signed-off-by: Vincenzo Mezzela Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Cc: Bernd Edlinger Cc: Javier Carrasco Signed-off-by: Andrew Morton --- .../selftests/damon/sysfs_update_schemes_tried_regions_hang.py | 2 +- .../damon/sysfs_update_schemes_tried_regions_wss_estimation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_hang.py b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_hang.py index 8c690ba1a573..28c887a0108f 100644 --- a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_hang.py +++ b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_hang.py @@ -20,7 +20,7 @@ def main(): err = kdamonds.start() if err != None: - print('kdmaond start failed: %s' % err) + print('kdamond start failed: %s' % err) exit(1) while proc.poll() == None: diff --git a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py index cdbf19b442c9..90ad7409a7a6 100644 --- a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py +++ b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py @@ -23,7 +23,7 @@ def main(): err = kdamonds.start() if err != None: - print('kdmaond start failed: %s' % err) + print('kdamond start failed: %s' % err) exit(1) wss_collected = [] From 21992241cdcff2a279f170a02b0514d76f46b45a Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Wed, 21 Feb 2024 13:11:48 -0800 Subject: [PATCH 336/435] selftests: damon: add access_memory to .gitignore This binary is missing in the .gitignore and stays as an untracked file. Link: https://lore.kernel.org/r/20240214-damon_selftest_gitignore-v1-1-f517d0f9f783@gmail.com Link: https://lkml.kernel.org/r/20240221211148.46522-3-sj@kernel.org Signed-off-by: Javier Carrasco Singed-off-by: SeongJae Park Reported-by: Bernd Edlinger Closes: https://lore.kernel.org/all/AS8P193MB1285C963658008F1B2702AF7E4792@AS8P193MB1285.EURP193.PROD.OUTLOOK.COM/ Reviewed-by: SeongJae Park Cc: Vincenzo Mezzela Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/damon/.gitignore b/tools/testing/selftests/damon/.gitignore index d861701f0327..e65ef9d9cedc 100644 --- a/tools/testing/selftests/damon/.gitignore +++ b/tools/testing/selftests/damon/.gitignore @@ -2,3 +2,4 @@ huge_count_read_write debugfs_target_ids_read_before_terminate_race debugfs_target_ids_pid_leak +access_memory From 72ba14deb40a9e9668ec5e66a341ed657e5215c2 Mon Sep 17 00:00:00 2001 From: Carlos Galo Date: Fri, 23 Feb 2024 17:32:49 +0000 Subject: [PATCH 337/435] mm: update mark_victim tracepoints fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current implementation of the mark_victim tracepoint provides only the process ID (pid) of the victim process. This limitation poses challenges for userspace tools requiring real-time OOM analysis and intervention. Although this information is available from the kernel logs, it’s not the appropriate format to provide OOM notifications. In Android, BPF programs are used with the mark_victim trace events to notify userspace of an OOM kill. For consistency, update the trace event to include the same information about the OOMed victim as the kernel logs. - UID In Android each installed application has a unique UID. Including the `uid` assists in correlating OOM events with specific apps. - Process Name (comm) Enables identification of the affected process. - OOM Score Will allow userspace to get additional insight of the relative kill priority of the OOM victim. In Android, the oom_score_adj is used to categorize app state (foreground, background, etc.), which aids in analyzing user-perceptible impacts of OOM events [1]. - Total VM, RSS Stats, and pgtables Amount of memory used by the victim that will, potentially, be freed up by killing it. [1] https://cs.android.com/android/platform/superproject/main/+/246dc8fc95b6d93afcba5c6d6c133307abb3ac2e:frameworks/base/services/core/java/com/android/server/am/ProcessList.java;l=188-283 Signed-off-by: Carlos Galo Reviewed-by: Steven Rostedt Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Signed-off-by: Andrew Morton --- include/trace/events/oom.h | 36 ++++++++++++++++++++++++++++++++---- mm/oom_kill.c | 6 +++++- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h index 26a11e4a2c36..b799f3bcba82 100644 --- a/include/trace/events/oom.h +++ b/include/trace/events/oom.h @@ -7,6 +7,8 @@ #include #include +#define PG_COUNT_TO_KB(x) ((x) << (PAGE_SHIFT - 10)) + TRACE_EVENT(oom_score_adj_update, TP_PROTO(struct task_struct *task), @@ -72,19 +74,45 @@ TRACE_EVENT(reclaim_retry_zone, ); TRACE_EVENT(mark_victim, - TP_PROTO(int pid), + TP_PROTO(struct task_struct *task, uid_t uid), - TP_ARGS(pid), + TP_ARGS(task, uid), TP_STRUCT__entry( __field(int, pid) + __string(comm, task->comm) + __field(unsigned long, total_vm) + __field(unsigned long, anon_rss) + __field(unsigned long, file_rss) + __field(unsigned long, shmem_rss) + __field(uid_t, uid) + __field(unsigned long, pgtables) + __field(short, oom_score_adj) ), TP_fast_assign( - __entry->pid = pid; + __entry->pid = task->pid; + __assign_str(comm, task->comm); + __entry->total_vm = PG_COUNT_TO_KB(task->mm->total_vm); + __entry->anon_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_ANONPAGES)); + __entry->file_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_FILEPAGES)); + __entry->shmem_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_SHMEMPAGES)); + __entry->uid = uid; + __entry->pgtables = mm_pgtables_bytes(task->mm) >> 10; + __entry->oom_score_adj = task->signal->oom_score_adj; ), - TP_printk("pid=%d", __entry->pid) + TP_printk("pid=%d comm=%s total-vm=%lukB anon-rss=%lukB file-rss:%lukB shmem-rss:%lukB uid=%u pgtables=%lukB oom_score_adj=%hd", + __entry->pid, + __get_str(comm), + __entry->total_vm, + __entry->anon_rss, + __entry->file_rss, + __entry->shmem_rss, + __entry->uid, + __entry->pgtables, + __entry->oom_score_adj + ) ); TRACE_EVENT(wake_reaper, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 91ccd82097c2..8d6a207c3c59 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include "internal.h" @@ -754,6 +755,7 @@ static inline void queue_oom_reaper(struct task_struct *tsk) */ static void mark_oom_victim(struct task_struct *tsk) { + const struct cred *cred; struct mm_struct *mm = tsk->mm; WARN_ON(oom_killer_disabled); @@ -773,7 +775,9 @@ static void mark_oom_victim(struct task_struct *tsk) */ __thaw_task(tsk); atomic_inc(&oom_victims); - trace_mark_victim(tsk->pid); + cred = get_task_cred(tsk); + trace_mark_victim(tsk, cred->uid.val); + put_cred(cred); } /** From 9602e0ce981986acf03324e13fd65abefbe90e23 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 23 Feb 2024 11:55:44 +0800 Subject: [PATCH 338/435] zram: zcomp: remove zcomp_set_max_streams() declaration The zcomp_set_max_streams() is removed from commit 43209ea2d17a ("zram: remove max_comp_streams internals"), remove the declaration. Link: https://lkml.kernel.org/r/20240223035548.2591882-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Chengming Zhou Cc: Huacai Chen Cc: Johannes Weiner Cc: Minchan Kim Cc: Nhat Pham Cc: Sergey Senozhatsky Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- drivers/block/zram/zcomp.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index cdefdef93da8..e9fe63da0e9b 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -39,5 +39,4 @@ int zcomp_compress(struct zcomp_strm *zstrm, int zcomp_decompress(struct zcomp_strm *zstrm, const void *src, unsigned int src_len, void *dst); -bool zcomp_set_max_streams(struct zcomp *comp, int num_strm); #endif /* _ZCOMP_H_ */ From dc24559472a682eb124e869cb110e7a2fd857322 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 23 Feb 2024 17:20:13 +0300 Subject: [PATCH 339/435] lib/stackdepot: off by one in depot_fetch_stack() The stack_pools[] array has DEPOT_MAX_POOLS. The "pools_num" tracks the number of pools which are initialized. See depot_init_pool() for more details. If pool_index == pools_num_cached, this will read one element beyond what we want. If not all the pools are initialized, then the pool will be NULL, triggering a WARN(), and if they are all initialized it will read one element beyond the end of the array. Link: https://lkml.kernel.org/r/361ac881-60b7-471f-91e5-5bf8fe8042b2@moroto.mountain Fixes: b29d31885814 ("lib/stackdepot: store free stack records in a freelist") Signed-off-by: Dan Carpenter Cc: Alexander Potapenko Cc: Andrey Konovalov Signed-off-by: Andrew Morton --- lib/stackdepot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 8c795bb20afb..af6cc19a2003 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -447,7 +447,7 @@ static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle) lockdep_assert_not_held(&pool_lock); - if (pool_index > pools_num_cached) { + if (pool_index >= pools_num_cached) { WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n", pool_index, pools_num_cached, handle); return NULL; From 8cc92a67932febc3f84cf6b3c7d08f27a1b2ebc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paul=20Heidekr=C3=BCger?= Date: Sat, 24 Feb 2024 10:54:14 +0000 Subject: [PATCH 340/435] kasan: fix a2 allocation and remove explicit cast in atomic tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address the additional feedback since 4e76c8cc3378 kasan: add atomic tests (""kasan: add atomic tests") by removing an explicit cast and fixing the size as well as the check of the allocation of `a2`. Link: https://lkml.kernel.org/r/20240224105414.211995-1-paul.heidekrueger@tum.de Link: https://lore.kernel.org/all/20240131210041.686657-1-paul.heidekrueger@tum.de/T/#u Fixes: 4e76c8cc3378 ("kasan: add atomic tests") Signed-off-by: Paul Heidekrüger Closes: https://bugzilla.kernel.org/show_bug.cgi?id=214055 Reviewed-by: Marco Elver Tested-by: Marco Elver Acked-by: Mark Rutland Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/kasan_test.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 2d8ae4fbe63b..7b32be2a3cf0 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -700,7 +700,7 @@ static void kmalloc_uaf3(struct kunit *test) static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe) { - int *i_unsafe = (int *)unsafe; + int *i_unsafe = unsafe; KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*i_unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, WRITE_ONCE(*i_unsafe, 42)); @@ -766,8 +766,8 @@ static void kasan_atomics(struct kunit *test) */ a1 = kzalloc(48, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a1); - a2 = kzalloc(sizeof(int), GFP_KERNEL); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a1); + a2 = kzalloc(sizeof(atomic_long_t), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a2); /* Use atomics to access the redzone. */ kasan_atomics_helper(test, a1 + 48, a2); From ff0b5905a9c9712f36fb7f9dd1be17564483b5d4 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sun, 25 Feb 2024 11:47:51 +1300 Subject: [PATCH 341/435] Docs/mm/damon/design: remove the details for pageout as paddr doesn't use MADV_PAGEOUT The doc needs a fix. As only in the case of virtual address, we are calling madvise() with MADV_PAGEOUT. But in the case of physical address, we are calling reclaim_pages() directly. MADV_PAGEOUT on virtual address is much more aggresive to reclaim memory compared to reclaim_pages() on paddr region. This patch removes the details so that the description can apply to both cases. And we don't need to couple with the implementation details. Link: https://lkml.kernel.org/r/20240224224751.4673-1-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: SeongJae Park Cc: Minchan Kim Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 8c89d26f0baa..5620aab9b385 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -315,7 +315,7 @@ that supports each action are as below. Supported by ``vaddr`` and ``fvaddr`` operations set. - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD``. Supported by ``vaddr`` and ``fvaddr`` operations set. - - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``. + - ``pageout``: Reclaim the region. Supported by ``vaddr``, ``fvaddr`` and ``paddr`` operations set. - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. Supported by ``vaddr`` and ``fvaddr`` operations set. From 912609e96cd728766373d84903f12a6d836de518 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 26 Feb 2024 12:03:20 +0000 Subject: [PATCH 342/435] arm64/mm: export contpte symbols only to GPL users Patch series "Address some contpte nits". These 2 patches address some nits raised by Catalin late in the review cycle for my contpte series [1]. [1] https://lore.kernel.org/linux-mm/20240215103205.2607016-1-ryan.roberts@arm.com/ This patch (of 2): The contpte symbols must be exported since some of the public inline ptep_* APIs are called from modules and these inlines now call the contpte functions. Originally they were exported as EXPORT_SYMBOL() for fear of breaking out-of-tree modules. But we subsequently concluded that EXPORT_SYMBOL_GPL() should be safe since these functions are deeply core mm routines, and any module operating at this level is not going to be able to survive on EXPORT_SYMBOL alone. Link: https://lkml.kernel.org/r/20240226120321.1055731-1-ryan.roberts@arm.com Link: https://lore.kernel.org/linux-mm/f9fc2b31-11cb-4969-8961-9c89fea41b74@nvidia.com/ Link: https://lkml.kernel.org/r/20240226120321.1055731-2-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Acked-by: Catalin Marinas Cc: John Hubbard Cc: Mark Rutland Signed-off-by: Andrew Morton --- arch/arm64/mm/contpte.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 16788f07716d..be0a226c4ff9 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -135,7 +135,7 @@ void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, pte = pte_mkcont(pte); contpte_convert(mm, addr, orig_ptep, pte); } -EXPORT_SYMBOL(__contpte_try_fold); +EXPORT_SYMBOL_GPL(__contpte_try_fold); void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) @@ -150,7 +150,7 @@ void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte = pte_mknoncont(pte); contpte_convert(mm, addr, ptep, pte); } -EXPORT_SYMBOL(__contpte_try_unfold); +EXPORT_SYMBOL_GPL(__contpte_try_unfold); pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte) { @@ -178,7 +178,7 @@ pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte) return orig_pte; } -EXPORT_SYMBOL(contpte_ptep_get); +EXPORT_SYMBOL_GPL(contpte_ptep_get); pte_t contpte_ptep_get_lockless(pte_t *orig_ptep) { @@ -231,7 +231,7 @@ retry: return orig_pte; } -EXPORT_SYMBOL(contpte_ptep_get_lockless); +EXPORT_SYMBOL_GPL(contpte_ptep_get_lockless); void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) @@ -274,7 +274,7 @@ void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, } while (addr != end); } -EXPORT_SYMBOL(contpte_set_ptes); +EXPORT_SYMBOL_GPL(contpte_set_ptes); void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) @@ -282,7 +282,7 @@ void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, contpte_try_unfold_partial(mm, addr, ptep, nr); __clear_full_ptes(mm, addr, ptep, nr, full); } -EXPORT_SYMBOL(contpte_clear_full_ptes); +EXPORT_SYMBOL_GPL(contpte_clear_full_ptes); pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, @@ -291,7 +291,7 @@ pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, contpte_try_unfold_partial(mm, addr, ptep, nr); return __get_and_clear_full_ptes(mm, addr, ptep, nr, full); } -EXPORT_SYMBOL(contpte_get_and_clear_full_ptes); +EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes); int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) @@ -316,7 +316,7 @@ int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, return young; } -EXPORT_SYMBOL(contpte_ptep_test_and_clear_young); +EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young); int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) @@ -337,7 +337,7 @@ int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, return young; } -EXPORT_SYMBOL(contpte_ptep_clear_flush_young); +EXPORT_SYMBOL_GPL(contpte_ptep_clear_flush_young); void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) @@ -355,7 +355,7 @@ void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, contpte_try_unfold_partial(mm, addr, ptep, nr); __wrprotect_ptes(mm, addr, ptep, nr); } -EXPORT_SYMBOL(contpte_wrprotect_ptes); +EXPORT_SYMBOL_GPL(contpte_wrprotect_ptes); int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, @@ -401,4 +401,4 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma, return 1; } -EXPORT_SYMBOL(contpte_ptep_set_access_flags); +EXPORT_SYMBOL_GPL(contpte_ptep_set_access_flags); From 94c18d5f7e0d612ce3fb9cb4aa8cfb1308d57a0a Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 26 Feb 2024 12:03:21 +0000 Subject: [PATCH 343/435] arm64/mm: improve comment in contpte_ptep_get_lockless() Make clear the atmicity/consistency requirements of the API and how we achieve them. Link: https://lore.kernel.org/linux-mm/Zc-Tqqfksho3BHmU@arm.com/ Link: https://lkml.kernel.org/r/20240226120321.1055731-3-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Reviewed-by: Catalin Marinas Cc: John Hubbard Cc: Mark Rutland Signed-off-by: Andrew Morton --- arch/arm64/mm/contpte.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index be0a226c4ff9..1b64b4c3f8bf 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -183,16 +183,20 @@ EXPORT_SYMBOL_GPL(contpte_ptep_get); pte_t contpte_ptep_get_lockless(pte_t *orig_ptep) { /* - * Gather access/dirty bits, which may be populated in any of the ptes - * of the contig range. We may not be holding the PTL, so any contiguous - * range may be unfolded/modified/refolded under our feet. Therefore we - * ensure we read a _consistent_ contpte range by checking that all ptes - * in the range are valid and have CONT_PTE set, that all pfns are - * contiguous and that all pgprots are the same (ignoring access/dirty). - * If we find a pte that is not consistent, then we must be racing with - * an update so start again. If the target pte does not have CONT_PTE - * set then that is considered consistent on its own because it is not - * part of a contpte range. + * The ptep_get_lockless() API requires us to read and return *orig_ptep + * so that it is self-consistent, without the PTL held, so we may be + * racing with other threads modifying the pte. Usually a READ_ONCE() + * would suffice, but for the contpte case, we also need to gather the + * access and dirty bits from across all ptes in the contiguous block, + * and we can't read all of those neighbouring ptes atomically, so any + * contiguous range may be unfolded/modified/refolded under our feet. + * Therefore we ensure we read a _consistent_ contpte range by checking + * that all ptes in the range are valid and have CONT_PTE set, that all + * pfns are contiguous and that all pgprots are the same (ignoring + * access/dirty). If we find a pte that is not consistent, then we must + * be racing with an update so start again. If the target pte does not + * have CONT_PTE set then that is considered consistent on its own + * because it is not part of a contpte range. */ pgprot_t orig_prot; From 2864f3d0f5831a50253befc5d4583868268b7153 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 26 Feb 2024 13:57:39 +1300 Subject: [PATCH 344/435] mm: madvise: pageout: ignore references rather than clearing young While doing MADV_PAGEOUT, the current code will clear PTE young so that vmscan won't read young flags to allow the reclamation of madvised folios to go ahead. It seems we can do it by directly ignoring references, thus we can remove tlb flush in madvise and rmap overhead in vmscan. Regarding the side effect, in the original code, if a parallel thread runs side by side to access the madvised memory with the thread doing madvise, folios will get a chance to be re-activated by vmscan (though the time gap is actually quite small since checking PTEs is done immediately after clearing PTEs young). But with this patch, they will still be reclaimed. But this behaviour doing PAGEOUT and doing access at the same time is quite silly like DoS. So probably, we don't need to care. Or ignoring the new access during the quite small time gap is even better. For DAMON's DAMOS_PAGEOUT based on physical address region, we still keep its behaviour as is since a physical address might be mapped by multiple processes. MADV_PAGEOUT based on virtual address is actually much more aggressive on reclamation. To untouch paddr's DAMOS_PAGEOUT, we simply pass ignore_references as false in reclaim_pages(). A microbench as below has shown 6% decrement on the latency of MADV_PAGEOUT, #define PGSIZE 4096 main() { int i; #define SIZE 512*1024*1024 volatile long *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); for (i = 0; i < SIZE/sizeof(long); i += PGSIZE / sizeof(long)) p[i] = 0x11; madvise(p, SIZE, MADV_PAGEOUT); } w/o patch w/ patch root@10:~# time ./a.out root@10:~# time ./a.out real 0m49.634s real 0m46.334s user 0m0.637s user 0m0.648s sys 0m47.434s sys 0m44.265s Link: https://lkml.kernel.org/r/20240226005739.24350-1-21cnbao@gmail.com Signed-off-by: Barry Song Acked-by: Minchan Kim Cc: SeongJae Park Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 2 +- mm/internal.h | 2 +- mm/madvise.c | 8 ++++---- mm/vmscan.c | 12 +++++++----- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 081e2a325778..5e6dc312072c 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -249,7 +249,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) put_folio: folio_put(folio); } - applied = reclaim_pages(&folio_list); + applied = reclaim_pages(&folio_list, false); cond_resched(); return applied * PAGE_SIZE; } diff --git a/mm/internal.h b/mm/internal.h index 2b7efffbe4d7..cb4eabb1051d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -869,7 +869,7 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); -unsigned long reclaim_pages(struct list_head *folio_list); +unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references); unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list); /* The ALLOC_WMARK bits are used as an index to zone->watermark */ diff --git a/mm/madvise.c b/mm/madvise.c index abde3edb04f0..44a498c94158 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -386,7 +386,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, return 0; } - if (pmd_young(orig_pmd)) { + if (!pageout && pmd_young(orig_pmd)) { pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); @@ -410,7 +410,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, huge_unlock: spin_unlock(ptl); if (pageout) - reclaim_pages(&folio_list); + reclaim_pages(&folio_list, true); return 0; } @@ -490,7 +490,7 @@ restart: VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - if (pte_young(ptent)) { + if (!pageout && pte_young(ptent)) { ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); ptent = pte_mkold(ptent); @@ -524,7 +524,7 @@ restart: pte_unmap_unlock(start_pte, ptl); } if (pageout) - reclaim_pages(&folio_list); + reclaim_pages(&folio_list, true); cond_resched(); return 0; diff --git a/mm/vmscan.c b/mm/vmscan.c index 198d623054c5..dcfbe617e9ef 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2085,7 +2085,8 @@ static void shrink_active_list(unsigned long nr_to_scan, } static unsigned int reclaim_folio_list(struct list_head *folio_list, - struct pglist_data *pgdat) + struct pglist_data *pgdat, + bool ignore_references) { struct reclaim_stat dummy_stat; unsigned int nr_reclaimed; @@ -2098,7 +2099,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list, .no_demotion = 1, }; - nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false); + nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, ignore_references); while (!list_empty(folio_list)) { folio = lru_to_folio(folio_list); list_del(&folio->lru); @@ -2108,7 +2109,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list, return nr_reclaimed; } -unsigned long reclaim_pages(struct list_head *folio_list) +unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references) { int nid; unsigned int nr_reclaimed = 0; @@ -2130,11 +2131,12 @@ unsigned long reclaim_pages(struct list_head *folio_list) continue; } - nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid), + ignore_references); nid = folio_nid(lru_to_folio(folio_list)); } while (!list_empty(folio_list)); - nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid), ignore_references); memalloc_noreclaim_restore(noreclaim_flag); From 772dd0342727cc3c3b676b5d97c99708e75730a2 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 23 Feb 2024 17:58:00 -0800 Subject: [PATCH 345/435] mm: enumerate all gfp flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce GFP bits enumeration to let compiler track the number of used bits (which depends on the config options) instead of hardcoding them. That simplifies __GFP_BITS_SHIFT calculation. Link: https://lkml.kernel.org/r/20240224015800.2569851-1-surenb@google.com Suggested-by: Petr Tesařík Signed-off-by: Suren Baghdasaryan Reviewed-by: Kees Cook Reviewed-by: Pasha Tatashin Acked-by: Michal Hocko Cc: Kent Overstreet Cc: Petr Tesarik Signed-off-by: Andrew Morton --- include/linux/gfp_types.h | 94 ++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 30 deletions(-) diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 1b6053da8754..868c8fb1bbc1 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -21,44 +21,78 @@ typedef unsigned int __bitwise gfp_t; * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c */ -/* Plain integer GFP bitmasks. Do not use this directly. */ -#define ___GFP_DMA 0x01u -#define ___GFP_HIGHMEM 0x02u -#define ___GFP_DMA32 0x04u -#define ___GFP_MOVABLE 0x08u -#define ___GFP_RECLAIMABLE 0x10u -#define ___GFP_HIGH 0x20u -#define ___GFP_IO 0x40u -#define ___GFP_FS 0x80u -#define ___GFP_ZERO 0x100u -/* 0x200u unused */ -#define ___GFP_DIRECT_RECLAIM 0x400u -#define ___GFP_KSWAPD_RECLAIM 0x800u -#define ___GFP_WRITE 0x1000u -#define ___GFP_NOWARN 0x2000u -#define ___GFP_RETRY_MAYFAIL 0x4000u -#define ___GFP_NOFAIL 0x8000u -#define ___GFP_NORETRY 0x10000u -#define ___GFP_MEMALLOC 0x20000u -#define ___GFP_COMP 0x40000u -#define ___GFP_NOMEMALLOC 0x80000u -#define ___GFP_HARDWALL 0x100000u -#define ___GFP_THISNODE 0x200000u -#define ___GFP_ACCOUNT 0x400000u -#define ___GFP_ZEROTAGS 0x800000u +enum { + ___GFP_DMA_BIT, + ___GFP_HIGHMEM_BIT, + ___GFP_DMA32_BIT, + ___GFP_MOVABLE_BIT, + ___GFP_RECLAIMABLE_BIT, + ___GFP_HIGH_BIT, + ___GFP_IO_BIT, + ___GFP_FS_BIT, + ___GFP_ZERO_BIT, + ___GFP_UNUSED_BIT, /* 0x200u unused */ + ___GFP_DIRECT_RECLAIM_BIT, + ___GFP_KSWAPD_RECLAIM_BIT, + ___GFP_WRITE_BIT, + ___GFP_NOWARN_BIT, + ___GFP_RETRY_MAYFAIL_BIT, + ___GFP_NOFAIL_BIT, + ___GFP_NORETRY_BIT, + ___GFP_MEMALLOC_BIT, + ___GFP_COMP_BIT, + ___GFP_NOMEMALLOC_BIT, + ___GFP_HARDWALL_BIT, + ___GFP_THISNODE_BIT, + ___GFP_ACCOUNT_BIT, + ___GFP_ZEROTAGS_BIT, #ifdef CONFIG_KASAN_HW_TAGS -#define ___GFP_SKIP_ZERO 0x1000000u -#define ___GFP_SKIP_KASAN 0x2000000u + ___GFP_SKIP_ZERO_BIT, + ___GFP_SKIP_KASAN_BIT, +#endif +#ifdef CONFIG_LOCKDEP + ___GFP_NOLOCKDEP_BIT, +#endif + ___GFP_LAST_BIT +}; + +/* Plain integer GFP bitmasks. Do not use this directly. */ +#define ___GFP_DMA BIT(___GFP_DMA_BIT) +#define ___GFP_HIGHMEM BIT(___GFP_HIGHMEM_BIT) +#define ___GFP_DMA32 BIT(___GFP_DMA32_BIT) +#define ___GFP_MOVABLE BIT(___GFP_MOVABLE_BIT) +#define ___GFP_RECLAIMABLE BIT(___GFP_RECLAIMABLE_BIT) +#define ___GFP_HIGH BIT(___GFP_HIGH_BIT) +#define ___GFP_IO BIT(___GFP_IO_BIT) +#define ___GFP_FS BIT(___GFP_FS_BIT) +#define ___GFP_ZERO BIT(___GFP_ZERO_BIT) +/* 0x200u unused */ +#define ___GFP_DIRECT_RECLAIM BIT(___GFP_DIRECT_RECLAIM_BIT) +#define ___GFP_KSWAPD_RECLAIM BIT(___GFP_KSWAPD_RECLAIM_BIT) +#define ___GFP_WRITE BIT(___GFP_WRITE_BIT) +#define ___GFP_NOWARN BIT(___GFP_NOWARN_BIT) +#define ___GFP_RETRY_MAYFAIL BIT(___GFP_RETRY_MAYFAIL_BIT) +#define ___GFP_NOFAIL BIT(___GFP_NOFAIL_BIT) +#define ___GFP_NORETRY BIT(___GFP_NORETRY_BIT) +#define ___GFP_MEMALLOC BIT(___GFP_MEMALLOC_BIT) +#define ___GFP_COMP BIT(___GFP_COMP_BIT) +#define ___GFP_NOMEMALLOC BIT(___GFP_NOMEMALLOC_BIT) +#define ___GFP_HARDWALL BIT(___GFP_HARDWALL_BIT) +#define ___GFP_THISNODE BIT(___GFP_THISNODE_BIT) +#define ___GFP_ACCOUNT BIT(___GFP_ACCOUNT_BIT) +#define ___GFP_ZEROTAGS BIT(___GFP_ZEROTAGS_BIT) +#ifdef CONFIG_KASAN_HW_TAGS +#define ___GFP_SKIP_ZERO BIT(___GFP_SKIP_ZERO_BIT) +#define ___GFP_SKIP_KASAN BIT(___GFP_SKIP_KASAN_BIT) #else #define ___GFP_SKIP_ZERO 0 #define ___GFP_SKIP_KASAN 0 #endif #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x4000000u +#define ___GFP_NOLOCKDEP BIT(___GFP_NOLOCKDEP_BIT) #else #define ___GFP_NOLOCKDEP 0 #endif -/* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* * Physical address zone modifiers (see linux/mmzone.h - low four bits) @@ -249,7 +283,7 @@ typedef unsigned int __bitwise gfp_t; #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (26 + IS_ENABLED(CONFIG_LOCKDEP)) +#define __GFP_BITS_SHIFT ___GFP_LAST_BIT #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /** From 319a624ec2b79db7a0b0a2a2a61e3aa5c96eabfc Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:27 -0500 Subject: [PATCH 346/435] mm/huge_memory: only split PMD mapping when necessary in unmap_folio() Patch series "Split a folio to any lower order folios", v5. File folio supports any order and multi-size THP is upstreamed[1], so both file and anonymous folios can be >0 order. Currently, split_huge_page() only splits a huge page to order-0 pages, but splitting to orders higher than 0 might better utilize large folios, if done properly. In addition, Large Block Sizes in XFS support would benefit from it during truncate[2]. This patchset adds support for splitting a large folio to any lower order folios. In addition to this implementation of split_huge_page_to_list_to_order(), a possible optimization could be splitting a large folio to arbitrary smaller folios instead of a single order. As both Hugh and Ryan pointed out [3,5] that split to a single order might not be optimal, an order-9 folio might be better split into 1 order-8, 1 order-7, ..., 1 order-1, and 2 order-0 folios, depending on subsequent folio operations. Leave this as future work. [1] https://lore.kernel.org/all/20231207161211.2374093-1-ryan.roberts@arm.com/ [2] https://lore.kernel.org/linux-mm/20240226094936.2677493-1-kernel@pankajraghav.com/ [3] https://lore.kernel.org/linux-mm/9dd96da-efa2-5123-20d4-4992136ef3ad@google.com/ [4] https://lore.kernel.org/linux-mm/cbb1d6a0-66dd-47d0-8733-f836fe050374@arm.com/ [5] https://lore.kernel.org/linux-mm/20240213215520.1048625-1-zi.yan@sent.com/ This patch (of 8): As multi-size THP support is added, not all THPs are PMD-mapped, thus during a huge page split, there is no need to always split PMD mapping in unmap_folio(). Make it conditional. Link: https://lkml.kernel.org/r/20240226205534.1603748-1-zi.yan@sent.com Link: https://lkml.kernel.org/r/20240226205534.1603748-2-zi.yan@sent.com Signed-off-by: Zi Yan Reviewed-by: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/huge_memory.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 28341a5067fb..b20e535e874c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2727,11 +2727,14 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, static void unmap_folio(struct folio *folio) { - enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | - TTU_SYNC | TTU_BATCH_FLUSH; + enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC | + TTU_BATCH_FLUSH; VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (folio_test_pmd_mappable(folio)) + ttu_flags |= TTU_SPLIT_HUGE_PMD; + /* * Anon pages need migration entries to preserve them, but file * pages can simply be left unmapped, then faulted back on demand. From 8897277acfef7f70fdecc054073bea2542fc7a1b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 26 Feb 2024 15:55:28 -0500 Subject: [PATCH 347/435] mm: support order-1 folios in the page cache Folios of order 1 have no space to store the deferred list. This is not a problem for the page cache as file-backed folios are never placed on the deferred list. All we need to do is prevent the core MM from touching the deferred list for order 1 folios and remove the code which prevented us from allocating order 1 folios. Link: https://lore.kernel.org/linux-mm/90344ea7-4eec-47ee-5996-0c22f42d6a6a@google.com/ Link: https://lkml.kernel.org/r/20240226205534.1603748-3-zi.yan@sent.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Zi Yan Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/filemap.c | 2 -- mm/huge_memory.c | 19 +++++++++++++++---- mm/internal.h | 3 +-- mm/readahead.c | 3 --- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index b7a21551fbc7..b4858d89f1b1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1912,8 +1912,6 @@ no_page: gfp_t alloc_gfp = gfp; err = -ENOMEM; - if (order == 1) - order = 0; if (order > 0) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; folio = filemap_alloc_folio(alloc_gfp, order); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b20e535e874c..9840f312c08f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -790,8 +790,10 @@ struct deferred_split *get_deferred_split_queue(struct folio *folio) void folio_prep_large_rmappable(struct folio *folio) { - VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); - INIT_LIST_HEAD(&folio->_deferred_list); + if (!folio || !folio_test_large(folio)) + return; + if (folio_order(folio) > 1) + INIT_LIST_HEAD(&folio->_deferred_list); folio_set_large_rmappable(folio); } @@ -3114,7 +3116,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); if (folio_ref_freeze(folio, 1 + extra_pins)) { - if (!list_empty(&folio->_deferred_list)) { + if (folio_order(folio) > 1 && + !list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; list_del(&folio->_deferred_list); } @@ -3165,6 +3168,9 @@ void folio_undo_large_rmappable(struct folio *folio) struct deferred_split *ds_queue; unsigned long flags; + if (folio_order(folio) <= 1) + return; + /* * At this point, there is no one trying to add the folio to * deferred_list. If folio is not in deferred_list, it's safe @@ -3190,7 +3196,12 @@ void deferred_split_folio(struct folio *folio) #endif unsigned long flags; - VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); + /* + * Order 1 folios have no space for a deferred list, but we also + * won't waste much memory by not adding them to the deferred list. + */ + if (folio_order(folio) <= 1) + return; /* * The try_to_unmap() in page reclaim path might reach here too, diff --git a/mm/internal.h b/mm/internal.h index cb4eabb1051d..f376e3afbc4c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -420,8 +420,7 @@ static inline struct folio *page_rmappable_folio(struct page *page) { struct folio *folio = (struct folio *)page; - if (folio && folio_order(folio) > 1) - folio_prep_large_rmappable(folio); + folio_prep_large_rmappable(folio); return folio; } diff --git a/mm/readahead.c b/mm/readahead.c index 1e74455f908e..130c0e7df99f 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -514,9 +514,6 @@ void page_cache_ra_order(struct readahead_control *ractl, /* Don't allocate pages past EOF */ while (index + (1UL << order) - 1 > limit) order--; - /* THP machinery does not support order-1 */ - if (order == 1) - order = 0; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err) break; From 502003bb76b83649cd4ff7f701987ac5cf43bc4b Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:29 -0500 Subject: [PATCH 348/435] mm/memcg: use order instead of nr in split_page_memcg() We do not have non power of two pages, using nr is error prone if nr is not power-of-two. Use page order instead. Link: https://lkml.kernel.org/r/20240226205534.1603748-4-zi.yan@sent.com Signed-off-by: Zi Yan Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- mm/huge_memory.c | 5 +++-- mm/memcontrol.c | 3 ++- mm/page_alloc.c | 4 ++-- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4e4caeaea404..173bbb53c1ec 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1163,7 +1163,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, rcu_read_unlock(); } -void split_page_memcg(struct page *head, unsigned int nr); +void split_page_memcg(struct page *head, int order); unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, @@ -1621,7 +1621,7 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } -static inline void split_page_memcg(struct page *head, unsigned int nr) +static inline void split_page_memcg(struct page *head, int order) { } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9840f312c08f..96ac7c62c375 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2889,11 +2889,12 @@ static void __split_huge_page(struct page *page, struct list_head *list, struct lruvec *lruvec; struct address_space *swap_cache = NULL; unsigned long offset = 0; - unsigned int nr = thp_nr_pages(head); int i, nr_dropped = 0; + int order = folio_order(folio); + unsigned int nr = 1 << order; /* complete memcg works before add pages to LRU */ - split_page_memcg(head, nr); + split_page_memcg(head, order); if (folio_test_anon(folio) && folio_test_swapcache(folio)) { offset = swp_offset(folio->swap); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cb216d30a221..db9a3fb26a99 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3606,11 +3606,12 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) /* * Because page_memcg(head) is not set on tails, set it now. */ -void split_page_memcg(struct page *head, unsigned int nr) +void split_page_memcg(struct page *head, int order) { struct folio *folio = page_folio(head); struct mem_cgroup *memcg = folio_memcg(folio); int i; + unsigned int nr = 1 << order; if (mem_cgroup_disabled() || !memcg) return; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5ab921e3d7d1..44ffe1304878 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2617,7 +2617,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, 1 << order); - split_page_memcg(page, 1 << order); + split_page_memcg(page, order); } EXPORT_SYMBOL_GPL(split_page); @@ -4802,7 +4802,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *last = page + nr; split_page_owner(page, 1 << order); - split_page_memcg(page, 1 << order); + split_page_memcg(page, order); while (page < --last) set_page_refcounted(last); From 9a581c12cddb06696fe4811239934fcde57ceb91 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:30 -0500 Subject: [PATCH 349/435] mm/page_owner: use order instead of nr in split_page_owner() We do not have non power of two pages, using nr is error prone if nr is not power-of-two. Use page order instead. Link: https://lkml.kernel.org/r/20240226205534.1603748-5-zi.yan@sent.com Signed-off-by: Zi Yan Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/page_owner.h | 9 ++++----- mm/huge_memory.c | 2 +- mm/page_alloc.c | 4 ++-- mm/page_owner.c | 3 ++- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 119a0c9d2a8b..2b39c8e19d98 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -11,7 +11,7 @@ extern struct page_ext_operations page_owner_ops; extern void __reset_page_owner(struct page *page, unsigned short order); extern void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask); -extern void __split_page_owner(struct page *page, unsigned int nr); +extern void __split_page_owner(struct page *page, int order); extern void __folio_copy_owner(struct folio *newfolio, struct folio *old); extern void __set_page_owner_migrate_reason(struct page *page, int reason); extern void __dump_page_owner(const struct page *page); @@ -31,10 +31,10 @@ static inline void set_page_owner(struct page *page, __set_page_owner(page, order, gfp_mask); } -static inline void split_page_owner(struct page *page, unsigned int nr) +static inline void split_page_owner(struct page *page, int order) { if (static_branch_unlikely(&page_owner_inited)) - __split_page_owner(page, nr); + __split_page_owner(page, order); } static inline void folio_copy_owner(struct folio *newfolio, struct folio *old) { @@ -59,8 +59,7 @@ static inline void set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) { } -static inline void split_page_owner(struct page *page, - unsigned short order) +static inline void split_page_owner(struct page *page, int order) { } static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 96ac7c62c375..2b95dbc95fae 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2933,7 +2933,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, unlock_page_lruvec(lruvec); /* Caller disabled irqs, so they are still disabled here */ - split_page_owner(head, nr); + split_page_owner(head, order); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 44ffe1304878..a9d1af34f4d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2616,7 +2616,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); - split_page_owner(page, 1 << order); + split_page_owner(page, order); split_page_memcg(page, order); } EXPORT_SYMBOL_GPL(split_page); @@ -4801,7 +4801,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *page = virt_to_page((void *)addr); struct page *last = page + nr; - split_page_owner(page, 1 << order); + split_page_owner(page, order); split_page_memcg(page, order); while (page < --last) set_page_refcounted(last); diff --git a/mm/page_owner.c b/mm/page_owner.c index e56c1e92eccf..b678f7a6e702 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -306,11 +306,12 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) page_ext_put(page_ext); } -void __split_page_owner(struct page *page, unsigned int nr) +void __split_page_owner(struct page *page, int order) { int i; struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; + unsigned int nr = 1 << order; if (unlikely(!page_ext)) return; From b8791381d7edae3706edde207f52d9e483ed400c Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:31 -0500 Subject: [PATCH 350/435] mm: memcg: make memcg huge page split support any order split It sets memcg information for the pages after the split. A new parameter new_order is added to tell the order of subpages in the new page, always 0 for now. It prepares for upcoming changes to support split huge page to any lower order. Link: https://lkml.kernel.org/r/20240226205534.1603748-6-zi.yan@sent.com Signed-off-by: Zi Yan Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- mm/huge_memory.c | 2 +- mm/memcontrol.c | 11 ++++++----- mm/page_alloc.c | 4 ++-- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 173bbb53c1ec..9a2dea92be0e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1163,7 +1163,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, rcu_read_unlock(); } -void split_page_memcg(struct page *head, int order); +void split_page_memcg(struct page *head, int old_order, int new_order); unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, @@ -1621,7 +1621,7 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } -static inline void split_page_memcg(struct page *head, int order) +static inline void split_page_memcg(struct page *head, int old_order, int new_order) { } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2b95dbc95fae..5d4b7c17b9bc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2894,7 +2894,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, unsigned int nr = 1 << order; /* complete memcg works before add pages to LRU */ - split_page_memcg(head, order); + split_page_memcg(head, order, 0); if (folio_test_anon(folio) && folio_test_swapcache(folio)) { offset = swp_offset(folio->swap); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index db9a3fb26a99..e4864e2729e6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3606,23 +3606,24 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) /* * Because page_memcg(head) is not set on tails, set it now. */ -void split_page_memcg(struct page *head, int order) +void split_page_memcg(struct page *head, int old_order, int new_order) { struct folio *folio = page_folio(head); struct mem_cgroup *memcg = folio_memcg(folio); int i; - unsigned int nr = 1 << order; + unsigned int old_nr = 1 << old_order; + unsigned int new_nr = 1 << new_order; if (mem_cgroup_disabled() || !memcg) return; - for (i = 1; i < nr; i++) + for (i = new_nr; i < old_nr; i += new_nr) folio_page(folio, i)->memcg_data = folio->memcg_data; if (folio_memcg_kmem(folio)) - obj_cgroup_get_many(__folio_objcg(folio), nr - 1); + obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1); else - css_get_many(&memcg->css, nr - 1); + css_get_many(&memcg->css, old_nr / new_nr - 1); } #ifdef CONFIG_SWAP diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a9d1af34f4d6..3b3dae66d49d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2617,7 +2617,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, order); - split_page_memcg(page, order); + split_page_memcg(page, order, 0); } EXPORT_SYMBOL_GPL(split_page); @@ -4802,7 +4802,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *last = page + nr; split_page_owner(page, order); - split_page_memcg(page, order); + split_page_memcg(page, order, 0); while (page < --last) set_page_refcounted(last); From 46d44d09d24c5b451d6ab8f0fca5a40f651e3837 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:32 -0500 Subject: [PATCH 351/435] mm: page_owner: add support for splitting to any order in split page_owner It adds a new_order parameter to set new page order in page owner. It prepares for upcoming changes to support split huge page to any lower order. Link: https://lkml.kernel.org/r/20240226205534.1603748-7-zi.yan@sent.com Signed-off-by: Zi Yan Cc: David Hildenbrand Acked-by: David Hildenbrand Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/page_owner.h | 13 ++++++++----- mm/huge_memory.c | 2 +- mm/page_alloc.c | 4 ++-- mm/page_owner.c | 7 +++---- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 2b39c8e19d98..debdc25f08b9 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -11,7 +11,8 @@ extern struct page_ext_operations page_owner_ops; extern void __reset_page_owner(struct page *page, unsigned short order); extern void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask); -extern void __split_page_owner(struct page *page, int order); +extern void __split_page_owner(struct page *page, int old_order, + int new_order); extern void __folio_copy_owner(struct folio *newfolio, struct folio *old); extern void __set_page_owner_migrate_reason(struct page *page, int reason); extern void __dump_page_owner(const struct page *page); @@ -31,10 +32,11 @@ static inline void set_page_owner(struct page *page, __set_page_owner(page, order, gfp_mask); } -static inline void split_page_owner(struct page *page, int order) +static inline void split_page_owner(struct page *page, int old_order, + int new_order) { if (static_branch_unlikely(&page_owner_inited)) - __split_page_owner(page, order); + __split_page_owner(page, old_order, new_order); } static inline void folio_copy_owner(struct folio *newfolio, struct folio *old) { @@ -56,10 +58,11 @@ static inline void reset_page_owner(struct page *page, unsigned short order) { } static inline void set_page_owner(struct page *page, - unsigned int order, gfp_t gfp_mask) + unsigned short order, gfp_t gfp_mask) { } -static inline void split_page_owner(struct page *page, int order) +static inline void split_page_owner(struct page *page, int old_order, + int new_order) { } static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5d4b7c17b9bc..b2df788c11fa 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2933,7 +2933,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, unlock_page_lruvec(lruvec); /* Caller disabled irqs, so they are still disabled here */ - split_page_owner(head, order); + split_page_owner(head, order, 0); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3b3dae66d49d..ff1f159251df 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2616,7 +2616,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); - split_page_owner(page, order); + split_page_owner(page, order, 0); split_page_memcg(page, order, 0); } EXPORT_SYMBOL_GPL(split_page); @@ -4801,7 +4801,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *page = virt_to_page((void *)addr); struct page *last = page + nr; - split_page_owner(page, order); + split_page_owner(page, order, 0); split_page_memcg(page, order, 0); while (page < --last) set_page_refcounted(last); diff --git a/mm/page_owner.c b/mm/page_owner.c index b678f7a6e702..033e349f6479 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -306,19 +306,18 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) page_ext_put(page_ext); } -void __split_page_owner(struct page *page, int order) +void __split_page_owner(struct page *page, int old_order, int new_order) { int i; struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; - unsigned int nr = 1 << order; if (unlikely(!page_ext)) return; - for (i = 0; i < nr; i++) { + for (i = 0; i < (1 << old_order); i++) { page_owner = get_page_owner(page_ext); - page_owner->order = 0; + page_owner->order = new_order; page_ext = page_ext_next(page_ext); } page_ext_put(page_ext); From c010d47f107f609b9f4d6a103b6dfc53889049e9 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:33 -0500 Subject: [PATCH 352/435] mm: thp: split huge page to any lower order pages To split a THP to any lower order pages, we need to reform THPs on subpages at given order and add page refcount based on the new page order. Also we need to reinitialize page_deferred_list after removing the page from the split_queue, otherwise a subsequent split will see list corruption when checking the page_deferred_list again. Note: Anonymous order-1 folio is not supported because _deferred_list, which is used by partially mapped folios, is stored in subpage 2 and an order-1 folio only has subpage 0 and 1. File-backed order-1 folios are fine, since they do not use _deferred_list. [ziy@nvidia.com: fixup per discussion with Ryan] Link: https://lkml.kernel.org/r/494F48CD-1F0F-4CAD-884E-6D48F40AF990@nvidia.com Link: https://lkml.kernel.org/r/20240226205534.1603748-8-zi.yan@sent.com Signed-off-by: Zi Yan Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 21 +++++--- mm/huge_memory.c | 107 +++++++++++++++++++++++++++++++--------- 2 files changed, 96 insertions(+), 32 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 5adb86af35fc..de0c89105076 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -265,10 +265,11 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, void folio_prep_large_rmappable(struct folio *folio); bool can_split_folio(struct folio *folio, int *pextra_pins); -int split_huge_page_to_list(struct page *page, struct list_head *list); +int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order); static inline int split_huge_page(struct page *page) { - return split_huge_page_to_list(page, NULL); + return split_huge_page_to_list_to_order(page, NULL, 0); } void deferred_split_folio(struct folio *folio); @@ -422,7 +423,8 @@ can_split_folio(struct folio *folio, int *pextra_pins) return false; } static inline int -split_huge_page_to_list(struct page *page, struct list_head *list) +split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order) { return 0; } @@ -519,17 +521,20 @@ static inline bool thp_migration_supported(void) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline int split_folio_to_list(struct folio *folio, - struct list_head *list) +static inline int split_folio_to_list_to_order(struct folio *folio, + struct list_head *list, int new_order) { - return split_huge_page_to_list(&folio->page, list); + return split_huge_page_to_list_to_order(&folio->page, list, new_order); } -static inline int split_folio(struct folio *folio) +static inline int split_folio_to_order(struct folio *folio, int new_order) { - return split_folio_to_list(folio, NULL); + return split_folio_to_list_to_order(folio, NULL, new_order); } +#define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0) +#define split_folio(f) split_folio_to_order(f, 0) + /* * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to * limitations in the implementation like arm64 MTE can override this to diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b2df788c11fa..006b192ea6ab 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2770,7 +2770,6 @@ static void lru_add_page_tail(struct page *head, struct page *tail, struct lruvec *lruvec, struct list_head *list) { VM_BUG_ON_PAGE(!PageHead(head), head); - VM_BUG_ON_PAGE(PageCompound(tail), head); VM_BUG_ON_PAGE(PageLRU(tail), head); lockdep_assert_held(&lruvec->lru_lock); @@ -2791,7 +2790,8 @@ static void lru_add_page_tail(struct page *head, struct page *tail, } static void __split_huge_page_tail(struct folio *folio, int tail, - struct lruvec *lruvec, struct list_head *list) + struct lruvec *lruvec, struct list_head *list, + unsigned int new_order) { struct page *head = &folio->page; struct page *page_tail = head + tail; @@ -2861,10 +2861,15 @@ static void __split_huge_page_tail(struct folio *folio, int tail, * which needs correct compound_head(). */ clear_compound_head(page_tail); + if (new_order) { + prep_compound_page(page_tail, new_order); + folio_prep_large_rmappable(new_folio); + } /* Finally unfreeze refcount. Additional reference from page cache. */ - page_ref_unfreeze(page_tail, 1 + (!folio_test_anon(folio) || - folio_test_swapcache(folio))); + page_ref_unfreeze(page_tail, + 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ? + folio_nr_pages(new_folio) : 0)); if (folio_test_young(folio)) folio_set_young(new_folio); @@ -2882,7 +2887,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail, } static void __split_huge_page(struct page *page, struct list_head *list, - pgoff_t end) + pgoff_t end, unsigned int new_order) { struct folio *folio = page_folio(page); struct page *head = &folio->page; @@ -2890,11 +2895,12 @@ static void __split_huge_page(struct page *page, struct list_head *list, struct address_space *swap_cache = NULL; unsigned long offset = 0; int i, nr_dropped = 0; + unsigned int new_nr = 1 << new_order; int order = folio_order(folio); unsigned int nr = 1 << order; /* complete memcg works before add pages to LRU */ - split_page_memcg(head, order, 0); + split_page_memcg(head, order, new_order); if (folio_test_anon(folio) && folio_test_swapcache(folio)) { offset = swp_offset(folio->swap); @@ -2907,8 +2913,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageHasHWPoisoned(head); - for (i = nr - 1; i >= 1; i--) { - __split_huge_page_tail(folio, i, lruvec, list); + for (i = nr - new_nr; i >= new_nr; i -= new_nr) { + __split_huge_page_tail(folio, i, lruvec, list, new_order); /* Some pages can be beyond EOF: drop them from page cache */ if (head[i].index >= end) { struct folio *tail = page_folio(head + i); @@ -2929,24 +2935,30 @@ static void __split_huge_page(struct page *page, struct list_head *list, } } - ClearPageCompound(head); + if (!new_order) + ClearPageCompound(head); + else { + struct folio *new_folio = (struct folio *)head; + + folio_set_order(new_folio, new_order); + } unlock_page_lruvec(lruvec); /* Caller disabled irqs, so they are still disabled here */ - split_page_owner(head, order, 0); + split_page_owner(head, order, new_order); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { /* Additional pin to swap cache */ if (PageSwapCache(head)) { - page_ref_add(head, 2); + page_ref_add(head, 1 + new_nr); xa_unlock(&swap_cache->i_pages); } else { page_ref_inc(head); } } else { /* Additional pin to page cache */ - page_ref_add(head, 2); + page_ref_add(head, 1 + new_nr); xa_unlock(&head->mapping->i_pages); } local_irq_enable(); @@ -2958,7 +2970,15 @@ static void __split_huge_page(struct page *page, struct list_head *list, if (folio_test_swapcache(folio)) split_swap_cluster(folio->swap); - for (i = 0; i < nr; i++) { + /* + * set page to its compound_head when split to non order-0 pages, so + * we can skip unlocking it below, since PG_locked is transferred to + * the compound_head of the page and the caller will unlock it. + */ + if (new_order) + page = compound_head(page); + + for (i = 0; i < nr; i += new_nr) { struct page *subpage = head + i; if (subpage == page) continue; @@ -2992,29 +3012,36 @@ bool can_split_folio(struct folio *folio, int *pextra_pins) } /* - * This function splits huge page into normal pages. @page can point to any - * subpage of huge page to split. Split doesn't change the position of @page. + * This function splits huge page into pages in @new_order. @page can point to + * any subpage of huge page to split. Split doesn't change the position of + * @page. + * + * NOTE: order-1 anonymous folio is not supported because _deferred_list, + * which is used by partially mapped folios, is stored in subpage 2 and an + * order-1 folio only has subpage 0 and 1. File-backed order-1 folios are OK, + * since they do not use _deferred_list. * * Only caller must hold pin on the @page, otherwise split fails with -EBUSY. * The huge page must be locked. * * If @list is null, tail pages will be added to LRU list, otherwise, to @list. * - * Both head page and tail pages will inherit mapping, flags, and so on from - * the hugepage. + * Pages in new_order will inherit mapping, flags, and so on from the hugepage. * - * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if - * they are not mapped. + * GUP pin and PG_locked transferred to @page or the compound page @page belongs + * to. Rest subpages can be freed if they are not mapped. * * Returns 0 if the hugepage is split successfully. * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under * us. */ -int split_huge_page_to_list(struct page *page, struct list_head *list) +int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order) { struct folio *folio = page_folio(page); struct deferred_split *ds_queue = get_deferred_split_queue(folio); - XA_STATE(xas, &folio->mapping->i_pages, folio->index); + /* reset xarray order to new order after split */ + XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int extra_pins, ret; @@ -3024,6 +3051,31 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + /* Cannot split anonymous THP to order-1 */ + if (new_order == 1 && folio_test_anon(folio)) { + VM_WARN_ONCE(1, "Cannot split to order-1 folio"); + return -EINVAL; + } + + if (new_order) { + /* Only swapping a whole PMD-mapped folio is supported */ + if (folio_test_swapcache(folio)) + return -EINVAL; + /* Split shmem folio to non-zero order not supported */ + if (shmem_mapping(folio->mapping)) { + VM_WARN_ONCE(1, + "Cannot split shmem folio to non-0 order"); + return -EINVAL; + } + /* No split if the file system does not support large folio */ + if (!mapping_large_folio_support(folio->mapping)) { + VM_WARN_ONCE(1, + "Cannot split file folio to non-0 order"); + return -EINVAL; + } + } + + is_hzp = is_huge_zero_page(&folio->page); if (is_hzp) { pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); @@ -3120,14 +3172,21 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (folio_order(folio) > 1 && !list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; - list_del(&folio->_deferred_list); + /* + * Reinitialize page_deferred_list after removing the + * page from the split_queue, otherwise a subsequent + * split will see list corruption when checking the + * page_deferred_list. + */ + list_del_init(&folio->_deferred_list); } spin_unlock(&ds_queue->split_queue_lock); if (mapping) { int nr = folio_nr_pages(folio); xas_split(&xas, folio, folio_order(folio)); - if (folio_test_pmd_mappable(folio)) { + if (folio_test_pmd_mappable(folio) && + new_order < HPAGE_PMD_ORDER) { if (folio_test_swapbacked(folio)) { __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); @@ -3139,7 +3198,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } } - __split_huge_page(page, list, end); + __split_huge_page(page, list, end, new_order); ret = 0; } else { spin_unlock(&ds_queue->split_queue_lock); From fc4d182316bd5309b4066fd9ef21529ea397a7d4 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:34 -0500 Subject: [PATCH 353/435] mm: huge_memory: enable debugfs to split huge pages to any order It is used to test split_huge_page_to_list_to_order for pagecache THPs. Also add test cases for split_huge_page_to_list_to_order via both debugfs. [ziy@nvidia.com: fix issue discovered with NFS] Link: https://lkml.kernel.org/r/262E4DAA-4A78-4328-B745-1355AE356A07@nvidia.com Link: https://lkml.kernel.org/r/20240226205534.1603748-9-zi.yan@sent.com Signed-off-by: Zi Yan Tested-by: Aishwarya TCV Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Cc: Aishwarya TCV Signed-off-by: Andrew Morton --- mm/huge_memory.c | 34 ++-- tools/testing/selftests/mm/run_vmtests.sh | 22 ++- .../selftests/mm/split_huge_page_test.c | 168 +++++++++++++++++- 3 files changed, 205 insertions(+), 19 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 006b192ea6ab..fd745bcc97ff 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3419,7 +3419,7 @@ static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) } static int split_huge_pages_pid(int pid, unsigned long vaddr_start, - unsigned long vaddr_end) + unsigned long vaddr_end, unsigned int new_order) { int ret = 0; struct task_struct *task; @@ -3483,13 +3483,19 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, goto next; total++; - if (!can_split_folio(folio, NULL)) + /* + * For folios with private, split_huge_page_to_list_to_order() + * will try to drop it before split and then check if the folio + * can be split or not. So skip the check here. + */ + if (!folio_test_private(folio) && + !can_split_folio(folio, NULL)) goto next; if (!folio_trylock(folio)) goto next; - if (!split_folio(folio)) + if (!split_folio_to_order(folio, new_order)) split++; folio_unlock(folio); @@ -3507,7 +3513,7 @@ out: } static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, - pgoff_t off_end) + pgoff_t off_end, unsigned int new_order) { struct filename *file; struct file *candidate; @@ -3546,7 +3552,7 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, if (!folio_trylock(folio)) goto next; - if (!split_folio(folio)) + if (!split_folio_to_order(folio, new_order)) split++; folio_unlock(folio); @@ -3571,10 +3577,14 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, { static DEFINE_MUTEX(split_debug_mutex); ssize_t ret; - /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */ + /* + * hold pid, start_vaddr, end_vaddr, new_order or + * file_path, off_start, off_end, new_order + */ char input_buf[MAX_INPUT_BUF_SZ]; int pid; unsigned long vaddr_start, vaddr_end; + unsigned int new_order = 0; ret = mutex_lock_interruptible(&split_debug_mutex); if (ret) @@ -3603,29 +3613,29 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, goto out; } - ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end); - if (ret != 2) { + ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order); + if (ret != 2 && ret != 3) { ret = -EINVAL; goto out; } - ret = split_huge_pages_in_file(file_path, off_start, off_end); + ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order); if (!ret) ret = input_len; goto out; } - ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end); + ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order); if (ret == 1 && pid == 1) { split_huge_pages_all(); ret = strlen(input_buf); goto out; - } else if (ret != 3) { + } else if (ret != 3 && ret != 4) { ret = -EINVAL; goto out; } - ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end); + ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order); if (!ret) ret = strlen(input_buf); out: diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index de03d38907d6..02dc1c38bd76 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -399,7 +399,27 @@ CATEGORY="thp" run_test ./khugepaged -s 2 CATEGORY="thp" run_test ./transhuge-stress -d 20 -CATEGORY="thp" run_test ./split_huge_page_test +# Try to create XFS if not provided +if [ -z "${SPLIT_HUGE_PAGE_TEST_XFS_PATH}" ]; then + if test_selected "thp"; then + if grep xfs /proc/filesystems &>/dev/null; then + XFS_IMG=$(mktemp /tmp/xfs_img_XXXXXX) + SPLIT_HUGE_PAGE_TEST_XFS_PATH=$(mktemp -d /tmp/xfs_dir_XXXXXX) + truncate -s 314572800 ${XFS_IMG} + mkfs.xfs -q ${XFS_IMG} + mount -o loop ${XFS_IMG} ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} + MOUNTED_XFS=1 + fi + fi +fi + +CATEGORY="thp" run_test ./split_huge_page_test ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} + +if [ -n "${MOUNTED_XFS}" ]; then + umount ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} + rmdir ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} + rm -f ${XFS_IMG} +fi CATEGORY="migration" run_test ./migration diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 7b698a848bab..856662d2f87a 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "vm_util.h" #include "../kselftest.h" @@ -24,10 +25,11 @@ unsigned int pageshift; uint64_t pmd_pagesize; #define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages" +#define SMAP_PATH "/proc/self/smaps" #define INPUT_MAX 80 -#define PID_FMT "%d,0x%lx,0x%lx" -#define PATH_FMT "%s,0x%lx,0x%lx" +#define PID_FMT "%d,0x%lx,0x%lx,%d" +#define PATH_FMT "%s,0x%lx,0x%lx,%d" #define PFN_MASK ((1UL<<55)-1) #define KPF_THP (1UL<<22) @@ -102,7 +104,7 @@ void split_pmd_thp(void) /* split all THPs */ write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, - (uint64_t)one_page + len); + (uint64_t)one_page + len, 0); for (i = 0; i < len; i++) if (one_page[i] != (char)i) @@ -177,7 +179,7 @@ void split_pte_mapped_thp(void) /* split all remapped THPs */ write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped, - (uint64_t)pte_mapped + pagesize * 4); + (uint64_t)pte_mapped + pagesize * 4, 0); /* smap does not show THPs after mremap, use kpageflags instead */ thp_size = 0; @@ -237,7 +239,7 @@ void split_file_backed_thp(void) } /* split the file-backed THP */ - write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end); + write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end, 0); status = unlink(testfile); if (status) { @@ -265,8 +267,149 @@ cleanup: ksft_exit_fail_msg("Error occurred\n"); } +bool prepare_thp_fs(const char *xfs_path, char *thp_fs_template, + const char **thp_fs_loc) +{ + if (xfs_path) { + *thp_fs_loc = xfs_path; + return false; + } + + *thp_fs_loc = mkdtemp(thp_fs_template); + + if (!*thp_fs_loc) + ksft_exit_fail_msg("cannot create temp folder\n"); + + return true; +} + +void cleanup_thp_fs(const char *thp_fs_loc, bool created_tmp) +{ + int status; + + if (!created_tmp) + return; + + status = rmdir(thp_fs_loc); + if (status) + ksft_exit_fail_msg("cannot remove tmp dir: %s\n", + strerror(errno)); +} + +int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, int *fd, + char **addr) +{ + size_t i; + int dummy; + + srand(time(NULL)); + + *fd = open(testfile, O_CREAT | O_RDWR, 0664); + if (*fd == -1) + ksft_exit_fail_msg("Failed to create a file at %s\n", testfile); + + for (i = 0; i < fd_size; i++) { + unsigned char byte = (unsigned char)i; + + write(*fd, &byte, sizeof(byte)); + } + close(*fd); + sync(); + *fd = open("/proc/sys/vm/drop_caches", O_WRONLY); + if (*fd == -1) { + ksft_perror("open drop_caches"); + goto err_out_unlink; + } + if (write(*fd, "3", 1) != 1) { + ksft_perror("write to drop_caches"); + goto err_out_unlink; + } + close(*fd); + + *fd = open(testfile, O_RDWR); + if (*fd == -1) { + ksft_perror("Failed to open testfile\n"); + goto err_out_unlink; + } + + *addr = mmap(NULL, fd_size, PROT_READ|PROT_WRITE, MAP_SHARED, *fd, 0); + if (*addr == (char *)-1) { + ksft_perror("cannot mmap"); + goto err_out_close; + } + madvise(*addr, fd_size, MADV_HUGEPAGE); + + for (size_t i = 0; i < fd_size; i++) + dummy += *(*addr + i); + + if (!check_huge_file(*addr, fd_size / pmd_pagesize, pmd_pagesize)) { + ksft_print_msg("No large pagecache folio generated, please provide a filesystem supporting large folio\n"); + munmap(*addr, fd_size); + close(*fd); + unlink(testfile); + ksft_test_result_skip("Pagecache folio split skipped\n"); + return -2; + } + return 0; +err_out_close: + close(*fd); +err_out_unlink: + unlink(testfile); + ksft_exit_fail_msg("Failed to create large pagecache folios\n"); + return -1; +} + +void split_thp_in_pagecache_to_order(size_t fd_size, int order, const char *fs_loc) +{ + int fd; + char *addr; + size_t i; + char testfile[INPUT_MAX]; + int err = 0; + + err = snprintf(testfile, INPUT_MAX, "%s/test", fs_loc); + + if (err < 0) + ksft_exit_fail_msg("cannot generate right test file name\n"); + + err = create_pagecache_thp_and_fd(testfile, fd_size, &fd, &addr); + if (err) + return; + err = 0; + + write_debugfs(PID_FMT, getpid(), (uint64_t)addr, (uint64_t)addr + fd_size, order); + + for (i = 0; i < fd_size; i++) + if (*(addr + i) != (char)i) { + ksft_print_msg("%lu byte corrupted in the file\n", i); + err = EXIT_FAILURE; + goto out; + } + + if (!check_huge_file(addr, 0, pmd_pagesize)) { + ksft_print_msg("Still FilePmdMapped not split\n"); + err = EXIT_FAILURE; + goto out; + } + +out: + munmap(addr, fd_size); + close(fd); + unlink(testfile); + if (err) + ksft_exit_fail_msg("Split PMD-mapped pagecache folio to order %d failed\n", order); + ksft_test_result_pass("Split PMD-mapped pagecache folio to order %d passed\n", order); +} + int main(int argc, char **argv) { + int i; + size_t fd_size; + char *optional_xfs_path = NULL; + char fs_loc_template[] = "/tmp/thp_fs_XXXXXX"; + const char *fs_loc; + bool created_tmp; + ksft_print_header(); if (geteuid() != 0) { @@ -274,7 +417,10 @@ int main(int argc, char **argv) ksft_finished(); } - ksft_set_plan(3); + if (argc > 1) + optional_xfs_path = argv[1]; + + ksft_set_plan(3+9); pagesize = getpagesize(); pageshift = ffs(pagesize) - 1; @@ -282,9 +428,19 @@ int main(int argc, char **argv) if (!pmd_pagesize) ksft_exit_fail_msg("Reading PMD pagesize failed\n"); + fd_size = 2 * pmd_pagesize; + split_pmd_thp(); split_pte_mapped_thp(); split_file_backed_thp(); + created_tmp = prepare_thp_fs(optional_xfs_path, fs_loc_template, + &fs_loc); + for (i = 8; i >= 0; i--) + split_thp_in_pagecache_to_order(fd_size, i, fs_loc); + cleanup_thp_fs(fs_loc, created_tmp); + ksft_finished(); + + return 0; } From b4d02baa9f3ed9fc4311e0bd69966861a2f5eb83 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 26 Feb 2024 15:13:23 +0100 Subject: [PATCH 354/435] mm/memfd: refactor memfd_tag_pins() and memfd_wait_for_pins() Patch series "mm: remove total_mapcount()", v2. Let's remove the remaining user from mm/memfd.c so we can get rid of total_mapcount(). This patch (of 2): Both functions are the remaining users of total_mapcount(). Let's get rid of the calls by converting the code to folios. As it turns out, the code is unnecessarily complicated, especially: 1) We can query the number of pagecache references for a folio simply via folio_nr_pages(). This will handle other folio sizes in the future correctly. 2) The xas_set(xas, page->index + cache_count) call to increment the iterator for large folios is not required. Remove it. Further, simplify the XA_CHECK_SCHED check, counting each entry exactly once. Memfd pages can be swapped out when using shmem; leave xa_is_value() checks in place. Link: https://lkml.kernel.org/r/20240226141324.278526-1-david@redhat.com Link: https://lkml.kernel.org/r/20240226141324.278526-2-david@redhat.com Co-developed-by: Matthew Wilcox (Oracle) Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memfd.c | 47 ++++++++++++++++++----------------------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/mm/memfd.c b/mm/memfd.c index d3a1ba4208c9..7d8d3ab3fa37 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -29,29 +29,25 @@ #define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE #define LAST_SCAN 4 /* about 150ms max */ +static bool memfd_folio_has_extra_refs(struct folio *folio) +{ + return folio_ref_count(folio) - folio_mapcount(folio) != + folio_nr_pages(folio); +} + static void memfd_tag_pins(struct xa_state *xas) { - struct page *page; + struct folio *folio; int latency = 0; - int cache_count; lru_add_drain(); xas_lock_irq(xas); - xas_for_each(xas, page, ULONG_MAX) { - cache_count = 1; - if (!xa_is_value(page) && - PageTransHuge(page) && !PageHuge(page)) - cache_count = HPAGE_PMD_NR; - - if (!xa_is_value(page) && - page_count(page) - total_mapcount(page) != cache_count) + xas_for_each(xas, folio, ULONG_MAX) { + if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio)) xas_set_mark(xas, MEMFD_TAG_PINNED); - if (cache_count != 1) - xas_set(xas, page->index + cache_count); - latency += cache_count; - if (latency < XA_CHECK_SCHED) + if (++latency < XA_CHECK_SCHED) continue; latency = 0; @@ -66,16 +62,16 @@ static void memfd_tag_pins(struct xa_state *xas) /* * Setting SEAL_WRITE requires us to verify there's no pending writer. However, * via get_user_pages(), drivers might have some pending I/O without any active - * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages + * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios * and see whether it has an elevated ref-count. If so, we tag them and wait for * them to be dropped. * The caller must guarantee that no new user will acquire writable references - * to those pages to avoid races. + * to those folios to avoid races. */ static int memfd_wait_for_pins(struct address_space *mapping) { XA_STATE(xas, &mapping->i_pages, 0); - struct page *page; + struct folio *folio; int error, scan; memfd_tag_pins(&xas); @@ -83,7 +79,6 @@ static int memfd_wait_for_pins(struct address_space *mapping) error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { int latency = 0; - int cache_count; if (!xas_marked(&xas, MEMFD_TAG_PINNED)) break; @@ -95,20 +90,15 @@ static int memfd_wait_for_pins(struct address_space *mapping) xas_set(&xas, 0); xas_lock_irq(&xas); - xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) { + xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) { bool clear = true; - cache_count = 1; - if (!xa_is_value(page) && - PageTransHuge(page) && !PageHuge(page)) - cache_count = HPAGE_PMD_NR; - - if (!xa_is_value(page) && cache_count != - page_count(page) - total_mapcount(page)) { + if (!xa_is_value(folio) && + memfd_folio_has_extra_refs(folio)) { /* * On the last scan, we clean up all those tags * we inserted; but make a note that we still - * found pages pinned. + * found folios pinned. */ if (scan == LAST_SCAN) error = -EBUSY; @@ -118,8 +108,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) if (clear) xas_clear_mark(&xas, MEMFD_TAG_PINNED); - latency += cache_count; - if (latency < XA_CHECK_SCHED) + if (++latency < XA_CHECK_SCHED) continue; latency = 0; From 5ce1f4844ba0def4b1b5526d8ccea27a98e840e5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 26 Feb 2024 15:13:24 +0100 Subject: [PATCH 355/435] mm: remove total_mapcount() All users of total_mapcount() are gone, let's remove it. Link: https://lkml.kernel.org/r/20240226141324.278526-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6f4825d82965..49e22a2f6ccc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1183,7 +1183,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for * debugging purposes - it does not include PTE-mapped sub-pages; look - * at folio_mapcount() or page_mapcount() or total_mapcount() instead. + * at folio_mapcount() or page_mapcount() instead. */ static inline int folio_entire_mapcount(struct folio *folio) { @@ -1243,13 +1243,6 @@ static inline int folio_mapcount(struct folio *folio) return folio_total_mapcount(folio); } -static inline int total_mapcount(struct page *page) -{ - if (likely(!PageCompound(page))) - return atomic_read(&page->_mapcount) + 1; - return folio_total_mapcount(page_folio(page)); -} - static inline bool folio_large_is_mapped(struct folio *folio) { /* From 44503b97ad9784943afb39d62fce3936580bec6f Mon Sep 17 00:00:00 2001 From: Martin Kaiser Date: Mon, 26 Feb 2024 20:11:57 +0100 Subject: [PATCH 356/435] lib/test_vmalloc.c: fix typo in function name Fix a typo and change the function name to init_test_configuration. Both caller and definition have the same typo, so the current code already works. Link: https://lkml.kernel.org/r/20240226191159.39509-2-martin@kaiser.cx Signed-off-by: Martin Kaiser Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- lib/test_vmalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 3718d9886407..191b6bd5dff9 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -501,7 +501,7 @@ static int test_func(void *private) } static int -init_test_configurtion(void) +init_test_configuration(void) { /* * A maximum number of workers is defined as hard-coded @@ -531,7 +531,7 @@ static void do_concurrent_test(void) /* * Set some basic configurations plus sanity check. */ - ret = init_test_configurtion(); + ret = init_test_configuration(); if (ret < 0) return; From e2c5bfebabaedbec8ca858b66f95f3a993428b0c Mon Sep 17 00:00:00 2001 From: Martin Kaiser Date: Mon, 26 Feb 2024 20:11:58 +0100 Subject: [PATCH 357/435] lib/test_vmalloc.c: drop empty exit function The module is never loaded successfully. Therefore, it'll never be unloaded and we can remove the exit function. Link: https://lkml.kernel.org/r/20240226191159.39509-3-martin@kaiser.cx Signed-off-by: Martin Kaiser Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- lib/test_vmalloc.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 191b6bd5dff9..d0c0cbe1913d 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -600,12 +600,7 @@ static int vmalloc_test_init(void) return -EAGAIN; /* Fail will directly unload the module */ } -static void vmalloc_test_exit(void) -{ -} - module_init(vmalloc_test_init) -module_exit(vmalloc_test_exit) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Uladzislau Rezki"); From 4c4a52544ae03d84ffd3b5e9593833ffe05485a1 Mon Sep 17 00:00:00 2001 From: Martin Kaiser Date: Mon, 26 Feb 2024 20:11:59 +0100 Subject: [PATCH 358/435] lib/test_vmalloc.c: use unsigned long constant Use an unsigned long constant instead of an int constant and a cast. This fixes the checkpatch warning WARNING: Unnecessary typecast of c90 int constant - '(unsigned long) 1' could be '1UL' + align = ((unsigned long) 1) << i; Link: https://lkml.kernel.org/r/20240226191159.39509-4-martin@kaiser.cx Signed-off-by: Martin Kaiser Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- lib/test_vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index d0c0cbe1913d..4ddf769861ff 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -117,7 +117,7 @@ static int align_shift_alloc_test(void) int i; for (i = 0; i < BITS_PER_LONG; i++) { - align = ((unsigned long) 1) << i; + align = 1UL << i; ptr = __vmalloc_node(PAGE_SIZE, align, GFP_KERNEL|__GFP_ZERO, 0, __builtin_return_address(0)); From 5dad604809c5acc546ec74057498db1623f1c408 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Tue, 27 Feb 2024 11:51:35 +0800 Subject: [PATCH 359/435] mm/khugepaged: keep mm in mm_slot without MMF_DISABLE_THP check Previously, we removed the mm from mm_slot and dropped mm_count if the MMF_THP_DISABLE flag was set. However, we didn't re-add the mm back after clearing the MMF_THP_DISABLE flag. Additionally, We add a check for the MMF_THP_DISABLE flag in hugepage_vma_revalidate(). Link: https://lkml.kernel.org/r/20240227035135.54593-1-ioworker0@gmail.com Fixes: 879c6000e191 ("mm/khugepaged: bypassing unnecessary scans with MMF_DISABLE_THP check") Signed-off-by: Lance Yang Suggested-by: Yang Shi Reviewed-by: Yang Shi Reviewed-by: David Hildenbrand Cc: Michal Hocko Cc: Minchan Kim Cc: Muchun Song Cc: Peter Xu Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/khugepaged.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 2771fc043b3b..1c0073daad82 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -920,7 +920,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, { struct vm_area_struct *vma; - if (unlikely(hpage_collapse_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; *vmap = vma = find_vma(mm, address); @@ -1428,7 +1428,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) lockdep_assert_held(&khugepaged_mm_lock); - if (hpage_collapse_test_exit_or_disable(mm)) { + if (hpage_collapse_test_exit(mm)) { /* free mm_slot */ hash_del(&slot->hash); list_del(&slot->mm_node); @@ -2456,7 +2456,7 @@ breakouterloop_mmap_lock: * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. */ - if (hpage_collapse_test_exit_or_disable(mm) || !vma) { + if (hpage_collapse_test_exit(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find From 99fbb6bfc16f202adc411ad5d353db214750d121 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:35 +0000 Subject: [PATCH 360/435] mm: make folios_put() the basis of release_pages() Patch series "Rearrange batched folio freeing", v3. Other than the obvious "remove calls to compound_head" changes, the fundamental belief here is that iterating a linked list is much slower than iterating an array (5-15x slower in my testing). There's also an associated belief that since we iterate the batch of folios three times, we do better when the array is small (ie 15 entries) than we do with a batch that is hundreds of entries long, which only gives us the opportunity for the first pages to fall out of cache by the time we get to the end. It is possible we should increase the size of folio_batch. Hopefully the bots let us know if this introduces any performance regressions. This patch (of 3): By making release_pages() call folios_put(), we can get rid of the calls to compound_head() for the callers that already know they have folios. We can also get rid of the lock_batch tracking as we know the size of the batch is limited by folio_batch. This does reduce the maximum number of pages for which the lruvec lock is held, from SWAP_CLUSTER_MAX (32) to PAGEVEC_SIZE (15). I do not expect this to make a significant difference, but if it does, we can increase PAGEVEC_SIZE to 31. Link: https://lkml.kernel.org/r/20240227174254.710559-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240227174254.710559-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 +++++--- mm/mlock.c | 3 +- mm/swap.c | 100 ++++++++++++++++++++++++++------------------- 3 files changed, 70 insertions(+), 49 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 49e22a2f6ccc..5c57fde9b69b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -36,6 +36,7 @@ struct anon_vma; struct anon_vma_chain; struct user_struct; struct pt_regs; +struct folio_batch; extern int sysctl_page_lock_unfairness; @@ -1512,6 +1513,8 @@ static inline void folio_put_refs(struct folio *folio, int refs) __folio_put(folio); } +void folios_put_refs(struct folio_batch *folios, unsigned int *refs); + /* * union release_pages_arg - an array of pages or folios * @@ -1534,18 +1537,19 @@ void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. * @folios: The folios. - * @nr: How many folios there are. * - * Like folio_put(), but for an array of folios. This is more efficient - * than writing the loop yourself as it will optimise the locks which - * need to be taken if the folios are freed. + * Like folio_put(), but for a batch of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which need + * to be taken if the folios are freed. The folios batch is returned + * empty and ready to be reused for another batch; there is no need to + * reinitialise it. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ -static inline void folios_put(struct folio **folios, unsigned int nr) +static inline void folios_put(struct folio_batch *folios) { - release_pages(folios, nr); + folios_put_refs(folios, NULL); } static inline void put_page(struct page *page) diff --git a/mm/mlock.c b/mm/mlock.c index 086546ac5766..1ed2f2ab37cd 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -206,8 +206,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch) if (lruvec) unlock_page_lruvec_irq(lruvec); - folios_put(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_reinit(fbatch); + folios_put(fbatch); } void mlock_drain_local(void) diff --git a/mm/swap.c b/mm/swap.c index e5380d732c0d..3d51f8c72017 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -89,7 +89,7 @@ static void __page_cache_release(struct folio *folio) __folio_clear_lru_flags(folio); unlock_page_lruvec_irqrestore(lruvec, flags); } - /* See comment on folio_test_mlocked in release_pages() */ + /* See comment on folio_test_mlocked in folios_put() */ if (unlikely(folio_test_mlocked(folio))) { long nr_pages = folio_nr_pages(folio); @@ -175,7 +175,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily - * true of release_pages(): but those only clear the mlocked flag after + * true of folios_put(): but those only clear the mlocked flag after * folio_put_testzero() has excluded any other users of the folio.) */ if (folio_evictable(folio)) { @@ -221,8 +221,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); - folios_put(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_reinit(fbatch); + folios_put(fbatch); } static void folio_batch_add_and_move(struct folio_batch *fbatch, @@ -946,47 +945,30 @@ void lru_cache_disable(void) } /** - * release_pages - batched put_page() - * @arg: array of pages to release - * @nr: number of pages + * folios_put_refs - Reduce the reference count on a batch of folios. + * @folios: The folios. + * @refs: The number of refs to subtract from each folio. * - * Decrement the reference count on all the pages in @arg. If it - * fell to zero, remove the page from the LRU and free it. + * Like folio_put(), but for a batch of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which need + * to be taken if the folios are freed. The folios batch is returned + * empty and ready to be reused for another batch; there is no need + * to reinitialise it. If @refs is NULL, we subtract one from each + * folio refcount. * - * Note that the argument can be an array of pages, encoded pages, - * or folio pointers. We ignore any encoded bits, and turn any of - * them into just a folio that gets free'd. + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. */ -void release_pages(release_pages_arg arg, int nr) +void folios_put_refs(struct folio_batch *folios, unsigned int *refs) { int i; - struct encoded_page **encoded = arg.encoded_pages; LIST_HEAD(pages_to_free); struct lruvec *lruvec = NULL; unsigned long flags = 0; - unsigned int lock_batch; - for (i = 0; i < nr; i++) { - unsigned int nr_refs = 1; - struct folio *folio; - - /* Turn any of the argument types into a folio */ - folio = page_folio(encoded_page_ptr(encoded[i])); - - /* Is our next entry actually "nr_pages" -> "nr_refs" ? */ - if (unlikely(encoded_page_flags(encoded[i]) & - ENCODED_PAGE_BIT_NR_PAGES_NEXT)) - nr_refs = encoded_nr_pages(encoded[++i]); - - /* - * Make sure the IRQ-safe lock-holding time does not get - * excessive with a continuous string of pages from the - * same lruvec. The lock is held only if lruvec != NULL. - */ - if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) { - unlock_page_lruvec_irqrestore(lruvec, flags); - lruvec = NULL; - } + for (i = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; + unsigned int nr_refs = refs ? refs[i] : 1; if (is_huge_zero_page(&folio->page)) continue; @@ -1016,13 +998,8 @@ void release_pages(release_pages_arg arg, int nr) } if (folio_test_lru(folio)) { - struct lruvec *prev_lruvec = lruvec; - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); - if (prev_lruvec != lruvec) - lock_batch = 0; - lruvec_del_folio(lruvec, folio); __folio_clear_lru_flags(folio); } @@ -1046,6 +1023,47 @@ void release_pages(release_pages_arg arg, int nr) mem_cgroup_uncharge_list(&pages_to_free); free_unref_page_list(&pages_to_free); + folio_batch_reinit(folios); +} +EXPORT_SYMBOL(folios_put_refs); + +/** + * release_pages - batched put_page() + * @arg: array of pages to release + * @nr: number of pages + * + * Decrement the reference count on all the pages in @arg. If it + * fell to zero, remove the page from the LRU and free it. + * + * Note that the argument can be an array of pages, encoded pages, + * or folio pointers. We ignore any encoded bits, and turn any of + * them into just a folio that gets free'd. + */ +void release_pages(release_pages_arg arg, int nr) +{ + struct folio_batch fbatch; + int refs[PAGEVEC_SIZE]; + struct encoded_page **encoded = arg.encoded_pages; + int i; + + folio_batch_init(&fbatch); + for (i = 0; i < nr; i++) { + /* Turn any of the argument types into a folio */ + struct folio *folio = page_folio(encoded_page_ptr(encoded[i])); + + /* Is our next entry actually "nr_pages" -> "nr_refs" ? */ + refs[fbatch.nr] = 1; + if (unlikely(encoded_page_flags(encoded[i]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + refs[fbatch.nr] = encoded_nr_pages(encoded[++i]); + + if (folio_batch_add(&fbatch, folio) > 0) + continue; + folios_put_refs(&fbatch, refs); + } + + if (fbatch.nr) + folios_put_refs(&fbatch, refs); } EXPORT_SYMBOL(release_pages); From 7c76d92253dbb7c53ba03a4cd6639113cd1f7d3a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:36 +0000 Subject: [PATCH 361/435] mm: convert free_unref_page_list() to use folios Most of its callees are not yet ready to accept a folio, but we know all of the pages passed in are actually folios because they're linked through ->lru. Link: https://lkml.kernel.org/r/20240227174254.710559-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ff1f159251df..20d4ba095ad2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2520,17 +2520,17 @@ void free_unref_page(struct page *page, unsigned int order) void free_unref_page_list(struct list_head *list) { unsigned long __maybe_unused UP_flags; - struct page *page, *next; + struct folio *folio, *next; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; int batch_count = 0; int migratetype; /* Prepare pages for freeing */ - list_for_each_entry_safe(page, next, list, lru) { - unsigned long pfn = page_to_pfn(page); - if (!free_unref_page_prepare(page, pfn, 0)) { - list_del(&page->lru); + list_for_each_entry_safe(folio, next, list, lru) { + unsigned long pfn = folio_pfn(folio); + if (!free_unref_page_prepare(&folio->page, pfn, 0)) { + list_del(&folio->lru); continue; } @@ -2538,24 +2538,25 @@ void free_unref_page_list(struct list_head *list) * Free isolated pages directly to the allocator, see * comment in free_unref_page. */ - migratetype = get_pcppage_migratetype(page); + migratetype = get_pcppage_migratetype(&folio->page); if (unlikely(is_migrate_isolate(migratetype))) { - list_del(&page->lru); - free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); + list_del(&folio->lru); + free_one_page(folio_zone(folio), &folio->page, pfn, + 0, migratetype, FPI_NONE); continue; } } - list_for_each_entry_safe(page, next, list, lru) { - struct zone *zone = page_zone(page); + list_for_each_entry_safe(folio, next, list, lru) { + struct zone *zone = folio_zone(folio); - list_del(&page->lru); - migratetype = get_pcppage_migratetype(page); + list_del(&folio->lru); + migratetype = get_pcppage_migratetype(&folio->page); /* * Either different zone requiring a different pcp lock or * excessive lock hold times when freeing a large list of - * pages. + * folios. */ if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) { if (pcp) { @@ -2566,15 +2567,16 @@ void free_unref_page_list(struct list_head *list) batch_count = 0; /* - * trylock is necessary as pages may be getting freed + * trylock is necessary as folios may be getting freed * from IRQ or SoftIRQ context after an IO completion. */ pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (unlikely(!pcp)) { pcp_trylock_finish(UP_flags); - free_one_page(zone, page, page_to_pfn(page), - 0, migratetype, FPI_NONE); + free_one_page(zone, &folio->page, + folio_pfn(folio), 0, + migratetype, FPI_NONE); locked_zone = NULL; continue; } @@ -2588,8 +2590,8 @@ void free_unref_page_list(struct list_head *list) if (unlikely(migratetype >= MIGRATE_PCPTYPES)) migratetype = MIGRATE_MOVABLE; - trace_mm_page_free_batched(page); - free_unref_page_commit(zone, pcp, page, migratetype, 0); + trace_mm_page_free_batched(&folio->page); + free_unref_page_commit(zone, pcp, &folio->page, migratetype, 0); batch_count++; } From 90491d87dd46a4c843dae775b9e72c91624c5a7b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:37 +0000 Subject: [PATCH 362/435] mm: add free_unref_folios() Iterate over a folio_batch rather than a linked list. This is easier for the CPU to prefetch and has a batch count naturally built in so we don't need to track it. Again, this lowers the maximum lock hold time from 32 folios to 15, but I do not expect this to have a significant effect. Link: https://lkml.kernel.org/r/20240227174254.710559-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/internal.h | 5 +++-- mm/page_alloc.c | 59 ++++++++++++++++++++++++++++++------------------- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index f376e3afbc4c..1dfdc3bde1b0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -451,8 +451,9 @@ extern bool free_pages_prepare(struct page *page, unsigned int order); extern int user_min_free_kbytes; -extern void free_unref_page(struct page *page, unsigned int order); -extern void free_unref_page_list(struct list_head *list); +void free_unref_page(struct page *page, unsigned int order); +void free_unref_folios(struct folio_batch *fbatch); +void free_unref_page_list(struct list_head *list); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 20d4ba095ad2..31d97322feea 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -2515,57 +2516,51 @@ void free_unref_page(struct page *page, unsigned int order) } /* - * Free a list of 0-order pages + * Free a batch of 0-order pages */ -void free_unref_page_list(struct list_head *list) +void free_unref_folios(struct folio_batch *folios) { unsigned long __maybe_unused UP_flags; - struct folio *folio, *next; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; - int batch_count = 0; - int migratetype; + int i, j, migratetype; - /* Prepare pages for freeing */ - list_for_each_entry_safe(folio, next, list, lru) { + /* Prepare folios for freeing */ + for (i = 0, j = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; unsigned long pfn = folio_pfn(folio); - if (!free_unref_page_prepare(&folio->page, pfn, 0)) { - list_del(&folio->lru); + if (!free_unref_page_prepare(&folio->page, pfn, 0)) continue; - } /* - * Free isolated pages directly to the allocator, see + * Free isolated folios directly to the allocator, see * comment in free_unref_page. */ migratetype = get_pcppage_migratetype(&folio->page); if (unlikely(is_migrate_isolate(migratetype))) { - list_del(&folio->lru); free_one_page(folio_zone(folio), &folio->page, pfn, 0, migratetype, FPI_NONE); continue; } + if (j != i) + folios->folios[j] = folio; + j++; } + folios->nr = j; - list_for_each_entry_safe(folio, next, list, lru) { + for (i = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; struct zone *zone = folio_zone(folio); - list_del(&folio->lru); migratetype = get_pcppage_migratetype(&folio->page); - /* - * Either different zone requiring a different pcp lock or - * excessive lock hold times when freeing a large list of - * folios. - */ - if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) { + /* Different zone requires a different pcp lock */ + if (zone != locked_zone) { if (pcp) { pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); } - batch_count = 0; - /* * trylock is necessary as folios may be getting freed * from IRQ or SoftIRQ context after an IO completion. @@ -2592,13 +2587,31 @@ void free_unref_page_list(struct list_head *list) trace_mm_page_free_batched(&folio->page); free_unref_page_commit(zone, pcp, &folio->page, migratetype, 0); - batch_count++; } if (pcp) { pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); } + folio_batch_reinit(folios); +} + +void free_unref_page_list(struct list_head *list) +{ + struct folio_batch fbatch; + + folio_batch_init(&fbatch); + while (!list_empty(list)) { + struct folio *folio = list_first_entry(list, struct folio, lru); + + list_del(&folio->lru); + if (folio_batch_add(&fbatch, folio) > 0) + continue; + free_unref_folios(&fbatch); + } + + if (fbatch.nr) + free_unref_folios(&fbatch); } /* From 6871cc5742f411bf8ebbcb78b4afeb992d888228 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:38 +0000 Subject: [PATCH 363/435] mm: use folios_put() in __folio_batch_release() There's no need to indirect through release_pages() and iterate over this batch of folios an extra time; we can just use the batch that we have. Link: https://lkml.kernel.org/r/20240227174254.710559-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/swap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 3d51f8c72017..1cfb7b897ebd 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1083,8 +1083,7 @@ void __folio_batch_release(struct folio_batch *fbatch) lru_add_drain(); fbatch->percpu_pvec_drained = true; } - release_pages(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_reinit(fbatch); + folios_put(fbatch); } EXPORT_SYMBOL(__folio_batch_release); From 4882c80975e2bf7241a5b043eb1dbe8df2726a29 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:39 +0000 Subject: [PATCH 364/435] memcg: add mem_cgroup_uncharge_folios() Almost identical to mem_cgroup_uncharge_list(), except it takes a folio_batch instead of a list_head. Link: https://lkml.kernel.org/r/20240227174254.710559-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 14 ++++++++++++-- mm/memcontrol.c | 13 +++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9a2dea92be0e..35a0d7a851f0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -721,10 +721,16 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) __mem_cgroup_uncharge_list(page_list); } +void __mem_cgroup_uncharge_folios(struct folio_batch *folios); +static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge_folios(folios); +} + void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); - void mem_cgroup_replace_folio(struct folio *old, struct folio *new); - void mem_cgroup_migrate(struct folio *old, struct folio *new); /** @@ -1299,6 +1305,10 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } +static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) +{ +} + static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e4864e2729e6..2edb250d131b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -7531,6 +7532,18 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list) uncharge_batch(&ug); } +void __mem_cgroup_uncharge_folios(struct folio_batch *folios) +{ + struct uncharge_gather ug; + unsigned int i; + + uncharge_gather_clear(&ug); + for (i = 0; i < folios->nr; i++) + uncharge_folio(folios->folios[i], &ug); + if (ug.memcg) + uncharge_batch(&ug); +} + /** * mem_cgroup_replace_folio - Charge a folio's replacement. * @old: Currently circulating folio. From 7c33b8c4229af19797c78de48827ca70228c1f47 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:40 +0000 Subject: [PATCH 365/435] mm: remove use of folio list from folios_put() Instead of putting the interesting folios on a list, delete the uninteresting one from the folio_batch. Link: https://lkml.kernel.org/r/20240227174254.710559-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/swap.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 1cfb7b897ebd..ee8b131bf32c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -961,12 +961,11 @@ void lru_cache_disable(void) */ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) { - int i; - LIST_HEAD(pages_to_free); + int i, j; struct lruvec *lruvec = NULL; unsigned long flags = 0; - for (i = 0; i < folios->nr; i++) { + for (i = 0, j = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; unsigned int nr_refs = refs ? refs[i] : 1; @@ -1016,14 +1015,20 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) count_vm_event(UNEVICTABLE_PGCLEARED); } - list_add(&folio->lru, &pages_to_free); + if (j != i) + folios->folios[j] = folio; + j++; } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); + if (!j) { + folio_batch_reinit(folios); + return; + } - mem_cgroup_uncharge_list(&pages_to_free); - free_unref_page_list(&pages_to_free); - folio_batch_reinit(folios); + folios->nr = j; + mem_cgroup_uncharge_folios(folios); + free_unref_folios(folios); } EXPORT_SYMBOL(folios_put_refs); From 24835f899c0129a4733e899e4da20e2e72f40bd9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:41 +0000 Subject: [PATCH 366/435] mm: use free_unref_folios() in put_pages_list() Break up the list of folios into batches here so that the folios are more likely to be cache hot when doing the rest of the processing. Link: https://lkml.kernel.org/r/20240227174254.710559-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/swap.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index ee8b131bf32c..ad3f2e9448a4 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -138,22 +138,25 @@ EXPORT_SYMBOL(__folio_put); */ void put_pages_list(struct list_head *pages) { - struct folio *folio, *next; + struct folio_batch fbatch; + struct folio *folio; - list_for_each_entry_safe(folio, next, pages, lru) { - if (!folio_put_testzero(folio)) { - list_del(&folio->lru); + folio_batch_init(&fbatch); + list_for_each_entry(folio, pages, lru) { + if (!folio_put_testzero(folio)) continue; - } if (folio_test_large(folio)) { - list_del(&folio->lru); __folio_put_large(folio); continue; } /* LRU flag must be clear because it's passed using the lru */ + if (folio_batch_add(&fbatch, folio) > 0) + continue; + free_unref_folios(&fbatch); } - free_unref_page_list(pages); + if (fbatch.nr) + free_unref_folios(&fbatch); INIT_LIST_HEAD(pages); } EXPORT_SYMBOL(put_pages_list); From f1ee018baee9f4e724e08859c2559323be768be3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:42 +0000 Subject: [PATCH 367/435] mm: use __page_cache_release() in folios_put() Pass a pointer to the lruvec so we can take advantage of the folio_lruvec_relock_irqsave(). Adjust the calling convention of folio_lruvec_relock_irqsave() to suit and add a page_cache_release() wrapper. Link: https://lkml.kernel.org/r/20240227174254.710559-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 16 +++++----- mm/swap.c | 62 ++++++++++++++++++-------------------- 2 files changed, 37 insertions(+), 41 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 35a0d7a851f0..b7f5e0c17de7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1705,18 +1705,18 @@ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, return folio_lruvec_lock_irq(folio); } -/* Don't lock again iff page's lruvec locked */ -static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio, - struct lruvec *locked_lruvec, unsigned long *flags) +/* Don't lock again iff folio's lruvec locked */ +static inline void folio_lruvec_relock_irqsave(struct folio *folio, + struct lruvec **lruvecp, unsigned long *flags) { - if (locked_lruvec) { - if (folio_matches_lruvec(folio, locked_lruvec)) - return locked_lruvec; + if (*lruvecp) { + if (folio_matches_lruvec(folio, *lruvecp)) + return; - unlock_page_lruvec_irqrestore(locked_lruvec, *flags); + unlock_page_lruvec_irqrestore(*lruvecp, *flags); } - return folio_lruvec_lock_irqsave(folio, flags); + *lruvecp = folio_lruvec_lock_irqsave(folio, flags); } #ifdef CONFIG_CGROUP_WRITEBACK diff --git a/mm/swap.c b/mm/swap.c index ad3f2e9448a4..dce5ea67ae05 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -74,22 +74,21 @@ static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), }; -/* - * This path almost never happens for VM activity - pages are normally freed - * in batches. But it gets used by networking - and for compound pages. - */ -static void __page_cache_release(struct folio *folio) +static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp, + unsigned long *flagsp) { if (folio_test_lru(folio)) { - struct lruvec *lruvec; - unsigned long flags; - - lruvec = folio_lruvec_lock_irqsave(folio, &flags); - lruvec_del_folio(lruvec, folio); + folio_lruvec_relock_irqsave(folio, lruvecp, flagsp); + lruvec_del_folio(*lruvecp, folio); __folio_clear_lru_flags(folio); - unlock_page_lruvec_irqrestore(lruvec, flags); } - /* See comment on folio_test_mlocked in folios_put() */ + + /* + * In rare cases, when truncation or holepunching raced with + * munlock after VM_LOCKED was cleared, Mlocked may still be + * found set here. This does not indicate a problem, unless + * "unevictable_pgs_cleared" appears worryingly large. + */ if (unlikely(folio_test_mlocked(folio))) { long nr_pages = folio_nr_pages(folio); @@ -99,9 +98,23 @@ static void __page_cache_release(struct folio *folio) } } +/* + * This path almost never happens for VM activity - pages are normally freed + * in batches. But it gets used by networking - and for compound pages. + */ +static void page_cache_release(struct folio *folio) +{ + struct lruvec *lruvec = NULL; + unsigned long flags; + + __page_cache_release(folio, &lruvec, &flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); +} + static void __folio_put_small(struct folio *folio) { - __page_cache_release(folio); + page_cache_release(folio); mem_cgroup_uncharge(folio); free_unref_page(&folio->page, 0); } @@ -115,7 +128,7 @@ static void __folio_put_large(struct folio *folio) * be called for hugetlb (it has a separate hugetlb_cgroup.) */ if (!folio_test_hugetlb(folio)) - __page_cache_release(folio); + page_cache_release(folio); destroy_large_folio(folio); } @@ -216,7 +229,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (move_fn != lru_add_fn && !folio_test_clear_lru(folio)) continue; - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); + folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); folio_set_lru(folio); @@ -999,24 +1012,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) continue; } - if (folio_test_lru(folio)) { - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, - &flags); - lruvec_del_folio(lruvec, folio); - __folio_clear_lru_flags(folio); - } - - /* - * In rare cases, when truncation or holepunching raced with - * munlock after VM_LOCKED was cleared, Mlocked may still be - * found set here. This does not indicate a problem, unless - * "unevictable_pgs_cleared" appears worryingly large. - */ - if (unlikely(folio_test_mlocked(folio))) { - __folio_clear_mlocked(folio); - zone_stat_sub_folio(folio, NR_MLOCK); - count_vm_event(UNEVICTABLE_PGCLEARED); - } + __page_cache_release(folio, &lruvec, &flags); if (j != i) folios->folios[j] = folio; From 31b2ff82aefb33ce92496a1becddd6ce51060db2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:43 +0000 Subject: [PATCH 368/435] mm: handle large folios in free_unref_folios() Call folio_undo_large_rmappable() if needed. free_unref_page_prepare() destroys the ability to call folio_order(), so stash the order in folio->private for the benefit of the second loop. Link: https://lkml.kernel.org/r/20240227174254.710559-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/page_alloc.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 31d97322feea..025ad1a7df7b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2516,7 +2516,7 @@ void free_unref_page(struct page *page, unsigned int order) } /* - * Free a batch of 0-order pages + * Free a batch of folios */ void free_unref_folios(struct folio_batch *folios) { @@ -2529,19 +2529,25 @@ void free_unref_folios(struct folio_batch *folios) for (i = 0, j = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; unsigned long pfn = folio_pfn(folio); - if (!free_unref_page_prepare(&folio->page, pfn, 0)) + unsigned int order = folio_order(folio); + + if (order > 0 && folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + if (!free_unref_page_prepare(&folio->page, pfn, order)) continue; /* - * Free isolated folios directly to the allocator, see - * comment in free_unref_page. + * Free isolated folios and orders not handled on the PCP + * directly to the allocator, see comment in free_unref_page. */ migratetype = get_pcppage_migratetype(&folio->page); - if (unlikely(is_migrate_isolate(migratetype))) { + if (!pcp_allowed_order(order) || + is_migrate_isolate(migratetype)) { free_one_page(folio_zone(folio), &folio->page, pfn, - 0, migratetype, FPI_NONE); + order, migratetype, FPI_NONE); continue; } + folio->private = (void *)(unsigned long)order; if (j != i) folios->folios[j] = folio; j++; @@ -2551,7 +2557,9 @@ void free_unref_folios(struct folio_batch *folios) for (i = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; struct zone *zone = folio_zone(folio); + unsigned int order = (unsigned long)folio->private; + folio->private = NULL; migratetype = get_pcppage_migratetype(&folio->page); /* Different zone requires a different pcp lock */ @@ -2570,7 +2578,7 @@ void free_unref_folios(struct folio_batch *folios) if (unlikely(!pcp)) { pcp_trylock_finish(UP_flags); free_one_page(zone, &folio->page, - folio_pfn(folio), 0, + folio_pfn(folio), order, migratetype, FPI_NONE); locked_zone = NULL; continue; @@ -2586,7 +2594,8 @@ void free_unref_folios(struct folio_batch *folios) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(&folio->page); - free_unref_page_commit(zone, pcp, &folio->page, migratetype, 0); + free_unref_page_commit(zone, pcp, &folio->page, migratetype, + order); } if (pcp) { From f77171d241e379ea93448a53d58104191e02135c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:44 +0000 Subject: [PATCH 369/435] mm: allow non-hugetlb large folios to be batch processed Hugetlb folios still get special treatment, but normal large folios can now be freed by free_unref_folios(). This should have a reasonable performance impact, TBD. Link: https://lkml.kernel.org/r/20240227174254.710559-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/swap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index dce5ea67ae05..6b697d33fa5b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1003,12 +1003,13 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) if (!folio_ref_sub_and_test(folio, nr_refs)) continue; - if (folio_test_large(folio)) { + /* hugetlb has its own memcg */ + if (folio_test_hugetlb(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - __folio_put_large(folio); + free_huge_folio(folio); continue; } From bc2ff4cbc3294c01f29449405c42ee26ee0e1f59 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:45 +0000 Subject: [PATCH 370/435] mm: free folios in a batch in shrink_folio_list() Use free_unref_page_batch() to free the folios. This may increase the number of IPIs from calling try_to_unmap_flush() more often, but that's going to be very workload-dependent. It may even reduce the number of IPIs as we now batch-free large folios instead of freeing them one at a time. Link: https://lkml.kernel.org/r/20240227174254.710559-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: David Hildenbrand Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/vmscan.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index dcfbe617e9ef..144879d5bb89 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1006,14 +1006,15 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, struct pglist_data *pgdat, struct scan_control *sc, struct reclaim_stat *stat, bool ignore_references) { + struct folio_batch free_folios; LIST_HEAD(ret_folios); - LIST_HEAD(free_folios); LIST_HEAD(demote_folios); unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; bool do_demote_pass; struct swap_iocb *plug = NULL; + folio_batch_init(&free_folios); memset(stat, 0, sizeof(*stat)); cond_resched(); do_demote_pass = can_demote(pgdat->node_id, sc); @@ -1412,14 +1413,11 @@ free_it: */ nr_reclaimed += nr_pages; - /* - * Is there need to periodically free_folio_list? It would - * appear not as the counts should be low - */ - if (unlikely(folio_test_large(folio))) - destroy_large_folio(folio); - else - list_add(&folio->lru, &free_folios); + if (folio_batch_add(&free_folios, folio) == 0) { + mem_cgroup_uncharge_folios(&free_folios); + try_to_unmap_flush(); + free_unref_folios(&free_folios); + } continue; activate_locked_split: @@ -1483,9 +1481,9 @@ keep: pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; - mem_cgroup_uncharge_list(&free_folios); + mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); - free_unref_page_list(&free_folios); + free_unref_folios(&free_folios); list_splice(&ret_folios, folio_list); count_vm_events(PGACTIVATE, pgactivate); From 29f3843026cf83414a8bc319c97c1d09a6c33f4e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:46 +0000 Subject: [PATCH 371/435] mm: free folios directly in move_folios_to_lru() The few folios which can't be moved to the LRU list (because their refcount dropped to zero) used to be returned to the caller to dispose of. Make this simpler to call by freeing the folios directly through free_unref_folios(). Link: https://lkml.kernel.org/r/20240227174254.710559-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/vmscan.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 144879d5bb89..e3349b75f15b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1781,7 +1781,6 @@ static bool too_many_isolated(struct pglist_data *pgdat, int file, /* * move_folios_to_lru() moves folios from private @list to appropriate LRU list. - * On return, @list is reused as a list of folios to be freed by the caller. * * Returns the number of pages moved to the given lruvec. */ @@ -1789,8 +1788,9 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, struct list_head *list) { int nr_pages, nr_moved = 0; - LIST_HEAD(folios_to_free); + struct folio_batch free_folios; + folio_batch_init(&free_folios); while (!list_empty(list)) { struct folio *folio = lru_to_folio(list); @@ -1819,12 +1819,12 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, if (unlikely(folio_put_testzero(folio))) { __folio_clear_lru_flags(folio); - if (unlikely(folio_test_large(folio))) { + if (folio_batch_add(&free_folios, folio) == 0) { spin_unlock_irq(&lruvec->lru_lock); - destroy_large_folio(folio); + mem_cgroup_uncharge_folios(&free_folios); + free_unref_folios(&free_folios); spin_lock_irq(&lruvec->lru_lock); - } else - list_add(&folio->lru, &folios_to_free); + } continue; } @@ -1841,10 +1841,12 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, workingset_age_nonresident(lruvec, nr_pages); } - /* - * To save our caller's stack, now use input list for pages to free. - */ - list_splice(&folios_to_free, list); + if (free_folios.nr) { + spin_unlock_irq(&lruvec->lru_lock); + mem_cgroup_uncharge_folios(&free_folios); + free_unref_folios(&free_folios); + spin_lock_irq(&lruvec->lru_lock); + } return nr_moved; } @@ -1923,8 +1925,6 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, spin_unlock_irq(&lruvec->lru_lock); lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); - mem_cgroup_uncharge_list(&folio_list); - free_unref_page_list(&folio_list); /* * If dirty folios are scanned that are not queued for IO, it @@ -2065,8 +2065,6 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_activate = move_folios_to_lru(lruvec, &l_active); nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); - /* Keep all free folios in l_active list */ - list_splice(&l_inactive, &l_active); __count_vm_events(PGDEACTIVATE, nr_deactivate); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); @@ -2076,8 +2074,6 @@ static void shrink_active_list(unsigned long nr_to_scan, if (nr_rotated) lru_note_cost(lruvec, file, 0, nr_rotated); - mem_cgroup_uncharge_list(&l_active); - free_unref_page_list(&l_active); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } @@ -4578,10 +4574,6 @@ retry: spin_unlock_irq(&lruvec->lru_lock); - mem_cgroup_uncharge_list(&list); - free_unref_page_list(&list); - - INIT_LIST_HEAD(&list); list_splice_init(&clean, &list); if (!list_empty(&list)) { From be5a9e17a2ccbecfb7020aa1938e2c62d8a9189c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:47 +0000 Subject: [PATCH 372/435] memcg: remove mem_cgroup_uncharge_list() All users have been converted to mem_cgroup_uncharge_folios() so we can remove this API. Link: https://lkml.kernel.org/r/20240227174254.710559-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 12 ------------ mm/memcontrol.c | 19 ------------------- 2 files changed, 31 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b7f5e0c17de7..394fd0a887ae 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -713,14 +713,6 @@ static inline void mem_cgroup_uncharge(struct folio *folio) __mem_cgroup_uncharge(folio); } -void __mem_cgroup_uncharge_list(struct list_head *page_list); -static inline void mem_cgroup_uncharge_list(struct list_head *page_list) -{ - if (mem_cgroup_disabled()) - return; - __mem_cgroup_uncharge_list(page_list); -} - void __mem_cgroup_uncharge_folios(struct folio_batch *folios); static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { @@ -1301,10 +1293,6 @@ static inline void mem_cgroup_uncharge(struct folio *folio) { } -static inline void mem_cgroup_uncharge_list(struct list_head *page_list) -{ -} - static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2edb250d131b..fabce2b50c69 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7513,25 +7513,6 @@ void __mem_cgroup_uncharge(struct folio *folio) uncharge_batch(&ug); } -/** - * __mem_cgroup_uncharge_list - uncharge a list of page - * @page_list: list of pages to uncharge - * - * Uncharge a list of pages previously charged with - * __mem_cgroup_charge(). - */ -void __mem_cgroup_uncharge_list(struct list_head *page_list) -{ - struct uncharge_gather ug; - struct folio *folio; - - uncharge_gather_clear(&ug); - list_for_each_entry(folio, page_list, lru) - uncharge_folio(folio, &ug); - if (ug.memcg) - uncharge_batch(&ug); -} - void __mem_cgroup_uncharge_folios(struct folio_batch *folios) { struct uncharge_gather ug; From 8b7b0a5eee22e3cd0468944d0720120c36340a2b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:48 +0000 Subject: [PATCH 373/435] mm: remove free_unref_page_list() All callers now use free_unref_folios() so we can delete this function. Link: https://lkml.kernel.org/r/20240227174254.710559-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/internal.h | 1 - mm/page_alloc.c | 18 ------------------ 2 files changed, 19 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 1dfdc3bde1b0..b1d806125e8d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -453,7 +453,6 @@ extern int user_min_free_kbytes; void free_unref_page(struct page *page, unsigned int order); void free_unref_folios(struct folio_batch *fbatch); -void free_unref_page_list(struct list_head *list); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 025ad1a7df7b..7873e9375802 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2605,24 +2605,6 @@ void free_unref_folios(struct folio_batch *folios) folio_batch_reinit(folios); } -void free_unref_page_list(struct list_head *list) -{ - struct folio_batch fbatch; - - folio_batch_init(&fbatch); - while (!list_empty(list)) { - struct folio *folio = list_first_entry(list, struct folio, lru); - - list_del(&folio->lru); - if (folio_batch_add(&fbatch, folio) > 0) - continue; - free_unref_folios(&fbatch); - } - - if (fbatch.nr) - free_unref_folios(&fbatch); -} - /* * split_page takes a non-compound higher-order page, and splits it into * n (1< Date: Tue, 27 Feb 2024 17:42:49 +0000 Subject: [PATCH 374/435] mm: remove lru_to_page() The last user was removed over a year ago; remove the definition. Link: https://lkml.kernel.org/r/20240227174254.710559-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5c57fde9b69b..d45eadc440f5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -227,7 +227,6 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) -#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); From 4907e80b76af004b6af42f0d4131e23ac73bc07c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:50 +0000 Subject: [PATCH 375/435] mm: convert free_pages_and_swap_cache() to use folios_put() Process the pages in batch-sized quantities instead of all-at-once. Link: https://lkml.kernel.org/r/20240227174254.710559-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/swap_state.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 2f540748f7c0..96b5c585f047 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -310,21 +311,25 @@ void free_page_and_swap_cache(struct page *page) */ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { - lru_add_drain(); - for (int i = 0; i < nr; i++) { - struct page *page = encoded_page_ptr(pages[i]); + struct folio_batch folios; + unsigned int refs[PAGEVEC_SIZE]; - /* - * Skip over the "nr_pages" entry. It's sufficient to call - * free_swap_cache() only once per folio. - */ + lru_add_drain(); + folio_batch_init(&folios); + for (int i = 0; i < nr; i++) { + struct folio *folio = page_folio(encoded_page_ptr(pages[i])); + + free_swap_cache(&folio->page); + refs[folios.nr] = 1; if (unlikely(encoded_page_flags(pages[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) - i++; + refs[folios.nr] = encoded_nr_pages(pages[++i]); - free_swap_cache(page); + if (folio_batch_add(&folios, folio) == 0) + folios_put_refs(&folios, refs); } - release_pages(pages, nr); + if (folios.nr) + folios_put_refs(&folios, refs); } static inline bool swap_use_vma_readahead(void) From d4111eecdc3c2a5eabafcc467dbfce0e216fa485 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:51 +0000 Subject: [PATCH 376/435] mm: use a folio in __collapse_huge_page_copy_succeeded() These pages are all chained together through the lru list, so we know they're folios. Use the folio APIs to save three hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20240227174254.710559-18-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/khugepaged.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1c0073daad82..b9223e803262 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -689,9 +689,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, spinlock_t *ptl, struct list_head *compound_pagelist) { - struct folio *src_folio; - struct page *src_page; - struct page *tmp; + struct folio *src, *tmp; pte_t *_pte; pte_t pteval; @@ -710,10 +708,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, ksm_might_unmap_zero_page(vma->vm_mm, pteval); } } else { - src_page = pte_page(pteval); - src_folio = page_folio(src_page); - if (!folio_test_large(src_folio)) - release_pte_folio(src_folio); + struct page *src_page = pte_page(pteval); + + src = page_folio(src_page); + if (!folio_test_large(src)) + release_pte_folio(src); /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats @@ -721,20 +720,19 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, */ spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); - folio_remove_rmap_pte(src_folio, src_page, vma); + folio_remove_rmap_pte(src, src_page, vma); spin_unlock(ptl); free_page_and_swap_cache(src_page); } } - list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { - list_del(&src_page->lru); - mod_node_page_state(page_pgdat(src_page), - NR_ISOLATED_ANON + page_is_file_lru(src_page), - -compound_nr(src_page)); - unlock_page(src_page); - free_swap_cache(src_page); - putback_lru_page(src_page); + list_for_each_entry_safe(src, tmp, compound_pagelist, lru) { + list_del(&src->lru); + node_stat_sub_folio(src, NR_ISOLATED_ANON + + folio_is_file_lru(src)); + folio_unlock(src); + free_swap_cache(&src->page); + folio_putback_lru(src); } } From 63b774993dd02b17127cb404b7362fc436632995 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:52 +0000 Subject: [PATCH 377/435] mm: convert free_swap_cache() to take a folio All but one caller already has a folio, so convert free_page_and_swap_cache() to have a folio and remove the call to page_folio(). Link: https://lkml.kernel.org/r/20240227174254.710559-19-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Reviewed-by: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/swap.h | 8 ++++---- mm/khugepaged.c | 2 +- mm/memory.c | 2 +- mm/swap_state.c | 12 ++++++------ 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 8d28f6091a32..1ad6f63d1a52 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -447,9 +447,9 @@ static inline unsigned long total_swapcache_pages(void) return global_node_page_state(NR_SWAPCACHE); } -extern void free_swap_cache(struct page *page); -extern void free_page_and_swap_cache(struct page *); -extern void free_pages_and_swap_cache(struct encoded_page **, int); +void free_swap_cache(struct folio *folio); +void free_page_and_swap_cache(struct page *); +void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; @@ -531,7 +531,7 @@ static inline void put_swap_device(struct swap_info_struct *si) /* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */ #define free_swap_and_cache(e) is_pfn_swap_entry(e) -static inline void free_swap_cache(struct page *page) +static inline void free_swap_cache(struct folio *folio) { } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b9223e803262..38830174608f 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -731,7 +731,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, node_stat_sub_folio(src, NR_ISOLATED_ANON + folio_is_file_lru(src)); folio_unlock(src); - free_swap_cache(&src->page); + free_swap_cache(src); folio_putback_lru(src); } } diff --git a/mm/memory.c b/mm/memory.c index 1c45b6a42a1b..75e0abda8642 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3452,7 +3452,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) folio_put(new_folio); if (old_folio) { if (page_copied) - free_swap_cache(&old_folio->page); + free_swap_cache(old_folio); folio_put(old_folio); } diff --git a/mm/swap_state.c b/mm/swap_state.c index 96b5c585f047..bfc7e8c58a6d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -283,10 +283,8 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, * folio_free_swap() _with_ the lock. * - Marcelo */ -void free_swap_cache(struct page *page) +void free_swap_cache(struct folio *folio) { - struct folio *folio = page_folio(page); - if (folio_test_swapcache(folio) && !folio_mapped(folio) && folio_trylock(folio)) { folio_free_swap(folio); @@ -300,9 +298,11 @@ void free_swap_cache(struct page *page) */ void free_page_and_swap_cache(struct page *page) { - free_swap_cache(page); + struct folio *folio = page_folio(page); + + free_swap_cache(folio); if (!is_huge_zero_page(page)) - put_page(page); + folio_put(folio); } /* @@ -319,7 +319,7 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) for (int i = 0; i < nr; i++) { struct folio *folio = page_folio(encoded_page_ptr(pages[i])); - free_swap_cache(&folio->page); + free_swap_cache(folio); refs[folios.nr] = 1; if (unlikely(encoded_page_flags(pages[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) From 8f8cd6c0a43ed637e620bbe45a8d0e0c2f4d5130 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 27 Feb 2024 10:35:46 +0800 Subject: [PATCH 378/435] modules: wait do_free_init correctly The synchronization here is to ensure the ordering of freeing of a module init so that it happens before W+X checking. It is worth noting it is not that the freeing was not happening, it is just that our sanity checkers raced against the permission checkers which assume init memory is already gone. Commit 1a7b7d922081 ("modules: Use vmalloc special flag") moved calling do_free_init() into a global workqueue instead of relying on it being called through call_rcu(..., do_free_init), which used to allowed us call do_free_init() asynchronously after the end of a subsequent grace period. The move to a global workqueue broke the gaurantees for code which needed to be sure the do_free_init() would complete with rcu_barrier(). To fix this callers which used to rely on rcu_barrier() must now instead use flush_work(&init_free_wq). Without this fix, we still could encounter false positive reports in W+X checking since the rcu_barrier() here can not ensure the ordering now. Even worse, the rcu_barrier() can introduce significant delay. Eric Chanudet reported that the rcu_barrier introduces ~0.1s delay on a PREEMPT_RT kernel. [ 0.291444] Freeing unused kernel memory: 5568K [ 0.402442] Run /sbin/init as init process With this fix, the above delay can be eliminated. Link: https://lkml.kernel.org/r/20240227023546.2490667-1-changbin.du@huawei.com Fixes: 1a7b7d922081 ("modules: Use vmalloc special flag") Signed-off-by: Changbin Du Tested-by: Eric Chanudet Acked-by: Luis Chamberlain Cc: Xiaoyi Su Signed-off-by: Andrew Morton --- include/linux/moduleloader.h | 8 ++++++++ init/main.c | 5 +++-- kernel/module/main.c | 9 +++++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 001b2ce83832..89b1e0ed9811 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -115,6 +115,14 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod); +#ifdef CONFIG_MODULES +void flush_module_init_free_work(void); +#else +static inline void flush_module_init_free_work(void) +{ +} +#endif + /* Any cleanup needed when module leaves. */ void module_arch_cleanup(struct module *mod); diff --git a/init/main.c b/init/main.c index 749a9f8d2c9b..5ecd4e8cf6d3 100644 --- a/init/main.c +++ b/init/main.c @@ -88,6 +88,7 @@ #include #include #include +#include #include #include #include @@ -1403,11 +1404,11 @@ static void mark_readonly(void) if (rodata_enabled) { /* * load_module() results in W+X mappings, which are cleaned - * up with call_rcu(). Let's make sure that queued work is + * up with init_free_wq. Let's make sure that queued work is * flushed so that we don't hit false positives looking for * insecure pages which are W+X. */ - rcu_barrier(); + flush_module_init_free_work(); mark_rodata_ro(); debug_checkwx(); rodata_test(); diff --git a/kernel/module/main.c b/kernel/module/main.c index 36681911c05a..b0b99348e1a8 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2489,6 +2489,11 @@ static void do_free_init(struct work_struct *w) } } +void flush_module_init_free_work(void) +{ + flush_work(&init_free_wq); +} + #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "module." /* Default value for module->async_probe_requested */ @@ -2593,8 +2598,8 @@ static noinline int do_init_module(struct module *mod) * Note that module_alloc() on most architectures creates W+X page * mappings which won't be cleaned up until do_free_init() runs. Any * code such as mark_rodata_ro() which depends on those mappings to - * be cleaned up needs to sync with the queued work - ie - * rcu_barrier() + * be cleaned up needs to sync with the queued work by invoking + * flush_module_init_free_work(). */ if (llist_add(&freeinit->node, &init_free_list)) schedule_work(&init_free_wq); From d3246b6ee42a155ab57e936841028fff3d7d98b4 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Tue, 27 Feb 2024 09:49:52 +0800 Subject: [PATCH 379/435] crash_core: export vmemmap when CONFIG_SPARSEMEM_VMEMMAP is enabled In memory_model.h, if CONFIG_SPARSEMEM_VMEMMAP is configed, kernel will use vmemmap to do the __pfn_to_page/page_to_pfn, and kernel will not use the "classic sparse" to do the __pfn_to_page/page_to_pfn. So export the vmemmap when CONFIG_SPARSEMEM_VMEMMAP is configed. This makes the user applications (crash, etc) get faster pfn_to_page/page_to_pfn operations too. Link: https://lkml.kernel.org/r/20240227014952.3184-1-shijie@os.amperecomputing.com Signed-off-by: Huang Shijie Acked-by: Baoquan He Cc: Dave Young Cc: Kazuhito Hagio Cc: Lianbo Jiang Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/vmcore_info.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index 8f77e238a54f..f95516cd45bb 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -152,6 +152,9 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(mem_map); VMCOREINFO_SYMBOL(contig_page_data); #endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP + VMCOREINFO_SYMBOL_ARRAY(vmemmap); +#endif #ifdef CONFIG_SPARSEMEM VMCOREINFO_SYMBOL_ARRAY(mem_section); VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); From 435a75548109f19e5b5b14ae35b9acb063c084e9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 28 Feb 2024 16:42:36 +0000 Subject: [PATCH 380/435] mm: use folio more widely in __split_huge_page We already have a folio; use it instead of the head page where reasonable. Saves a couple of calls to compound_head() and elimimnates a few references to page->mapping. Link: https://lkml.kernel.org/r/20240228164326.1355045-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/huge_memory.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fd745bcc97ff..a81a09236c16 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2919,7 +2919,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, if (head[i].index >= end) { struct folio *tail = page_folio(head + i); - if (shmem_mapping(head->mapping)) + if (shmem_mapping(folio->mapping)) nr_dropped++; else if (folio_test_clear_dirty(tail)) folio_account_cleaned(tail, @@ -2927,7 +2927,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, __filemap_remove_folio(tail, NULL); folio_put(tail); } else if (!PageAnon(page)) { - __xa_store(&head->mapping->i_pages, head[i].index, + __xa_store(&folio->mapping->i_pages, head[i].index, head + i, 0); } else if (swap_cache) { __xa_store(&swap_cache->i_pages, offset + i, @@ -2948,23 +2948,23 @@ static void __split_huge_page(struct page *page, struct list_head *list, split_page_owner(head, order, new_order); /* See comment in __split_huge_page_tail() */ - if (PageAnon(head)) { + if (folio_test_anon(folio)) { /* Additional pin to swap cache */ - if (PageSwapCache(head)) { - page_ref_add(head, 1 + new_nr); + if (folio_test_swapcache(folio)) { + folio_ref_add(folio, 1 + new_nr); xa_unlock(&swap_cache->i_pages); } else { - page_ref_inc(head); + folio_ref_inc(folio); } } else { /* Additional pin to page cache */ - page_ref_add(head, 1 + new_nr); - xa_unlock(&head->mapping->i_pages); + folio_ref_add(folio, 1 + new_nr); + xa_unlock(&folio->mapping->i_pages); } local_irq_enable(); if (nr_dropped) - shmem_uncharge(head->mapping->host, nr_dropped); + shmem_uncharge(folio->mapping->host, nr_dropped); remap_page(folio, nr); if (folio_test_swapcache(folio)) @@ -2980,9 +2980,10 @@ static void __split_huge_page(struct page *page, struct list_head *list, for (i = 0; i < nr; i += new_nr) { struct page *subpage = head + i; + struct folio *new_folio = page_folio(subpage); if (subpage == page) continue; - unlock_page(subpage); + folio_unlock(new_folio); /* * Subpages may be freed if there wasn't any mapping From c8b36003121834cb77fcaf8a1ce0a454d7a97891 Mon Sep 17 00:00:00 2001 From: Richard Chang Date: Wed, 28 Feb 2024 05:11:17 +0000 Subject: [PATCH 381/435] mm: add alloc_contig_migrate_range allocation statistics alloc_contig_migrate_range has every information to be able to understand big contiguous allocation latency. For example, how many pages are migrated, how many times they were needed to unmap from page tables. This patch adds the trace event to collect the allocation statistics. In the field, it was quite useful to understand CMA allocation latency. [akpm@linux-foundation.org: a/trace_mm_alloc_config_migrate_range_info_enabled/trace_mm_alloc_contig_migrate_range_info_enabled] Link: https://lkml.kernel.org/r/20240228051127.2859472-1-richardycc@google.com Signed-off-by: Richard Chang Reviewed-by: Steven Rostedt (Google) Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Minchan Kim Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/trace/events/kmem.h | 38 +++++++++++++++++++++++++++++++++++++ mm/internal.h | 3 ++- mm/page_alloc.c | 32 ++++++++++++++++++++++++++----- mm/page_isolation.c | 2 +- 4 files changed, 68 insertions(+), 7 deletions(-) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 58688768ef0f..6e62cc64cd92 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -304,6 +304,44 @@ TRACE_EVENT(mm_page_alloc_extfrag, __entry->change_ownership) ); +TRACE_EVENT(mm_alloc_contig_migrate_range_info, + + TP_PROTO(unsigned long start, + unsigned long end, + unsigned long nr_migrated, + unsigned long nr_reclaimed, + unsigned long nr_mapped, + int migratetype), + + TP_ARGS(start, end, nr_migrated, nr_reclaimed, nr_mapped, migratetype), + + TP_STRUCT__entry( + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned long, nr_migrated) + __field(unsigned long, nr_reclaimed) + __field(unsigned long, nr_mapped) + __field(int, migratetype) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->nr_migrated = nr_migrated; + __entry->nr_reclaimed = nr_reclaimed; + __entry->nr_mapped = nr_mapped; + __entry->migratetype = migratetype; + ), + + TP_printk("start=0x%lx end=0x%lx migratetype=%d nr_migrated=%lu nr_reclaimed=%lu nr_mapped=%lu", + __entry->start, + __entry->end, + __entry->migratetype, + __entry->nr_migrated, + __entry->nr_reclaimed, + __entry->nr_mapped) +); + /* * Required for uniquely and securely identifying mm in rss_stat tracepoint. */ diff --git a/mm/internal.h b/mm/internal.h index b1d806125e8d..5be91e2f9943 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -539,7 +539,8 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, + int migratetype); /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void init_cma_reserved_pageblock(struct page *page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7873e9375802..29c982542dcc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6222,9 +6222,14 @@ static void alloc_contig_dump_pages(struct list_head *page_list) } } -/* [start, end) must belong to a single zone. */ +/* + * [start, end) must belong to a single zone. + * @migratetype: using migratetype to filter the type of migration in + * trace_mm_alloc_contig_migrate_range_info. + */ int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + int migratetype) { /* This function is based on compact_zone() from compaction.c. */ unsigned int nr_reclaimed; @@ -6235,6 +6240,10 @@ int __alloc_contig_migrate_range(struct compact_control *cc, .nid = zone_to_nid(cc->zone), .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; + struct page *page; + unsigned long total_mapped = 0; + unsigned long total_migrated = 0; + unsigned long total_reclaimed = 0; lru_cache_disable(); @@ -6260,9 +6269,18 @@ int __alloc_contig_migrate_range(struct compact_control *cc, &cc->migratepages); cc->nr_migratepages -= nr_reclaimed; + if (trace_mm_alloc_contig_migrate_range_info_enabled()) { + total_reclaimed += nr_reclaimed; + list_for_each_entry(page, &cc->migratepages, lru) + total_mapped += page_mapcount(page); + } + ret = migrate_pages(&cc->migratepages, alloc_migration_target, NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); + if (trace_mm_alloc_contig_migrate_range_info_enabled() && !ret) + total_migrated += cc->nr_migratepages; + /* * On -ENOMEM, migrate_pages() bails out right away. It is pointless * to retry again over this error, so do the same here. @@ -6276,9 +6294,13 @@ int __alloc_contig_migrate_range(struct compact_control *cc, if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY) alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); - return ret; } - return 0; + + trace_mm_alloc_contig_migrate_range_info(start, end, migratetype, + total_migrated, + total_reclaimed, + total_mapped); + return (ret < 0) ? ret : 0; } /** @@ -6358,7 +6380,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, * allocated. So, if we fall through be sure to clear ret so that * -EBUSY is not accidentally used or returned to caller. */ - ret = __alloc_contig_migrate_range(&cc, start, end); + ret = __alloc_contig_migrate_range(&cc, start, end, migratetype); if (ret && ret != -EBUSY) goto done; ret = 0; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index cd0ea3668253..a5c8fa4c2a75 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -434,7 +434,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, } ret = __alloc_contig_migrate_range(&cc, head_pfn, - head_pfn + nr_pages); + head_pfn + nr_pages, page_mt); /* * restore the page's migratetype so that it can From 6c1b748ebf27befffec83b77ca1960bf70ed6ac9 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 27 Feb 2024 19:41:51 -0800 Subject: [PATCH 382/435] mm/memory.c: do_numa_page(): remove a redundant page table read do_numa_page() is reading from the same page table entry, twice, while holding the page table lock: once while checking that the pte hasn't changed, and again in order to modify the pte. Instead, just read the pte once, and save it in the same old_pte variable that already exists. This has no effect on behavior, other than to provide a tiny potential improvement to performance, by avoiding the redundant memory read (which the compiler cannot elide, due to READ_ONCE()). Also improve the associated comments nearby. Link: https://lkml.kernel.org/r/20240228034151.459370-1-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: David Hildenbrand Reviewed-by: Ryan Roberts Signed-off-by: Andrew Morton --- mm/memory.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 75e0abda8642..230bcd12a336 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5127,18 +5127,18 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) int flags = 0; /* - * The "pte" at this point cannot be used safely without - * validation through pte_unmap_same(). It's of NUMA type but - * the pfn may be screwed if the read is non atomic. + * The pte cannot be used safely until we verify, while holding the page + * table lock, that its contents have not changed during fault handling. */ spin_lock(vmf->ptl); - if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { + /* Read the live PTE from the page tables: */ + old_pte = ptep_get(vmf->pte); + + if (unlikely(!pte_same(old_pte, vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } - /* Get the normal PTE */ - old_pte = ptep_get(vmf->pte); pte = pte_modify(old_pte, vma->vm_page_prot); /* From 26e93839d6d9d9c6169fa7559b8d1577e42d4ace Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Wed, 28 Feb 2024 02:38:54 +0000 Subject: [PATCH 383/435] mm/zsmalloc: don't need to reserve LSB in handle We will save allocated tag in the object header to indicate that it's allocated. handle |= OBJ_ALLOCATED_TAG; So the object header needs to reserve LSB for this tag bit. But the handle itself doesn't need to reserve LSB to save tag, since it's only used to find the position of object, by (pfn + obj_idx). So remove LSB reserve from handle, one more bit can be used as obj_idx. Link: https://lkml.kernel.org/r/20240228023854.3511239-1-chengming.zhou@linux.dev Signed-off-by: Chengming Zhou Reviewed-by: Sergey Senozhatsky Cc: Johannes Weiner Cc: Minchan Kim Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 63ec385cd670..7d7cb3eaabe0 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -110,7 +110,7 @@ #define OBJ_TAG_BITS 1 #define OBJ_TAG_MASK OBJ_ALLOCATED_TAG -#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) +#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) #define HUGE_BITS 1 @@ -737,14 +737,12 @@ static struct page *get_next_page(struct page *page) static void obj_to_location(unsigned long obj, struct page **page, unsigned int *obj_idx) { - obj >>= OBJ_TAG_BITS; *page = pfn_to_page(obj >> OBJ_INDEX_BITS); *obj_idx = (obj & OBJ_INDEX_MASK); } static void obj_to_page(unsigned long obj, struct page **page) { - obj >>= OBJ_TAG_BITS; *page = pfn_to_page(obj >> OBJ_INDEX_BITS); } @@ -759,7 +757,6 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) obj = page_to_pfn(page) << OBJ_INDEX_BITS; obj |= obj_idx & OBJ_INDEX_MASK; - obj <<= OBJ_TAG_BITS; return obj; } From fc37bbb3289f61e23e3f866eeeb6c865ee4d3088 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Thu, 22 Feb 2024 22:04:14 +0800 Subject: [PATCH 384/435] hugetlb: code clean for hugetlb_hstate_alloc_pages Patch series "hugetlb: parallelize hugetlb page init on boot", v6. Introduction ------------ Hugetlb initialization during boot takes up a considerable amount of time. For instance, on a 2TB system, initializing 1,800 1GB huge pages takes 1-2 seconds out of 10 seconds. Initializing 11,776 1GB pages on a 12TB Intel host takes more than 1 minute[1]. This is a noteworthy figure. Inspired by [2] and [3], hugetlb initialization can also be accelerated through parallelization. Kernel already has infrastructure like padata_do_multithreaded, this patch uses it to achieve effective results by minimal modifications. [1] https://lore.kernel.org/all/783f8bac-55b8-5b95-eb6a-11a583675000@google.com/ [2] https://lore.kernel.org/all/20200527173608.2885243-1-daniel.m.jordan@oracle.com/ [3] https://lore.kernel.org/all/20230906112605.2286994-1-usama.arif@bytedance.com/ [4] https://lore.kernel.org/all/76becfc1-e609-e3e8-2966-4053143170b6@google.com/ max_threads ----------- This patch use `padata_do_multithreaded` like this: ``` job.max_threads = num_node_state(N_MEMORY) * multiplier; padata_do_multithreaded(&job); ``` To fully utilize the CPU, the number of parallel threads needs to be carefully considered. `max_threads = num_node_state(N_MEMORY)` does not fully utilize the CPU, so we need to multiply it by a multiplier. Tests below indicate that a multiplier of 2 significantly improves performance, and although larger values also provide improvements, the gains are marginal. multiplier 1 2 3 4 5 ------------ ------- ------- ------- ------- ------- 256G 2node 358ms 215ms 157ms 134ms 126ms 2T 4node 979ms 679ms 543ms 489ms 481ms 50G 2node 71ms 44ms 37ms 30ms 31ms Therefore, choosing 2 as the multiplier strikes a good balance between enhancing parallel processing capabilities and maintaining efficient resource management. Test result ----------- test case no patch(ms) patched(ms) saved ------------------- -------------- ------------- -------- 256c2T(4 node) 1G 4745 2024 57.34% 128c1T(2 node) 1G 3358 1712 49.02% 12T 1G 77000 18300 76.23% 256c2T(4 node) 2M 3336 1051 68.52% 128c1T(2 node) 2M 1943 716 63.15% This patch (of 8): The readability of `hugetlb_hstate_alloc_pages` is poor. By cleaning the code, its readability can be improved, facilitating future modifications. This patch extracts two functions to reduce the complexity of `hugetlb_hstate_alloc_pages` and has no functional changes. - hugetlb_hstate_alloc_pages_node_specific() to handle iterates through each online node and performs allocation if necessary. - hugetlb_hstate_alloc_pages_report() report error during allocation. And the value of h->max_huge_pages is updated accordingly. Link: https://lkml.kernel.org/r/20240222140422.393911-1-gang.li@linux.dev Link: https://lkml.kernel.org/r/20240222140422.393911-2-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Muchun Song Reviewed-by: Tim Chen Cc: Daniel Jordan Cc: David Hildenbrand Cc: Jane Chu Cc: Paul E. McKenney Cc: Randy Dunlap Cc: Steffen Klassert Cc: Alexey Dobriyan Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c3d68671cbae..1ee4d11e09d8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3482,6 +3482,33 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) h->max_huge_pages_node[nid] = i; } +static bool __init hugetlb_hstate_alloc_pages_specific_nodes(struct hstate *h) +{ + int i; + bool node_specific_alloc = false; + + for_each_online_node(i) { + if (h->max_huge_pages_node[i] > 0) { + hugetlb_hstate_alloc_pages_onenode(h, i); + node_specific_alloc = true; + } + } + + return node_specific_alloc; +} + +static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, struct hstate *h) +{ + if (allocated < h->max_huge_pages) { + char buf[32]; + + string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); + pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", + h->max_huge_pages, buf, allocated); + h->max_huge_pages = allocated; + } +} + /* * NOTE: this routine is called in different contexts for gigantic and * non-gigantic pages. @@ -3499,7 +3526,6 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) struct folio *folio; LIST_HEAD(folio_list); nodemask_t *node_alloc_noretry; - bool node_specific_alloc = false; /* skip gigantic hugepages allocation if hugetlb_cma enabled */ if (hstate_is_gigantic(h) && hugetlb_cma_size) { @@ -3508,14 +3534,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) } /* do node specific alloc */ - for_each_online_node(i) { - if (h->max_huge_pages_node[i] > 0) { - hugetlb_hstate_alloc_pages_onenode(h, i); - node_specific_alloc = true; - } - } - - if (node_specific_alloc) + if (hugetlb_hstate_alloc_pages_specific_nodes(h)) return; /* below will do all node balanced alloc */ @@ -3558,14 +3577,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) /* list will be empty if hstate_is_gigantic */ prep_and_add_allocated_folios(h, &folio_list); - if (i < h->max_huge_pages) { - char buf[32]; - - string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); - pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", - h->max_huge_pages, buf, i); - h->max_huge_pages = i; - } + hugetlb_hstate_alloc_pages_errcheck(i, h); kfree(node_alloc_noretry); } From d5c3eb3f5026aece935afca3cbe8f9fea113844c Mon Sep 17 00:00:00 2001 From: Gang Li Date: Thu, 22 Feb 2024 22:04:15 +0800 Subject: [PATCH 385/435] hugetlb: split hugetlb_hstate_alloc_pages 1G and 2M huge pages have different allocation and initialization logic, which leads to subtle differences in parallelization. Therefore, it is appropriate to split hugetlb_hstate_alloc_pages into gigantic and non-gigantic. This patch has no functional changes. Link: https://lkml.kernel.org/r/20240222140422.393911-3-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Tim Chen Reviewed-by: Muchun Song Cc: Alexey Dobriyan Cc: Daniel Jordan Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Paul E. McKenney Cc: Randy Dunlap Cc: Steffen Klassert Signed-off-by: Andrew Morton --- mm/hugetlb.c | 87 ++++++++++++++++++++++++++-------------------------- 1 file changed, 43 insertions(+), 44 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1ee4d11e09d8..c1de29f2b5a2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3509,6 +3509,43 @@ static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, } } +static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h) +{ + unsigned long i; + + for (i = 0; i < h->max_huge_pages; ++i) { + if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) + break; + cond_resched(); + } + + return i; +} + +static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) +{ + unsigned long i; + struct folio *folio; + LIST_HEAD(folio_list); + nodemask_t node_alloc_noretry; + + /* Bit mask controlling how hard we retry per-node allocations.*/ + nodes_clear(node_alloc_noretry); + + for (i = 0; i < h->max_huge_pages; ++i) { + folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], + &node_alloc_noretry); + if (!folio) + break; + list_add(&folio->lru, &folio_list); + cond_resched(); + } + + prep_and_add_allocated_folios(h, &folio_list); + + return i; +} + /* * NOTE: this routine is called in different contexts for gigantic and * non-gigantic pages. @@ -3522,10 +3559,7 @@ static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, */ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { - unsigned long i; - struct folio *folio; - LIST_HEAD(folio_list); - nodemask_t *node_alloc_noretry; + unsigned long allocated; /* skip gigantic hugepages allocation if hugetlb_cma enabled */ if (hstate_is_gigantic(h) && hugetlb_cma_size) { @@ -3538,47 +3572,12 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) return; /* below will do all node balanced alloc */ - if (!hstate_is_gigantic(h)) { - /* - * Bit mask controlling how hard we retry per-node allocations. - * Ignore errors as lower level routines can deal with - * node_alloc_noretry == NULL. If this kmalloc fails at boot - * time, we are likely in bigger trouble. - */ - node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), - GFP_KERNEL); - } else { - /* allocations done at boot time */ - node_alloc_noretry = NULL; - } + if (hstate_is_gigantic(h)) + allocated = hugetlb_gigantic_pages_alloc_boot(h); + else + allocated = hugetlb_pages_alloc_boot(h); - /* bit mask controlling how hard we retry per-node allocations */ - if (node_alloc_noretry) - nodes_clear(*node_alloc_noretry); - - for (i = 0; i < h->max_huge_pages; ++i) { - if (hstate_is_gigantic(h)) { - /* - * gigantic pages not added to list as they are not - * added to pools now. - */ - if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) - break; - } else { - folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], - node_alloc_noretry); - if (!folio) - break; - list_add(&folio->lru, &folio_list); - } - cond_resched(); - } - - /* list will be empty if hstate_is_gigantic */ - prep_and_add_allocated_folios(h, &folio_list); - - hugetlb_hstate_alloc_pages_errcheck(i, h); - kfree(node_alloc_noretry); + hugetlb_hstate_alloc_pages_errcheck(allocated, h); } static void __init hugetlb_init_hstates(void) From 2e73ff236e09bd2c91c91fd5c84804503b7ea90a Mon Sep 17 00:00:00 2001 From: Gang Li Date: Thu, 22 Feb 2024 22:04:16 +0800 Subject: [PATCH 386/435] hugetlb: pass *next_nid_to_alloc directly to for_each_node_mask_to_alloc With parallelization of hugetlb allocation across different threads, each thread works on a differnet node to allocate pages from, instead of all allocating from a common node h->next_nid_to_alloc. To address this, it's necessary to assign a separate next_nid_to_alloc for each thread. Consequently, the hstate_next_node_to_alloc and for_each_node_mask_to_alloc have been modified to directly accept a *next_nid_to_alloc parameter, ensuring thread-specific allocation and avoiding concurrent access issues. Link: https://lkml.kernel.org/r/20240222140422.393911-4-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Tim Chen Reviewed-by: Muchun Song Cc: Alexey Dobriyan Cc: Daniel Jordan Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Paul E. McKenney Cc: Randy Dunlap Cc: Steffen Klassert Signed-off-by: Andrew Morton --- mm/hugetlb.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c1de29f2b5a2..1c8274f8650e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1464,15 +1464,15 @@ static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) * next node from which to allocate, handling wrap at end of node * mask. */ -static int hstate_next_node_to_alloc(struct hstate *h, +static int hstate_next_node_to_alloc(int *next_node, nodemask_t *nodes_allowed) { int nid; VM_BUG_ON(!nodes_allowed); - nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); - h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); + nid = get_valid_node_allowed(*next_node, nodes_allowed); + *next_node = next_node_allowed(nid, nodes_allowed); return nid; } @@ -1495,10 +1495,10 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) return nid; } -#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ +#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask) \ for (nr_nodes = nodes_weight(*mask); \ nr_nodes > 0 && \ - ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ + ((node = hstate_next_node_to_alloc(next_node, mask)) || 1); \ nr_nodes--) #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ @@ -2350,12 +2350,13 @@ static void prep_and_add_allocated_folios(struct hstate *h, */ static struct folio *alloc_pool_huge_folio(struct hstate *h, nodemask_t *nodes_allowed, - nodemask_t *node_alloc_noretry) + nodemask_t *node_alloc_noretry, + int *next_node) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nr_nodes, node; - for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + for_each_node_mask_to_alloc(next_node, nr_nodes, node, nodes_allowed) { struct folio *folio; folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node, @@ -3310,7 +3311,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) goto found; } /* allocate from next node when distributing huge pages */ - for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { + for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_MEMORY]) { m = memblock_alloc_try_nid_raw( huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); @@ -3679,7 +3680,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) { - for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, nodes_allowed) { if (h->surplus_huge_pages_node[node]) goto found; } @@ -3794,7 +3795,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, cond_resched(); folio = alloc_pool_huge_folio(h, nodes_allowed, - node_alloc_noretry); + node_alloc_noretry, + &h->next_nid_to_alloc); if (!folio) { prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); From eb52286634f042432ec775077a73334603a1c6e4 Mon Sep 17 00:00:00 2001 From: "Gang Li Subject: padata: dispatch works on" Date: Wed, 6 Mar 2024 13:04:17 -0800 Subject: [PATCH 387/435] Author: Gang Li padata: dispatch works on different nodes Date: Thu, 22 Feb 2024 22:04:17 +0800 When a group of tasks that access different nodes are scheduled on the same node, they may encounter bandwidth bottlenecks and access latency. Thus, numa_aware flag is introduced here, allowing tasks to be distributed across different nodes to fully utilize the advantage of multi-node systems. Link: https://lkml.kernel.org/r/20240222140422.393911-5-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Muchun Song Reviewed-by: Tim Chen Cc: Alexey Dobriyan Cc: Daniel Jordan Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Paul E. McKenney Cc: Randy Dunlap Cc: Steffen Klassert Signed-off-by: Andrew Morton --- include/linux/padata.h | 2 ++ kernel/padata.c | 14 ++++++++++++-- mm/mm_init.c | 1 + 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/linux/padata.h b/include/linux/padata.h index 495b16b6b4d7..8f418711351b 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -137,6 +137,7 @@ struct padata_shell { * appropriate for one worker thread to do at once. * @max_threads: Max threads to use for the job, actual number may be less * depending on task size and minimum chunk size. + * @numa_aware: Distribute jobs to different nodes with CPU in a round robin fashion. */ struct padata_mt_job { void (*thread_fn)(unsigned long start, unsigned long end, void *arg); @@ -146,6 +147,7 @@ struct padata_mt_job { unsigned long align; unsigned long min_chunk; int max_threads; + bool numa_aware; }; /** diff --git a/kernel/padata.c b/kernel/padata.c index 179fb1518070..e3f639ff1670 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -485,7 +485,8 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) struct padata_work my_work, *pw; struct padata_mt_job_state ps; LIST_HEAD(works); - int nworks; + int nworks, nid; + static atomic_t last_used_nid __initdata; if (job->size == 0) return; @@ -517,7 +518,16 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) ps.chunk_size = roundup(ps.chunk_size, job->align); list_for_each_entry(pw, &works, pw_list) - queue_work(system_unbound_wq, &pw->pw_work); + if (job->numa_aware) { + int old_node = atomic_read(&last_used_nid); + + do { + nid = next_node_in(old_node, node_states[N_CPU]); + } while (!atomic_try_cmpxchg(&last_used_nid, &old_node, nid)); + queue_work_node(nid, system_unbound_wq, &pw->pw_work); + } else { + queue_work(system_unbound_wq, &pw->pw_work); + } /* Use the current thread, which saves starting a workqueue worker. */ padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK); diff --git a/mm/mm_init.c b/mm/mm_init.c index 2c19f5515e36..549e76af8f82 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2231,6 +2231,7 @@ static int __init deferred_init_memmap(void *data) .align = PAGES_PER_SECTION, .min_chunk = PAGES_PER_SECTION, .max_threads = max_threads, + .numa_aware = false, }; padata_do_multithreaded(&job); From bd5ed02e23958cb56d0f5a90ebe620c8b47dab47 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Thu, 22 Feb 2024 22:04:18 +0800 Subject: [PATCH 388/435] padata: downgrade padata_do_multithreaded to serial execution for non-SMP hugetlb parallelization depends on PADATA, and PADATA depends on SMP. PADATA consists of two distinct functionality: One part is padata_do_multithreaded which disregards order and simply divides tasks into several groups for parallel execution. Hugetlb init parallelization depends on padata_do_multithreaded. The other part is composed of a set of APIs that, while handling data in an out-of-order parallel manner, can eventually return the data with ordered sequence. Currently Only `crypto/pcrypt.c` use them. All users of PADATA of non-SMP case currently only use padata_do_multithreaded. It is easy to implement a serial one in include/linux/padata.h. And it is not necessary to implement another functionality unless the only user of crypto/pcrypt.c does not depend on SMP in the future. Link: https://lkml.kernel.org/r/20240222140422.393911-6-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: Paul E. McKenney Acked-by: Daniel Jordan Cc: David Hildenbrand Cc: David Rientjes Cc: Jane Chu Cc: Muchun Song Cc: Randy Dunlap Cc: Steffen Klassert Cc: Tim Chen Cc: Alexey Dobriyan Cc: Mike Kravetz Signed-off-by: Andrew Morton --- include/linux/padata.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/linux/padata.h b/include/linux/padata.h index 8f418711351b..0146daf34430 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -180,10 +180,6 @@ struct padata_instance { #ifdef CONFIG_PADATA extern void __init padata_init(void); -#else -static inline void __init padata_init(void) {} -#endif - extern struct padata_instance *padata_alloc(const char *name); extern void padata_free(struct padata_instance *pinst); extern struct padata_shell *padata_alloc_shell(struct padata_instance *pinst); @@ -194,4 +190,12 @@ extern void padata_do_serial(struct padata_priv *padata); extern void __init padata_do_multithreaded(struct padata_mt_job *job); extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, cpumask_var_t cpumask); +#else +static inline void __init padata_init(void) {} +static inline void __init padata_do_multithreaded(struct padata_mt_job *job) +{ + job->thread_fn(job->start, job->start + job->size, job->fn_arg); +} +#endif + #endif From 26d1dc6bb2305cd64dc0cd18e5773cbbcec41bb1 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Thu, 22 Feb 2024 22:04:19 +0800 Subject: [PATCH 389/435] hugetlb: have CONFIG_HUGETLBFS select CONFIG_PADATA Allow hugetlb use padata_do_multithreaded for parallel initialization. Select CONFIG_PADATA in this case. Link: https://lkml.kernel.org/r/20240222140422.393911-7-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Muchun Song Tested-by: Paul E. McKenney Acked-by: Daniel Jordan Cc: Alexey Dobriyan Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Randy Dunlap Cc: Steffen Klassert Cc: Tim Chen Signed-off-by: Andrew Morton --- fs/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/Kconfig b/fs/Kconfig index 713c8655fa0a..7963939592d7 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -261,6 +261,7 @@ menuconfig HUGETLBFS depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN depends on (SYSFS || SYSCTL) select MEMFD_CREATE + select PADATA if SMP help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read From c6c21c31d0d8b200d448b5edc1e2d47991b3df5a Mon Sep 17 00:00:00 2001 From: Gang Li Date: Thu, 22 Feb 2024 22:04:20 +0800 Subject: [PATCH 390/435] hugetlb: parallelize 2M hugetlb allocation and initialization By distributing both the allocation and the initialization tasks across multiple threads, the initialization of 2M hugetlb will be faster, thereby improving the boot speed. Here are some test results: test case no patch(ms) patched(ms) saved ------------------- -------------- ------------- -------- 256c2T(4 node) 2M 3336 1051 68.52% 128c1T(2 node) 2M 1943 716 63.15% Link: https://lkml.kernel.org/r/20240222140422.393911-8-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Muchun Song Cc: Alexey Dobriyan Cc: Daniel Jordan Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Paul E. McKenney Cc: Randy Dunlap Cc: Steffen Klassert Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/hugetlb.c | 73 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 17 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1c8274f8650e..9934ed8aad68 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -3510,6 +3511,30 @@ static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, } } +static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned long end, void *arg) +{ + struct hstate *h = (struct hstate *)arg; + int i, num = end - start; + nodemask_t node_alloc_noretry; + LIST_HEAD(folio_list); + int next_node = first_online_node; + + /* Bit mask controlling how hard we retry per-node allocations.*/ + nodes_clear(node_alloc_noretry); + + for (i = 0; i < num; ++i) { + struct folio *folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], + &node_alloc_noretry, &next_node); + if (!folio) + break; + + list_move(&folio->lru, &folio_list); + cond_resched(); + } + + prep_and_add_allocated_folios(h, &folio_list); +} + static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h) { unsigned long i; @@ -3525,26 +3550,40 @@ static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h) static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) { - unsigned long i; - struct folio *folio; - LIST_HEAD(folio_list); - nodemask_t node_alloc_noretry; + struct padata_mt_job job = { + .fn_arg = h, + .align = 1, + .numa_aware = true + }; - /* Bit mask controlling how hard we retry per-node allocations.*/ - nodes_clear(node_alloc_noretry); + job.thread_fn = hugetlb_pages_alloc_boot_node; + job.start = 0; + job.size = h->max_huge_pages; - for (i = 0; i < h->max_huge_pages; ++i) { - folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], - &node_alloc_noretry); - if (!folio) - break; - list_add(&folio->lru, &folio_list); - cond_resched(); - } + /* + * job.max_threads is twice the num_node_state(N_MEMORY), + * + * Tests below indicate that a multiplier of 2 significantly improves + * performance, and although larger values also provide improvements, + * the gains are marginal. + * + * Therefore, choosing 2 as the multiplier strikes a good balance between + * enhancing parallel processing capabilities and maintaining efficient + * resource management. + * + * +------------+-------+-------+-------+-------+-------+ + * | multiplier | 1 | 2 | 3 | 4 | 5 | + * +------------+-------+-------+-------+-------+-------+ + * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms | + * | 2T 4node | 979ms | 679ms | 543ms | 489ms | 481ms | + * | 50G 2node | 71ms | 44ms | 37ms | 30ms | 31ms | + * +------------+-------+-------+-------+-------+-------+ + */ + job.max_threads = num_node_state(N_MEMORY) * 2; + job.min_chunk = h->max_huge_pages / num_node_state(N_MEMORY) / 2; + padata_do_multithreaded(&job); - prep_and_add_allocated_folios(h, &folio_list); - - return i; + return h->nr_huge_pages; } /* From b78b27d02930f6f0262353080d0f784ce7aa377e Mon Sep 17 00:00:00 2001 From: Gang Li Date: Thu, 22 Feb 2024 22:04:21 +0800 Subject: [PATCH 391/435] hugetlb: parallelize 1G hugetlb initialization Optimizing the initialization speed of 1G huge pages through parallelization. 1G hugetlbs are allocated from bootmem, a process that is already very fast and does not currently require optimization. Therefore, we focus on parallelizing only the initialization phase in `gather_bootmem_prealloc`. Here are some test results: test case no patch(ms) patched(ms) saved ------------------- -------------- ------------- -------- 256c2T(4 node) 1G 4745 2024 57.34% 128c1T(2 node) 1G 3358 1712 49.02% 12T 1G 77000 18300 76.23% [akpm@linux-foundation.org: s/initialied/initialized/, per Alexey] Link: https://lkml.kernel.org/r/20240222140422.393911-9-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Muchun Song Cc: Alexey Dobriyan Cc: Daniel Jordan Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Paul E. McKenney Cc: Randy Dunlap Cc: Steffen Klassert Cc: Tim Chen Signed-off-by: Andrew Morton --- arch/powerpc/mm/hugetlbpage.c | 2 +- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 51 +++++++++++++++++++++++++++++------ 3 files changed, 45 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 16557d008eef..594a4b7b2ca2 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -226,7 +226,7 @@ static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate) return 0; m = phys_to_virt(gpage_freearray[--nr_gpages]); gpage_freearray[nr_gpages] = 0; - list_add(&m->list, &huge_boot_pages); + list_add(&m->list, &huge_boot_pages[0]); m->hstate = hstate; return 1; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c1ee640d87b1..77b30a8c6076 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -178,7 +178,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage); extern int sysctl_hugetlb_shm_group; -extern struct list_head huge_boot_pages; +extern struct list_head huge_boot_pages[MAX_NUMNODES]; /* arch callbacks */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9934ed8aad68..418d66953224 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -69,7 +69,7 @@ static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) #endif static unsigned long hugetlb_cma_size __initdata; -__initdata LIST_HEAD(huge_boot_pages); +__initdata struct list_head huge_boot_pages[MAX_NUMNODES]; /* for command line parsing */ static struct hstate * __initdata parsed_hstate; @@ -3301,7 +3301,7 @@ int alloc_bootmem_huge_page(struct hstate *h, int nid) int __alloc_bootmem_huge_page(struct hstate *h, int nid) { struct huge_bootmem_page *m = NULL; /* initialize for clang */ - int nr_nodes, node; + int nr_nodes, node = nid; /* do node specific alloc */ if (nid != NUMA_NO_NODE) { @@ -3339,7 +3339,7 @@ found: huge_page_size(h) - PAGE_SIZE); /* Put them into a private list first because mem_map is not up yet */ INIT_LIST_HEAD(&m->list); - list_add(&m->list, &huge_boot_pages); + list_add(&m->list, &huge_boot_pages[node]); m->hstate = h; return 1; } @@ -3390,8 +3390,6 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, /* Send list for bulk vmemmap optimization processing */ hugetlb_vmemmap_optimize_folios(h, folio_list); - /* Add all new pool pages to free lists in one lock cycle */ - spin_lock_irqsave(&hugetlb_lock, flags); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { if (!folio_test_hugetlb_vmemmap_optimized(folio)) { /* @@ -3404,23 +3402,25 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); } + /* Subdivide locks to achieve better parallel performance */ + spin_lock_irqsave(&hugetlb_lock, flags); __prep_account_new_huge_page(h, folio_nid(folio)); enqueue_hugetlb_folio(h, folio); + spin_unlock_irqrestore(&hugetlb_lock, flags); } - spin_unlock_irqrestore(&hugetlb_lock, flags); } /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages. */ -static void __init gather_bootmem_prealloc(void) +static void __init gather_bootmem_prealloc_node(unsigned long nid) { LIST_HEAD(folio_list); struct huge_bootmem_page *m; struct hstate *h = NULL, *prev_h = NULL; - list_for_each_entry(m, &huge_boot_pages, list) { + list_for_each_entry(m, &huge_boot_pages[nid], list) { struct page *page = virt_to_page(m); struct folio *folio = (void *)page; @@ -3453,6 +3453,31 @@ static void __init gather_bootmem_prealloc(void) prep_and_add_bootmem_folios(h, &folio_list); } +static void __init gather_bootmem_prealloc_parallel(unsigned long start, + unsigned long end, void *arg) +{ + int nid; + + for (nid = start; nid < end; nid++) + gather_bootmem_prealloc_node(nid); +} + +static void __init gather_bootmem_prealloc(void) +{ + struct padata_mt_job job = { + .thread_fn = gather_bootmem_prealloc_parallel, + .fn_arg = NULL, + .start = 0, + .size = num_node_state(N_MEMORY), + .align = 1, + .min_chunk = 1, + .max_threads = num_node_state(N_MEMORY), + .numa_aware = true, + }; + + padata_do_multithreaded(&job); +} + static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) { unsigned long i; @@ -3600,6 +3625,7 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long allocated; + static bool initialized __initdata; /* skip gigantic hugepages allocation if hugetlb_cma enabled */ if (hstate_is_gigantic(h) && hugetlb_cma_size) { @@ -3607,6 +3633,15 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) return; } + /* hugetlb_hstate_alloc_pages will be called many times, initialize huge_boot_pages once */ + if (!initialized) { + int i = 0; + + for (i = 0; i < MAX_NUMNODES; i++) + INIT_LIST_HEAD(&huge_boot_pages[i]); + initialized = true; + } + /* do node specific alloc */ if (hugetlb_hstate_alloc_pages_specific_nodes(h)) return; From dfbac6dc68bae989bd68a56947dcca16c5574fda Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 19:23:28 +0000 Subject: [PATCH 392/435] mm: separate out FOLIO_FLAGS from PAGEFLAGS Patch series "PageFlags cleanups". We have now successfully removed all of the uses of some of the PageFlags from the kernel, but there's nothing to stop somebody reintroducing them. By splitting out FOLIO_FLAGS from PAGEFLAGS, we can stop defining the old flags; and we do that in some of the later patches. After doing this, I realised that dump_page() was living dangerously; we could end up calling folio_test_foo() on a pointer which no longer pointed to a folio (as dump_page() is not necessarily called when the caller has a reference to the page). So I fixed that up. And then I realised that this was the key to making dump_page() take a const argument, which means we can constify the page flags testing, which means we can remove more cast-away-the-const bad code. And here's where I ended up. This patch (of 8): We've progressed far enough with the folio transition that some flags are now no longer checked on pages, but only on folios. To prevent new users appearing, prepare to only define the folio versions of the flag test/set/clear. Link: https://lkml.kernel.org/r/20240227192337.757313-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240227192337.757313-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 63 ++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 735cddc13d20..95ab75d0b39c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -367,54 +367,77 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) #define FOLIO_PF_NO_COMPOUND 0 #define FOLIO_PF_SECOND 1 +#define FOLIO_HEAD_PAGE 0 +#define FOLIO_SECOND_PAGE 1 + /* * Macros to create function definitions for page flags */ +#define FOLIO_TEST_FLAG(name, page) \ +static __always_inline bool folio_test_##name(struct folio *folio) \ +{ return test_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_SET_FLAG(name, page) \ +static __always_inline void folio_set_##name(struct folio *folio) \ +{ set_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_CLEAR_FLAG(name, page) \ +static __always_inline void folio_clear_##name(struct folio *folio) \ +{ clear_bit(PG_##name, folio_flags(folio, page)); } + +#define __FOLIO_SET_FLAG(name, page) \ +static __always_inline void __folio_set_##name(struct folio *folio) \ +{ __set_bit(PG_##name, folio_flags(folio, page)); } + +#define __FOLIO_CLEAR_FLAG(name, page) \ +static __always_inline void __folio_clear_##name(struct folio *folio) \ +{ __clear_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_TEST_SET_FLAG(name, page) \ +static __always_inline bool folio_test_set_##name(struct folio *folio) \ +{ return test_and_set_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_TEST_CLEAR_FLAG(name, page) \ +static __always_inline bool folio_test_clear_##name(struct folio *folio) \ +{ return test_and_clear_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_FLAG(name, page) \ +FOLIO_TEST_FLAG(name, page) \ +FOLIO_SET_FLAG(name, page) \ +FOLIO_CLEAR_FLAG(name, page) + #define TESTPAGEFLAG(uname, lname, policy) \ -static __always_inline bool folio_test_##lname(struct folio *folio) \ -{ return test_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_TEST_FLAG(lname, FOLIO_##policy) \ static __always_inline int Page##uname(struct page *page) \ { return test_bit(PG_##lname, &policy(page, 0)->flags); } #define SETPAGEFLAG(uname, lname, policy) \ -static __always_inline \ -void folio_set_##lname(struct folio *folio) \ -{ set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void SetPage##uname(struct page *page) \ { set_bit(PG_##lname, &policy(page, 1)->flags); } #define CLEARPAGEFLAG(uname, lname, policy) \ -static __always_inline \ -void folio_clear_##lname(struct folio *folio) \ -{ clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void ClearPage##uname(struct page *page) \ { clear_bit(PG_##lname, &policy(page, 1)->flags); } #define __SETPAGEFLAG(uname, lname, policy) \ -static __always_inline \ -void __folio_set_##lname(struct folio *folio) \ -{ __set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +__FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void __SetPage##uname(struct page *page) \ { __set_bit(PG_##lname, &policy(page, 1)->flags); } #define __CLEARPAGEFLAG(uname, lname, policy) \ -static __always_inline \ -void __folio_clear_##lname(struct folio *folio) \ -{ __clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +__FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void __ClearPage##uname(struct page *page) \ { __clear_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTSETFLAG(uname, lname, policy) \ -static __always_inline \ -bool folio_test_set_##lname(struct folio *folio) \ -{ return test_and_set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestSetPage##uname(struct page *page) \ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTCLEARFLAG(uname, lname, policy) \ -static __always_inline \ -bool folio_test_clear_##lname(struct folio *folio) \ -{ return test_and_clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestClearPage##uname(struct page *page) \ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } From 0d846469fd216d37a91845945e9baad11dfa107b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 19:23:29 +0000 Subject: [PATCH 393/435] mm: remove PageWaiters, PageSetWaiters and PageClearWaiters All callers have been converted to use folios. This was the only user of PF_ONLY_HEAD, so remove that too. Link: https://lkml.kernel.org/r/20240227192337.757313-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 95ab75d0b39c..d8f5127ae72e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -328,9 +328,6 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) * for compound page all operations related to the page flag applied to * head page. * - * PF_ONLY_HEAD: - * for compound page, callers only ever operate on the head page. - * * PF_NO_TAIL: * modifications of the page flag must be done on small or head pages, * checks can be done on tail pages too. @@ -346,9 +343,6 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) page; }) #define PF_ANY(page, enforce) PF_POISONED_CHECK(page) #define PF_HEAD(page, enforce) PF_POISONED_CHECK(compound_head(page)) -#define PF_ONLY_HEAD(page, enforce) ({ \ - VM_BUG_ON_PGFLAGS(PageTail(page), page); \ - PF_POISONED_CHECK(page); }) #define PF_NO_TAIL(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \ PF_POISONED_CHECK(compound_head(page)); }) @@ -362,7 +356,6 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) /* Which page is the flag stored in */ #define FOLIO_PF_ANY 0 #define FOLIO_PF_HEAD 0 -#define FOLIO_PF_ONLY_HEAD 0 #define FOLIO_PF_NO_TAIL 0 #define FOLIO_PF_NO_COMPOUND 0 #define FOLIO_PF_SECOND 1 @@ -488,7 +481,7 @@ static inline int TestClearPage##uname(struct page *page) { return 0; } TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname) __PAGEFLAG(Locked, locked, PF_NO_TAIL) -PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) +FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE) PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL) PAGEFLAG(Referenced, referenced, PF_HEAD) TESTCLEARFLAG(Referenced, referenced, PF_HEAD) @@ -1138,7 +1131,6 @@ static inline bool folio_has_private(struct folio *folio) #undef PF_ANY #undef PF_HEAD -#undef PF_ONLY_HEAD #undef PF_NO_TAIL #undef PF_NO_COMPOUND #undef PF_SECOND From 7da8988c7c0e28dad8d0e9a697d6e7baa66f4534 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 19:23:30 +0000 Subject: [PATCH 394/435] mm: remove PageYoung and PageIdle definitions All callers have been converted to use folios, so remove the various set/clear/test functions defined on pages. Link: https://lkml.kernel.org/r/20240227192337.757313-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d8f5127ae72e..582ca7400eca 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -599,10 +599,10 @@ PAGEFLAG_FALSE(HWPoison, hwpoison) #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) -TESTPAGEFLAG(Young, young, PF_ANY) -SETPAGEFLAG(Young, young, PF_ANY) -TESTCLEARFLAG(Young, young, PF_ANY) -PAGEFLAG(Idle, idle, PF_ANY) +FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE) +FOLIO_SET_FLAG(young, FOLIO_HEAD_PAGE) +FOLIO_TEST_CLEAR_FLAG(young, FOLIO_HEAD_PAGE) +FOLIO_FLAG(idle, FOLIO_HEAD_PAGE) #endif /* From fae7d834c43ccdb9fcecaf4d0f33145d884b3e5c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 19:23:31 +0000 Subject: [PATCH 395/435] mm: add __dump_folio() Turn __dump_page() into a wrapper around __dump_folio(). Snapshot the page & folio into a stack variable so we don't hit BUG_ON() if an allocation is freed under us and what was a folio pointer becomes a pointer to a tail page. [willy@infradead.org: fix build issue] Link: https://lkml.kernel.org/r/ZeAKCyTn_xS3O9cE@casper.infradead.org [willy@infradead.org: fix __dump_folio] Link: https://lkml.kernel.org/r/ZeJJegP8zM7S9GTy@casper.infradead.org [willy@infradead.org: fix pointer confusion] Link: https://lkml.kernel.org/r/ZeYa00ixxC4k1ot-@casper.infradead.org [akpm@linux-foundation.org: s/printk/pr_warn/] Link: https://lkml.kernel.org/r/20240227192337.757313-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +++ include/linux/mmzone.h | 3 + mm/debug.c | 128 +++++++++++++++++++++++------------------ 3 files changed, 83 insertions(+), 55 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d45eadc440f5..02547c8adda0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2066,6 +2066,13 @@ static inline long folio_nr_pages(struct folio *folio) #endif } +/* Only hugetlbfs can allocate folios larger than MAX_ORDER */ +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +#define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER) +#else +#define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES +#endif + /* * compound_nr() returns the number of pages in this potentially compound * page. compound_nr() can be called on a tail page, and is defined to diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 633812a1d220..c11b7cde81ef 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -76,9 +76,12 @@ extern const char * const migratetype_names[MIGRATE_TYPES]; #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) +# define is_migrate_cma_folio(folio, pfn) (MIGRATE_CMA == \ + get_pfnblock_flags_mask(&folio->page, pfn, MIGRATETYPE_MASK)) #else # define is_migrate_cma(migratetype) false # define is_migrate_cma_page(_page) false +# define is_migrate_cma_folio(folio, pfn) false #endif static inline bool is_migrate_movable(int mt) diff --git a/mm/debug.c b/mm/debug.c index ee533a5ceb79..12fed592c955 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -51,84 +51,102 @@ const struct trace_print_flags vmaflag_names[] = { {0, NULL} }; -static void __dump_page(struct page *page) +static void __dump_folio(struct folio *folio, struct page *page, + unsigned long pfn, unsigned long idx) { - struct folio *folio = page_folio(page); - struct page *head = &folio->page; - struct address_space *mapping; - bool compound = PageCompound(page); - /* - * Accessing the pageblock without the zone lock. It could change to - * "isolate" again in the meantime, but since we are just dumping the - * state for debugging, it should be fine to accept a bit of - * inaccuracy here due to racing. - */ - bool page_cma = is_migrate_cma_page(page); - int mapcount; + struct address_space *mapping = folio_mapping(folio); + int mapcount = 0; char *type = ""; - if (page < head || (page >= head + MAX_ORDER_NR_PAGES)) { - /* - * Corrupt page, so we cannot call page_mapping. Instead, do a - * safe subset of the steps that page_mapping() does. Caution: - * this will be misleading for tail pages, PageSwapCache pages, - * and potentially other situations. (See the page_mapping() - * implementation for what's missing here.) - */ - unsigned long tmp = (unsigned long)page->mapping; - - if (tmp & PAGE_MAPPING_ANON) - mapping = NULL; - else - mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS); - head = page; - folio = (struct folio *)page; - compound = false; - } else { - mapping = page_mapping(page); + /* + * page->_mapcount space in struct page is used by slab pages to + * encode own info, and we must avoid calling page_folio() again. + */ + if (!folio_test_slab(folio)) { + mapcount = atomic_read(&page->_mapcount) + 1; + if (folio_test_large(folio)) + mapcount += folio_entire_mapcount(folio); } - /* - * Avoid VM_BUG_ON() in page_mapcount(). - * page->_mapcount space in struct page is used by sl[aou]b pages to - * encode own info. - */ - mapcount = PageSlab(head) ? 0 : page_mapcount(page); - - pr_warn("page:%p refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n", - page, page_ref_count(head), mapcount, mapping, - page_to_pgoff(page), page_to_pfn(page)); - if (compound) { - pr_warn("head:%p order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n", - head, compound_order(head), + pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n", + folio_ref_count(folio), mapcount, mapping, + folio->index + idx, pfn); + if (folio_test_large(folio)) { + pr_warn("head: order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n", + folio_order(folio), folio_entire_mapcount(folio), folio_nr_pages_mapped(folio), atomic_read(&folio->_pincount)); } #ifdef CONFIG_MEMCG - if (head->memcg_data) - pr_warn("memcg:%lx\n", head->memcg_data); + if (folio->memcg_data) + pr_warn("memcg:%lx\n", folio->memcg_data); #endif - if (PageKsm(page)) + if (folio_test_ksm(folio)) type = "ksm "; - else if (PageAnon(page)) + else if (folio_test_anon(folio)) type = "anon "; else if (mapping) dump_mapping(mapping); BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); - pr_warn("%sflags: %pGp%s\n", type, &head->flags, - page_cma ? " CMA" : ""); - pr_warn("page_type: %pGt\n", &head->page_type); + /* + * Accessing the pageblock without the zone lock. It could change to + * "isolate" again in the meantime, but since we are just dumping the + * state for debugging, it should be fine to accept a bit of + * inaccuracy here due to racing. + */ + pr_warn("%sflags: %pGp%s\n", type, &folio->flags, + is_migrate_cma_folio(folio, pfn) ? " CMA" : ""); + pr_warn("page_type: %pGt\n", &folio->page.page_type); print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32, sizeof(unsigned long), page, sizeof(struct page), false); - if (head != page) + if (folio_test_large(folio)) print_hex_dump(KERN_WARNING, "head: ", DUMP_PREFIX_NONE, 32, - sizeof(unsigned long), head, - sizeof(struct page), false); + sizeof(unsigned long), folio, + 2 * sizeof(struct page), false); +} + +static void __dump_page(const struct page *page) +{ + struct folio *foliop, folio; + struct page precise; + unsigned long pfn = page_to_pfn(page); + unsigned long idx, nr_pages = 1; + int loops = 5; + +again: + memcpy(&precise, page, sizeof(*page)); + foliop = page_folio(&precise); + if (foliop == (struct folio *)&precise) { + idx = 0; + if (!folio_test_large(foliop)) + goto dump; + foliop = (struct folio *)page; + } else { + idx = folio_page_idx(foliop, page); + } + + if (idx < MAX_FOLIO_NR_PAGES) { + memcpy(&folio, foliop, 2 * sizeof(struct page)); + nr_pages = folio_nr_pages(&folio); + foliop = &folio; + } + + if (idx > nr_pages) { + if (loops-- > 0) + goto again; + pr_warn("page does not match folio\n"); + precise.compound_head &= ~1UL; + foliop = (struct folio *)&precise; + idx = 0; + } + +dump: + __dump_folio(foliop, &precise, pfn, idx); } void dump_page(struct page *page, const char *reason) From b3a3203309c89061452250f7384507787b7badcb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 19:23:32 +0000 Subject: [PATCH 396/435] mm: make dump_page() take a const argument Now that __dump_page() takes a const argument, we can make dump_page() take a const struct page too. Link: https://lkml.kernel.org/r/20240227192337.757313-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mmdebug.h | 2 +- mm/debug.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 7c3e7b0b0e8f..39a7714605a7 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -10,7 +10,7 @@ struct vm_area_struct; struct mm_struct; struct vma_iterator; -void dump_page(struct page *page, const char *reason); +void dump_page(const struct page *page, const char *reason); void dump_vma(const struct vm_area_struct *vma); void dump_mm(const struct mm_struct *mm); void vma_iter_dump_tree(const struct vma_iterator *vmi); diff --git a/mm/debug.c b/mm/debug.c index 12fed592c955..c1c1a6a484e4 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -149,7 +149,7 @@ dump: __dump_folio(foliop, &precise, pfn, idx); } -void dump_page(struct page *page, const char *reason) +void dump_page(const struct page *page, const char *reason) { if (PagePoisoned(page)) pr_warn("page:%p is uninitialized and poisoned", page); From ce3467af6bded1c0018ca67ea1599f45fbb8100b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 19:23:33 +0000 Subject: [PATCH 397/435] mm: constify testing page/folio flags Now that dump_page() takes a const argument, we can constify all the page flag tests. Link: https://lkml.kernel.org/r/20240227192337.757313-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 582ca7400eca..3463cd1baebf 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -237,7 +237,7 @@ static inline const struct page *page_fixed_fake_head(const struct page *page) } #endif -static __always_inline int page_is_fake_head(struct page *page) +static __always_inline int page_is_fake_head(const struct page *page) { return page_fixed_fake_head(page) != page; } @@ -281,12 +281,12 @@ static inline unsigned long _compound_head(const struct page *page) */ #define folio_page(folio, n) nth_page(&(folio)->page, n) -static __always_inline int PageTail(struct page *page) +static __always_inline int PageTail(const struct page *page) { return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); } -static __always_inline int PageCompound(struct page *page) +static __always_inline int PageCompound(const struct page *page) { return test_bit(PG_head, &page->flags) || READ_ONCE(page->compound_head) & 1; @@ -306,6 +306,16 @@ static inline void page_init_poison(struct page *page, size_t size) } #endif +static const unsigned long *const_folio_flags(const struct folio *folio, + unsigned n) +{ + const struct page *page = &folio->page; + + VM_BUG_ON_PGFLAGS(PageTail(page), page); + VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); + return &page[n].flags; +} + static unsigned long *folio_flags(struct folio *folio, unsigned n) { struct page *page = &folio->page; @@ -367,8 +377,8 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) * Macros to create function definitions for page flags */ #define FOLIO_TEST_FLAG(name, page) \ -static __always_inline bool folio_test_##name(struct folio *folio) \ -{ return test_bit(PG_##name, folio_flags(folio, page)); } +static __always_inline bool folio_test_##name(const struct folio *folio) \ +{ return test_bit(PG_##name, const_folio_flags(folio, page)); } #define FOLIO_SET_FLAG(name, page) \ static __always_inline void folio_set_##name(struct folio *folio) \ @@ -401,7 +411,7 @@ FOLIO_CLEAR_FLAG(name, page) #define TESTPAGEFLAG(uname, lname, policy) \ FOLIO_TEST_FLAG(lname, FOLIO_##policy) \ -static __always_inline int Page##uname(struct page *page) \ +static __always_inline int Page##uname(const struct page *page) \ { return test_bit(PG_##lname, &policy(page, 0)->flags); } #define SETPAGEFLAG(uname, lname, policy) \ @@ -801,7 +811,7 @@ static __always_inline bool folio_test_head(struct folio *folio) return test_bit(PG_head, folio_flags(folio, FOLIO_PF_ANY)); } -static __always_inline int PageHead(struct page *page) +static __always_inline int PageHead(const struct page *page) { PF_POISONED_CHECK(page); return test_bit(PG_head, &page->flags) && !page_is_fake_head(page); From 29cfe7556bfd6be043b6eb602a29c89d43565d71 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 19:23:34 +0000 Subject: [PATCH 398/435] mm: constify more page/folio tests Constify the flag tests that aren't automatically generated and the tests that look like flag tests but are more complicated. Link: https://lkml.kernel.org/r/20240227192337.757313-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 52 +++++++++++++++++++------------------- mm/hugetlb.c | 4 +-- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 3463cd1baebf..652d77805e99 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -558,13 +558,13 @@ PAGEFLAG_FALSE(HighMem, highmem) #endif #ifdef CONFIG_SWAP -static __always_inline bool folio_test_swapcache(struct folio *folio) +static __always_inline bool folio_test_swapcache(const struct folio *folio) { return folio_test_swapbacked(folio) && - test_bit(PG_swapcache, folio_flags(folio, 0)); + test_bit(PG_swapcache, const_folio_flags(folio, 0)); } -static __always_inline bool PageSwapCache(struct page *page) +static __always_inline bool PageSwapCache(const struct page *page) { return folio_test_swapcache(page_folio(page)); } @@ -663,22 +663,22 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) */ #define PAGE_MAPPING_DAX_SHARED ((void *)0x1) -static __always_inline bool folio_mapping_flags(struct folio *folio) +static __always_inline bool folio_mapping_flags(const struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0; } -static __always_inline int PageMappingFlags(struct page *page) +static __always_inline int PageMappingFlags(const struct page *page) { return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0; } -static __always_inline bool folio_test_anon(struct folio *folio) +static __always_inline bool folio_test_anon(const struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0; } -static __always_inline bool PageAnon(struct page *page) +static __always_inline bool PageAnon(const struct page *page) { return folio_test_anon(page_folio(page)); } @@ -689,7 +689,7 @@ static __always_inline bool __folio_test_movable(const struct folio *folio) PAGE_MAPPING_MOVABLE; } -static __always_inline int __PageMovable(struct page *page) +static __always_inline int __PageMovable(const struct page *page) { return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_MOVABLE; @@ -702,13 +702,13 @@ static __always_inline int __PageMovable(struct page *page) * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any * anon_vma, but to that page's node of the stable tree. */ -static __always_inline bool folio_test_ksm(struct folio *folio) +static __always_inline bool folio_test_ksm(const struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_KSM; } -static __always_inline bool PageKsm(struct page *page) +static __always_inline bool PageKsm(const struct page *page) { return folio_test_ksm(page_folio(page)); } @@ -747,9 +747,9 @@ static inline bool folio_xor_flags_has_waiters(struct folio *folio, * some of the bytes in it may be; see the is_partially_uptodate() * address_space operation. */ -static inline bool folio_test_uptodate(struct folio *folio) +static inline bool folio_test_uptodate(const struct folio *folio) { - bool ret = test_bit(PG_uptodate, folio_flags(folio, 0)); + bool ret = test_bit(PG_uptodate, const_folio_flags(folio, 0)); /* * Must ensure that the data we read out of the folio is loaded * _after_ we've loaded folio->flags to check the uptodate bit. @@ -764,7 +764,7 @@ static inline bool folio_test_uptodate(struct folio *folio) return ret; } -static inline int PageUptodate(struct page *page) +static inline int PageUptodate(const struct page *page) { return folio_test_uptodate(page_folio(page)); } @@ -806,9 +806,9 @@ void set_page_writeback(struct page *page); #define folio_start_writeback_keepwrite(folio) \ __folio_start_writeback(folio, true) -static __always_inline bool folio_test_head(struct folio *folio) +static __always_inline bool folio_test_head(const struct folio *folio) { - return test_bit(PG_head, folio_flags(folio, FOLIO_PF_ANY)); + return test_bit(PG_head, const_folio_flags(folio, FOLIO_PF_ANY)); } static __always_inline int PageHead(const struct page *page) @@ -827,7 +827,7 @@ CLEARPAGEFLAG(Head, head, PF_ANY) * * Return: True if the folio is larger than one page. */ -static inline bool folio_test_large(struct folio *folio) +static inline bool folio_test_large(const struct folio *folio) { return folio_test_head(folio); } @@ -856,7 +856,7 @@ TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable) #define PG_head_mask ((1UL << PG_head)) #ifdef CONFIG_HUGETLB_PAGE -int PageHuge(struct page *page); +int PageHuge(const struct page *page); SETPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) @@ -869,10 +869,10 @@ CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) * Return: True for hugetlbfs folios, false for anon folios or folios * belonging to other filesystems. */ -static inline bool folio_test_hugetlb(struct folio *folio) +static inline bool folio_test_hugetlb(const struct folio *folio) { return folio_test_large(folio) && - test_bit(PG_hugetlb, folio_flags(folio, 1)); + test_bit(PG_hugetlb, const_folio_flags(folio, 1)); } #else TESTPAGEFLAG_FALSE(Huge, hugetlb) @@ -887,7 +887,7 @@ TESTPAGEFLAG_FALSE(Huge, hugetlb) * hugetlbfs pages, but not normal pages. PageTransHuge() can only be * called only in the core VM paths where hugetlbfs pages can't exist. */ -static inline int PageTransHuge(struct page *page) +static inline int PageTransHuge(const struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); return PageHead(page); @@ -898,7 +898,7 @@ static inline int PageTransHuge(struct page *page) * and hugetlbfs pages, so it should only be called when it's known * that hugetlbfs pages aren't involved. */ -static inline int PageTransCompound(struct page *page) +static inline int PageTransCompound(const struct page *page) { return PageCompound(page); } @@ -908,7 +908,7 @@ static inline int PageTransCompound(struct page *page) * and hugetlbfs pages, so it should only be called when it's known * that hugetlbfs pages aren't involved. */ -static inline int PageTransTail(struct page *page) +static inline int PageTransTail(const struct page *page) { return PageTail(page); } @@ -972,7 +972,7 @@ static inline int page_type_has_type(unsigned int page_type) return (int)page_type < PAGE_MAPCOUNT_RESERVE; } -static inline int page_has_type(struct page *page) +static inline int page_has_type(const struct page *page) { return page_type_has_type(page->page_type); } @@ -1056,7 +1056,7 @@ extern bool is_free_buddy_page(struct page *page); PAGEFLAG(Isolated, isolated, PF_ANY); -static __always_inline int PageAnonExclusive(struct page *page) +static __always_inline int PageAnonExclusive(const struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); @@ -1129,12 +1129,12 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) * Determine if a page has private stuff, indicating that release routines * should be invoked upon it. */ -static inline int page_has_private(struct page *page) +static inline int page_has_private(const struct page *page) { return !!(page->flags & PAGE_FLAGS_PRIVATE); } -static inline bool folio_has_private(struct folio *folio) +static inline bool folio_has_private(const struct folio *folio) { return page_has_private(&folio->page); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 418d66953224..bb17e5c22759 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2164,9 +2164,9 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, * transparent huge pages. See the PageTransHuge() documentation for more * details. */ -int PageHuge(struct page *page) +int PageHuge(const struct page *page) { - struct folio *folio; + const struct folio *folio; if (!PageCompound(page)) return 0; From 9164448d3100d5118bda5e9d38b69a9f32cea509 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 19:23:35 +0000 Subject: [PATCH 399/435] mm: remove cast from page_to_nid() Now that PF_POISONED_CHECK() can take a const argument, we can drop the cast. Link: https://lkml.kernel.org/r/20240227192337.757313-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 02547c8adda0..699e850d143c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1641,13 +1641,11 @@ static inline int page_zone_id(struct page *page) } #ifdef NODE_NOT_IN_PAGE_FLAGS -extern int page_to_nid(const struct page *page); +int page_to_nid(const struct page *page); #else static inline int page_to_nid(const struct page *page) { - struct page *p = (struct page *)page; - - return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK; + return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK; } #endif From ac96cc4d1ceda01d08deda1e45b9f1b55b0624d2 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 27 Feb 2024 23:42:01 +1300 Subject: [PATCH 400/435] mm: make folio_pte_batch available outside of mm/memory.c madvise, mprotect and some others might need folio_pte_batch to check if a range of PTEs are completely mapped to a large folio with contiguous physical addresses. Let's make it available in mm/internal.h. While at it, add proper kernel doc and sanity-check more input parameters using two additional VM_WARN_ON_FOLIO(). [21cnbao@gmail.com: build fix] Link: https://lkml.kernel.org/r/CAGsJ_4wWzG-37D82vqP_zt+Fcbz+URVe5oXLBc4M5wbN8A_gpQ@mail.gmail.com [david@redhat.com: improve the doc for the exported func] Link: https://lkml.kernel.org/r/20240227104201.337988-1-21cnbao@gmail.com Signed-off-by: David Hildenbrand Signed-off-by: Barry Song Suggested-by: David Hildenbrand Reviewed-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Lance Yang Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/internal.h | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++ mm/memory.c | 76 ----------------------------------------- 2 files changed, 93 insertions(+), 76 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 5be91e2f9943..d1c69119b24f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -83,6 +83,99 @@ static inline void *folio_raw_mapping(struct folio *folio) return (void *)(mapping & ~PAGE_MAPPING_FLAGS); } +#ifdef CONFIG_MMU + +/* Flags for folio_pte_batch(). */ +typedef int __bitwise fpb_t; + +/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */ +#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0)) + +/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */ +#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1)) + +static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) +{ + if (flags & FPB_IGNORE_DIRTY) + pte = pte_mkclean(pte); + if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) + pte = pte_clear_soft_dirty(pte); + return pte_wrprotect(pte_mkold(pte)); +} + +/** + * folio_pte_batch - detect a PTE batch for a large folio + * @folio: The large folio to detect a PTE batch for. + * @addr: The user virtual address the first page is mapped at. + * @start_ptep: Page table pointer for the first entry. + * @pte: Page table entry for the first page. + * @max_nr: The maximum number of table entries to consider. + * @flags: Flags to modify the PTE batch semantics. + * @any_writable: Optional pointer to indicate whether any entry except the + * first one is writable. + * + * Detect a PTE batch: consecutive (present) PTEs that map consecutive + * pages of the same large folio. + * + * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, + * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and + * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY). + * + * start_ptep must map any page of the folio. max_nr must be at least one and + * must be limited by the caller so scanning cannot exceed a single page table. + * + * Return: the number of table entries in the batch. + */ +static inline int folio_pte_batch(struct folio *folio, unsigned long addr, + pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, + bool *any_writable) +{ + unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); + const pte_t *end_ptep = start_ptep + max_nr; + pte_t expected_pte, *ptep; + bool writable; + int nr; + + if (any_writable) + *any_writable = false; + + VM_WARN_ON_FOLIO(!pte_present(pte), folio); + VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); + VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio); + + nr = pte_batch_hint(start_ptep, pte); + expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); + ptep = start_ptep + nr; + + while (ptep < end_ptep) { + pte = ptep_get(ptep); + if (any_writable) + writable = !!pte_write(pte); + pte = __pte_batch_clear_ignored(pte, flags); + + if (!pte_same(pte, expected_pte)) + break; + + /* + * Stop immediately once we reached the end of the folio. In + * corner cases the next PFN might fall into a different + * folio. + */ + if (pte_pfn(pte) >= folio_end_pfn) + break; + + if (any_writable) + *any_writable |= writable; + + nr = pte_batch_hint(ptep, pte); + expected_pte = pte_advance_pfn(expected_pte, nr); + ptep += nr; + } + + return min(ptep - start_ptep, max_nr); +} +#endif /* CONFIG_MMU */ + void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, int nr_throttled); static inline void acct_reclaim_writeback(struct folio *folio) diff --git a/mm/memory.c b/mm/memory.c index 230bcd12a336..abd4f33d62c9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -953,82 +953,6 @@ static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma, set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr); } -/* Flags for folio_pte_batch(). */ -typedef int __bitwise fpb_t; - -/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */ -#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0)) - -/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */ -#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1)) - -static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) -{ - if (flags & FPB_IGNORE_DIRTY) - pte = pte_mkclean(pte); - if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) - pte = pte_clear_soft_dirty(pte); - return pte_wrprotect(pte_mkold(pte)); -} - -/* - * Detect a PTE batch: consecutive (present) PTEs that map consecutive - * pages of the same folio. - * - * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, - * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and - * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY). - * - * If "any_writable" is set, it will indicate if any other PTE besides the - * first (given) PTE is writable. - */ -static inline int folio_pte_batch(struct folio *folio, unsigned long addr, - pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, - bool *any_writable) -{ - unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); - const pte_t *end_ptep = start_ptep + max_nr; - pte_t expected_pte, *ptep; - bool writable; - int nr; - - if (any_writable) - *any_writable = false; - - VM_WARN_ON_FOLIO(!pte_present(pte), folio); - - nr = pte_batch_hint(start_ptep, pte); - expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); - ptep = start_ptep + nr; - - while (ptep < end_ptep) { - pte = ptep_get(ptep); - if (any_writable) - writable = !!pte_write(pte); - pte = __pte_batch_clear_ignored(pte, flags); - - if (!pte_same(pte, expected_pte)) - break; - - /* - * Stop immediately once we reached the end of the folio. In - * corner cases the next PFN might fall into a different - * folio. - */ - if (pte_pfn(pte) >= folio_end_pfn) - break; - - if (any_writable) - *any_writable |= writable; - - nr = pte_batch_hint(ptep, pte); - expected_pte = pte_advance_pfn(expected_pte, nr); - ptep += nr; - } - - return min(ptep - start_ptep, max_nr); -} - /* * Copy one present PTE, trying to batch-process subsequent PTEs that map * consecutive pages of the same folio by copying them as well. From f1cce6f7fa302e2ea996de33eeeeb3a70d93a3e7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Feb 2024 15:29:45 +0000 Subject: [PATCH 401/435] mm/mempolicy: use a folio in do_mbind() We actually add folios to the pagelist already, but then work with them as pages. Removes a call to compound_head() in PageKsm() and removes a reference to page->index. Link: https://lkml.kernel.org/r/20240229153015.1996829-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Gregory Price Signed-off-by: Andrew Morton --- mm/mempolicy.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 56f9a6ed939a..0fe77738d971 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1356,29 +1356,30 @@ static long do_mbind(unsigned long start, unsigned long len, */ if (new->mode == MPOL_INTERLEAVE || new->mode == MPOL_WEIGHTED_INTERLEAVE) { - struct page *page; + struct folio *folio; unsigned int order; unsigned long addr = -EFAULT; - list_for_each_entry(page, &pagelist, lru) { - if (!PageKsm(page)) + list_for_each_entry(folio, &pagelist, lru) { + if (!folio_test_ksm(folio)) break; } - if (!list_entry_is_head(page, &pagelist, lru)) { + if (!list_entry_is_head(folio, &pagelist, lru)) { vma_iter_init(&vmi, mm, start); for_each_vma_range(vmi, vma, end) { - addr = page_address_in_vma(page, vma); + addr = page_address_in_vma( + folio_page(folio, 0), vma); if (addr != -EFAULT) break; } } if (addr != -EFAULT) { - order = compound_order(page); + order = folio_order(folio); /* We already know the pol, but not the ilx */ mpol_cond_put(get_vma_policy(vma, addr, order, &mmpol.ilx)); /* Set base from which to increment by index */ - mmpol.ilx -= page->index >> order; + mmpol.ilx -= folio->index >> order; } } } From 72741db6836b4fc3c810e33a53f7cb9cf2cd48a7 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 28 Feb 2024 23:49:12 +0100 Subject: [PATCH 402/435] mm: page_alloc: use div64_ul() instead of do_div() Fixes Coccinelle/coccicheck warning reported by do_div.cocci. Compared to do_div(), div64_ul() does not implicitly cast the divisor and does not unnecessarily calculate the remainder. Link: https://lkml.kernel.org/r/20240228224911.1164-2-thorsten.blum@toblux.com Signed-off-by: Thorsten Blum Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 29c982542dcc..cc7f9b322193 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5848,7 +5848,7 @@ static void __setup_per_zone_wmarks(void) spin_lock_irqsave(&zone->lock, flags); tmp = (u64)pages_min * zone_managed_pages(zone); - do_div(tmp, lowmem_pages); + tmp = div64_ul(tmp, lowmem_pages); if (is_highmem(zone) || zone_idx(zone) == ZONE_MOVABLE) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't From 22beb471b46a1a408720498f7895232edab559d1 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 4 Mar 2024 19:07:18 +0800 Subject: [PATCH 403/435] mm: pgtable: correct the wrong comment about ptdesc->__page_flags Patch series "minor fixes and supplement for ptdesc". In this series, the [PATCH 1/3] and [PATCH 2/3] are fixes for some issues discovered during code inspection. The [PATCH 3/3] is a supplement to ptdesc conversion in s390, I don't know why this is not done in the commit 6326c26c1514 ("s390: convert various pgalloc functions to use ptdescs"), maybe I missed something. And since I don't have an s390 environment, I hope kernel test robot can help compile and test, and this is why I did not fold [PATCH 2/3] and [PATCH 3/3] into one patch. This patch (of 3): The commit 32cc0b7c9d50 ("powerpc: add pte_free_defer() for pgtables sharing page") introduced the use of PageActive flag to page table fragments tracking, so the ptdesc->__page_flags is not unused, so correct the wrong comment. Link: https://lkml.kernel.org/r/cover.1709541697.git.zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/cc42d5915fd98fd802f920de243f535efcfe01db.1709541697.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Vishal Moola (Oracle) Cc: Christian Borntraeger Cc: Claudio Imbrenda Cc: Janosch Frank Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a7223ba3ea1e..5ea77969daae 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -419,7 +419,7 @@ FOLIO_MATCH(compound_head, _head_2a); /** * struct ptdesc - Memory descriptor for page tables. - * @__page_flags: Same as page flags. Unused for page tables. + * @__page_flags: Same as page flags. Powerpc only. * @pt_rcu_head: For freeing page table pages. * @pt_list: List of used page tables. Used for s390 and x86. * @_pt_pad_1: Padding that aliases with page's compound head. From ea919671517a46b75f975fcf126e08ccf7e9c09f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 4 Mar 2024 19:07:19 +0800 Subject: [PATCH 404/435] mm: pgtable: add missing pt_index to struct ptdesc In s390, the page->index field is used for gmap (see gmap_shadow_pgt()), so add the corresponding pt_index to struct ptdesc and add a comment to clarify this. Link: https://lkml.kernel.org/r/283624c2af45fb2090b41a6b1b5481bb0a45bad7.1709541697.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Christian Borntraeger Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Hugh Dickins Cc: Janosch Frank Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5ea77969daae..5240bd7bca33 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -425,6 +425,7 @@ FOLIO_MATCH(compound_head, _head_2a); * @_pt_pad_1: Padding that aliases with page's compound head. * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. * @__page_mapping: Aliases with page->mapping. Unused for page tables. + * @pt_index: Used for s390 gmap. * @pt_mm: Used for x86 pgds. * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. * @_pt_pad_2: Padding to ensure proper alignment. @@ -450,6 +451,7 @@ struct ptdesc { unsigned long __page_mapping; union { + pgoff_t pt_index; struct mm_struct *pt_mm; atomic_t pt_frag_refcount; }; @@ -475,6 +477,7 @@ TABLE_MATCH(flags, __page_flags); TABLE_MATCH(compound_head, pt_list); TABLE_MATCH(compound_head, _pt_pad_1); TABLE_MATCH(mapping, __page_mapping); +TABLE_MATCH(index, pt_index); TABLE_MATCH(rcu_head, pt_rcu_head); TABLE_MATCH(page_type, __page_type); TABLE_MATCH(_refcount, __page_refcount); From 57b77b75caf02c7f1784ba5f9ea55275bd9a27b4 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 4 Mar 2024 19:07:20 +0800 Subject: [PATCH 405/435] s390: supplement for ptdesc conversion After commit 6326c26c1514 ("s390: convert various pgalloc functions to use ptdescs"), there are still some positions that use page->{lru, index} instead of ptdesc->{pt_list, pt_index}. In order to make the use of ptdesc->{pt_list, pt_index} clearer, it would be better to convert them as well. [zhengqi.arch@bytedance.com: fix build failure] Link: https://lkml.kernel.org/r/20240305072154.26168-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/04beaf3255056ffe131a5ea595736066c1e84756.1709541697.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Christian Borntraeger Cc: Janosch Frank Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- arch/s390/include/asm/pgalloc.h | 4 ++-- arch/s390/mm/gmap.c | 38 +++++++++++++++++---------------- arch/s390/mm/pgalloc.c | 8 +++---- 3 files changed, 26 insertions(+), 24 deletions(-) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 502d655fe6ae..7b84ef6dc4b6 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -23,9 +23,9 @@ unsigned long *crst_table_alloc(struct mm_struct *); void crst_table_free(struct mm_struct *, unsigned long *); unsigned long *page_table_alloc(struct mm_struct *); -struct page *page_table_alloc_pgste(struct mm_struct *mm); +struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm); void page_table_free(struct mm_struct *, unsigned long *); -void page_table_free_pgste(struct page *page); +void page_table_free_pgste(struct ptdesc *ptdesc); extern int page_table_allocate_pgste; static inline void crst_table_init(unsigned long *crst, unsigned long entry) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 8da39deb56ca..e43a5a3befd4 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -206,9 +206,11 @@ static void gmap_free(struct gmap *gmap) /* Free additional data for a shadow gmap */ if (gmap_is_shadow(gmap)) { + struct ptdesc *ptdesc, *n; + /* Free all page tables. */ - list_for_each_entry_safe(page, next, &gmap->pt_list, lru) - page_table_free_pgste(page); + list_for_each_entry_safe(ptdesc, n, &gmap->pt_list, pt_list) + page_table_free_pgste(ptdesc); gmap_rmap_radix_tree_free(&gmap->host_to_rmap); /* Release reference to the parent */ gmap_put(gmap->parent); @@ -1348,7 +1350,7 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) { unsigned long *ste; phys_addr_t sto, pgt; - struct page *page; + struct ptdesc *ptdesc; BUG_ON(!gmap_is_shadow(sg)); ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ @@ -1361,9 +1363,9 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) *ste = _SEGMENT_ENTRY_EMPTY; __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = phys_to_page(pgt); - list_del(&page->lru); - page_table_free_pgste(page); + ptdesc = page_ptdesc(phys_to_page(pgt)); + list_del(&ptdesc->pt_list); + page_table_free_pgste(ptdesc); } /** @@ -1377,7 +1379,7 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, unsigned long *sgt) { - struct page *page; + struct ptdesc *ptdesc; phys_addr_t pgt; int i; @@ -1389,9 +1391,9 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, sgt[i] = _SEGMENT_ENTRY_EMPTY; __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = phys_to_page(pgt); - list_del(&page->lru); - page_table_free_pgste(page); + ptdesc = page_ptdesc(phys_to_page(pgt)); + list_del(&ptdesc->pt_list); + page_table_free_pgste(ptdesc); } } @@ -2058,19 +2060,19 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, { unsigned long raddr, origin; unsigned long *table; - struct page *page; + struct ptdesc *ptdesc; phys_addr_t s_pgt; int rc; BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); /* Allocate a shadow page table */ - page = page_table_alloc_pgste(sg->mm); - if (!page) + ptdesc = page_table_alloc_pgste(sg->mm); + if (!ptdesc) return -ENOMEM; - page->index = pgt & _SEGMENT_ENTRY_ORIGIN; + ptdesc->pt_index = pgt & _SEGMENT_ENTRY_ORIGIN; if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; - s_pgt = page_to_phys(page); + ptdesc->pt_index |= GMAP_SHADOW_FAKE_TABLE; + s_pgt = page_to_phys(ptdesc_page(ptdesc)); /* Install shadow page table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ @@ -2088,7 +2090,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, /* mark as invalid as long as the parent table is not protected */ *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; - list_add(&page->lru, &sg->pt_list); + list_add(&ptdesc->pt_list, &sg->pt_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_SEGMENT_ENTRY_INVALID; @@ -2114,7 +2116,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, return rc; out_free: spin_unlock(&sg->guest_table_lock); - page_table_free_pgste(page); + page_table_free_pgste(ptdesc); return rc; } diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 008e487c94a6..abb629d7e131 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -135,7 +135,7 @@ err_p4d: #ifdef CONFIG_PGSTE -struct page *page_table_alloc_pgste(struct mm_struct *mm) +struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm) { struct ptdesc *ptdesc; u64 *table; @@ -147,12 +147,12 @@ struct page *page_table_alloc_pgste(struct mm_struct *mm) memset64(table, _PAGE_INVALID, PTRS_PER_PTE); memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } - return ptdesc_page(ptdesc); + return ptdesc; } -void page_table_free_pgste(struct page *page) +void page_table_free_pgste(struct ptdesc *ptdesc) { - pagetable_free(page_ptdesc(page)); + pagetable_free(ptdesc); } #endif /* CONFIG_PGSTE */ From 5aa598a72eafcf05239519646ec88638c8894dba Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 2 Mar 2024 14:43:12 +0800 Subject: [PATCH 406/435] mm: memory: fix shift-out-of-bounds in fault_around_bytes_set The rounddown_pow_of_two(0) is undefined, so val = 0 is not allowed in the fault_around_bytes_set(), and leads to shift-out-of-bounds, UBSAN: shift-out-of-bounds in include/linux/log2.h:67:13 shift exponent 4294967295 is too large for 64-bit type 'long unsigned int' CPU: 7 PID: 107 Comm: sh Not tainted 6.8.0-rc6-next-20240301 #294 Hardware name: QEMU QEMU Virtual Machine, BIOS 0.0.0 02/06/2015 Call trace: dump_backtrace+0x94/0xec show_stack+0x18/0x24 dump_stack_lvl+0x78/0x90 dump_stack+0x18/0x24 ubsan_epilogue+0x10/0x44 __ubsan_handle_shift_out_of_bounds+0x98/0x134 fault_around_bytes_set+0xa4/0xb0 simple_attr_write_xsigned.isra.0+0xe4/0x1ac simple_attr_write+0x18/0x24 debugfs_attr_write+0x4c/0x98 vfs_write+0xd0/0x4b0 ksys_write+0x6c/0xfc __arm64_sys_write+0x1c/0x28 invoke_syscall+0x44/0x104 el0_svc_common.constprop.0+0x40/0xe0 do_el0_svc+0x1c/0x28 el0_svc+0x34/0xdc el0t_64_sync_handler+0xc0/0xc4 el0t_64_sync+0x190/0x194 ---[ end trace ]--- Fix it by setting the minimum val to PAGE_SIZE. Link: https://lkml.kernel.org/r/20240302064312.2358924-1-wangkefeng.wang@huawei.com Fixes: 53d36a56d8c4 ("mm: prefer fault_around_pages to fault_around_bytes") Signed-off-by: Kefeng Wang Reported-by: Yue Sun Closes: https://lore.kernel.org/all/CAEkJfYPim6DQqW1GqCiHLdh2-eweqk1fGyXqs3JM+8e1qGge8w@mail.gmail.com/ Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- mm/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index abd4f33d62c9..e17669d4f72f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4776,7 +4776,8 @@ static int fault_around_bytes_set(void *data, u64 val) * The minimum value is 1 page, however this results in no fault-around * at all. See should_fault_around(). */ - fault_around_pages = max(rounddown_pow_of_two(val) >> PAGE_SHIFT, 1UL); + val = max(val, PAGE_SIZE); + fault_around_pages = rounddown_pow_of_two(val) >> PAGE_SHIFT; return 0; } From e35606e4167dd55d48aa6232c074137577818add Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 5 Mar 2024 07:53:45 +0000 Subject: [PATCH 407/435] mm/zswap: global lru and shrinker shared by all zswap_pools fix Commit bf9b7df23cb3 ("mm/zswap: global lru and shrinker shared by all zswap_pools") introduced a new lock to protect zswap_next_shrink, instead of reusing zswap_pools_lock. But the problem is that it's initialized only when zswap enabled, which causes bug if zswap_memcg_offline_cleanup() called without zswap enabled. Fix it by using DEFINE_SPINLOCK() to statically initialize them and define them as multiple static variables to keep in consistent with the existing global variables in zswap. Link: https://lkml.kernel.org/r/20240305075345.1493214-1-chengming.zhou@linux.dev Fixes: bf9b7df23cb3 ("mm/zswap: global lru and shrinker shared by all zswap_pools") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202403051008.a8cf8a94-lkp@intel.com Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 77 +++++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index da90933c6d20..9a3237752082 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -180,15 +180,16 @@ struct zswap_pool { char tfm_name[CRYPTO_MAX_ALG_NAME]; }; -static struct { - struct list_lru list_lru; - atomic_t nr_stored; - struct shrinker *shrinker; - struct work_struct shrink_work; - struct mem_cgroup *next_shrink; - /* The lock protects next_shrink. */ - spinlock_t shrink_lock; -} zswap; +/* Global LRU lists shared by all zswap pools. */ +static struct list_lru zswap_list_lru; +/* counter of pages stored in all zswap pools. */ +static atomic_t zswap_nr_stored = ATOMIC_INIT(0); + +/* The lock protects zswap_next_shrink updates. */ +static DEFINE_SPINLOCK(zswap_shrink_lock); +static struct mem_cgroup *zswap_next_shrink; +static struct work_struct zswap_shrink_work; +static struct shrinker *zswap_shrinker; /* * struct zswap_entry @@ -798,10 +799,10 @@ void zswap_folio_swapin(struct folio *folio) void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) { /* lock out zswap shrinker walking memcg tree */ - spin_lock(&zswap.shrink_lock); - if (zswap.next_shrink == memcg) - zswap.next_shrink = mem_cgroup_iter(NULL, zswap.next_shrink, NULL); - spin_unlock(&zswap.shrink_lock); + spin_lock(&zswap_shrink_lock); + if (zswap_next_shrink == memcg) + zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + spin_unlock(&zswap_shrink_lock); } /********************************* @@ -900,9 +901,9 @@ static void zswap_entry_free(struct zswap_entry *entry) if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { - zswap_lru_del(&zswap.list_lru, entry); + zswap_lru_del(&zswap_list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); - atomic_dec(&zswap.nr_stored); + atomic_dec(&zswap_nr_stored); zswap_pool_put(entry->pool); } if (entry->objcg) { @@ -1274,7 +1275,7 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, nr_protected = atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - lru_size = list_lru_shrink_count(&zswap.list_lru, sc); + lru_size = list_lru_shrink_count(&zswap_list_lru, sc); /* * Abort if we are shrinking into the protected region. @@ -1291,7 +1292,7 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, return SHRINK_STOP; } - shrink_ret = list_lru_shrink_walk(&zswap.list_lru, sc, &shrink_memcg_cb, + shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb, &encountered_page_in_swapcache); if (encountered_page_in_swapcache) @@ -1317,7 +1318,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, #else /* use pool stats instead of memcg stats */ nr_backing = zswap_pool_total_size >> PAGE_SHIFT; - nr_stored = atomic_read(&zswap.nr_stored); + nr_stored = atomic_read(&zswap_nr_stored); #endif if (!nr_stored) @@ -1325,7 +1326,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, nr_protected = atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - nr_freeable = list_lru_shrink_count(&zswap.list_lru, sc); + nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc); /* * Subtract the lru size by an estimate of the number of pages * that should be protected. @@ -1374,7 +1375,7 @@ static int shrink_memcg(struct mem_cgroup *memcg) for_each_node_state(nid, N_NORMAL_MEMORY) { unsigned long nr_to_walk = 1; - shrunk += list_lru_walk_one(&zswap.list_lru, nid, memcg, + shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg, &shrink_memcg_cb, NULL, &nr_to_walk); } return shrunk ? 0 : -EAGAIN; @@ -1387,9 +1388,9 @@ static void shrink_worker(struct work_struct *w) /* global reclaim will select cgroup in a round-robin fashion. */ do { - spin_lock(&zswap.shrink_lock); - zswap.next_shrink = mem_cgroup_iter(NULL, zswap.next_shrink, NULL); - memcg = zswap.next_shrink; + spin_lock(&zswap_shrink_lock); + zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + memcg = zswap_next_shrink; /* * We need to retry if we have gone through a full round trip, or if we @@ -1403,7 +1404,7 @@ static void shrink_worker(struct work_struct *w) * memcg is not killed when we are reclaiming. */ if (!memcg) { - spin_unlock(&zswap.shrink_lock); + spin_unlock(&zswap_shrink_lock); if (++failures == MAX_RECLAIM_RETRIES) break; @@ -1413,15 +1414,15 @@ static void shrink_worker(struct work_struct *w) if (!mem_cgroup_tryget_online(memcg)) { /* drop the reference from mem_cgroup_iter() */ mem_cgroup_iter_break(NULL, memcg); - zswap.next_shrink = NULL; - spin_unlock(&zswap.shrink_lock); + zswap_next_shrink = NULL; + spin_unlock(&zswap_shrink_lock); if (++failures == MAX_RECLAIM_RETRIES) break; goto resched; } - spin_unlock(&zswap.shrink_lock); + spin_unlock(&zswap_shrink_lock); ret = shrink_memcg(memcg); /* drop the extra reference */ @@ -1542,7 +1543,7 @@ bool zswap_store(struct folio *folio) if (objcg) { memcg = get_mem_cgroup_from_objcg(objcg); - if (memcg_list_lru_alloc(memcg, &zswap.list_lru, GFP_KERNEL)) { + if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) { mem_cgroup_put(memcg); goto put_pool; } @@ -1573,8 +1574,8 @@ insert_entry: } if (entry->length) { INIT_LIST_HEAD(&entry->lru); - zswap_lru_add(&zswap.list_lru, entry); - atomic_inc(&zswap.nr_stored); + zswap_lru_add(&zswap_list_lru, entry); + atomic_inc(&zswap_nr_stored); } spin_unlock(&tree->lock); @@ -1606,7 +1607,7 @@ check_old: return false; shrink: - queue_work(shrink_wq, &zswap.shrink_work); + queue_work(shrink_wq, &zswap_shrink_work); goto reject; } @@ -1773,16 +1774,14 @@ static int zswap_setup(void) if (!shrink_wq) goto shrink_wq_fail; - zswap.shrinker = zswap_alloc_shrinker(); - if (!zswap.shrinker) + zswap_shrinker = zswap_alloc_shrinker(); + if (!zswap_shrinker) goto shrinker_fail; - if (list_lru_init_memcg(&zswap.list_lru, zswap.shrinker)) + if (list_lru_init_memcg(&zswap_list_lru, zswap_shrinker)) goto lru_fail; - shrinker_register(zswap.shrinker); + shrinker_register(zswap_shrinker); - INIT_WORK(&zswap.shrink_work, shrink_worker); - atomic_set(&zswap.nr_stored, 0); - spin_lock_init(&zswap.shrink_lock); + INIT_WORK(&zswap_shrink_work, shrink_worker); pool = __zswap_pool_create_fallback(); if (pool) { @@ -1801,7 +1800,7 @@ static int zswap_setup(void) return 0; lru_fail: - shrinker_free(zswap.shrinker); + shrinker_free(zswap_shrinker); shrinker_fail: destroy_workqueue(shrink_wq); shrink_wq_fail: From a2aa530d856dc725ae1073feebdc1a23c82ee2eb Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 5 Mar 2024 12:37:41 +0800 Subject: [PATCH 408/435] mm/powerpc: define pXd_large() with pXd_leaf() Patch series "mm/treewide: Replace pXd_large() with pXd_leaf()", v3. These two APIs are mostly always the same. It's confusing to have both of them. Merge them into one. Here I used pXd_leaf() only because pXd_leaf() is a global API which is always defined, while pXd_large() is not. We have yet one more API that is similar which is pXd_huge(), but that's even trickier, so let's do it step by step. Some special cares are taken for ppc and x86, they're done as separate cleanups first. This patch (of 10): The two definitions are the same. The only difference is that pXd_large() is only defined with THP selected, and only on book3s 64bits. Instead of implementing it twice, make pXd_large() a macro to pXd_leaf(). Define it unconditionally just like pXd_leaf(). This helps to prepare merging the two APIs. Link: https://lkml.kernel.org/r/20240305043750.93762-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20240305043750.93762-2-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Jason Gunthorpe Reviewed-by: Mike Rapoport (IBM) Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: "Aneesh Kumar K.V" Cc: "Naveen N. Rao" Cc: Kirill A. Shutemov Cc: Muchun Song Cc: Yang Shi Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/64/pgtable.h | 16 ++-------------- arch/powerpc/include/asm/pgtable.h | 2 +- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 927d585652bc..d1318e8582ac 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1157,20 +1157,6 @@ pud_hugepage_update(struct mm_struct *mm, unsigned long addr, pud_t *pudp, return pud_val(*pudp); } -/* - * returns true for pmd migration entries, THP, devmap, hugetlb - * But compile time dependent on THP config - */ -static inline int pmd_large(pmd_t pmd) -{ - return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE)); -} - -static inline int pud_large(pud_t pud) -{ - return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE)); -} - /* * For radix we should always find H_PAGE_HASHPTE zero. Hence * the below will work for radix too @@ -1455,6 +1441,7 @@ static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_va */ #define pmd_is_leaf pmd_is_leaf #define pmd_leaf pmd_is_leaf +#define pmd_large pmd_leaf static inline bool pmd_is_leaf(pmd_t pmd) { return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE)); @@ -1462,6 +1449,7 @@ static inline bool pmd_is_leaf(pmd_t pmd) #define pud_is_leaf pud_is_leaf #define pud_leaf pud_is_leaf +#define pud_large pud_leaf static inline bool pud_is_leaf(pud_t pud) { return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE)); diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 7a1ba8889aea..5928b3c1458d 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -101,7 +101,7 @@ void poking_init(void); extern unsigned long ioremap_bot; extern const pgprot_t protection_map[16]; -#ifndef CONFIG_TRANSPARENT_HUGEPAGE +#ifndef pmd_large #define pmd_large(pmd) 0 #endif From bd18b688220c7225fb50498dabd9f9d0c9988e67 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 5 Mar 2024 12:37:42 +0800 Subject: [PATCH 409/435] mm/powerpc: replace pXd_is_leaf() with pXd_leaf() They're the same macros underneath. Drop pXd_is_leaf(), instead always use pXd_leaf(). At the meantime, instead of renames, drop the pXd_is_leaf() fallback definitions directly in arch/powerpc/include/asm/pgtable.h. because similar fallback macros for pXd_leaf() are already defined in include/linux/pgtable.h. Link: https://lkml.kernel.org/r/20240305043750.93762-3-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Christophe Leroy Reviewed-by: Jason Gunthorpe Reviewed-by: Christophe Leroy Reviewed-by: Mike Rapoport (IBM) Cc: Michael Ellerman Cc: Nicholas Piggin Cc: "Aneesh Kumar K.V" Cc: "Naveen N. Rao" Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Kirill A. Shutemov Cc: Muchun Song Cc: Thomas Gleixner Cc: Vincenzo Frascino Cc: Yang Shi Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/64/pgtable.h | 10 ++++---- arch/powerpc/include/asm/pgtable.h | 24 -------------------- arch/powerpc/kvm/book3s_64_mmu_radix.c | 12 +++++----- arch/powerpc/mm/book3s64/radix_pgtable.c | 14 ++++++------ arch/powerpc/mm/pgtable.c | 6 ++--- arch/powerpc/mm/pgtable_64.c | 6 ++--- arch/powerpc/xmon/xmon.c | 6 ++--- 7 files changed, 26 insertions(+), 52 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index d1318e8582ac..3e99e409774a 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1439,18 +1439,16 @@ static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_va /* * Like pmd_huge() and pmd_large(), but works regardless of config options */ -#define pmd_is_leaf pmd_is_leaf -#define pmd_leaf pmd_is_leaf +#define pmd_leaf pmd_leaf #define pmd_large pmd_leaf -static inline bool pmd_is_leaf(pmd_t pmd) +static inline bool pmd_leaf(pmd_t pmd) { return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE)); } -#define pud_is_leaf pud_is_leaf -#define pud_leaf pud_is_leaf +#define pud_leaf pud_leaf #define pud_large pud_leaf -static inline bool pud_is_leaf(pud_t pud) +static inline bool pud_leaf(pud_t pud) { return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE)); } diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 5928b3c1458d..e6edf1cdbc5b 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -182,30 +182,6 @@ static inline void pte_frag_set(mm_context_t *ctx, void *p) } #endif -#ifndef pmd_is_leaf -#define pmd_is_leaf pmd_is_leaf -static inline bool pmd_is_leaf(pmd_t pmd) -{ - return false; -} -#endif - -#ifndef pud_is_leaf -#define pud_is_leaf pud_is_leaf -static inline bool pud_is_leaf(pud_t pud) -{ - return false; -} -#endif - -#ifndef p4d_is_leaf -#define p4d_is_leaf p4d_is_leaf -static inline bool p4d_is_leaf(p4d_t p4d) -{ - return false; -} -#endif - #define pmd_pgtable pmd_pgtable static inline pgtable_t pmd_pgtable(pmd_t pmd) { diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 4a1abb9f7c05..408d98f8a514 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -503,7 +503,7 @@ static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full, for (im = 0; im < PTRS_PER_PMD; ++im, ++p) { if (!pmd_present(*p)) continue; - if (pmd_is_leaf(*p)) { + if (pmd_leaf(*p)) { if (full) { pmd_clear(p); } else { @@ -532,7 +532,7 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud, for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) { if (!pud_present(*p)) continue; - if (pud_is_leaf(*p)) { + if (pud_leaf(*p)) { pud_clear(p); } else { pmd_t *pmd; @@ -635,12 +635,12 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, new_pud = pud_alloc_one(kvm->mm, gpa); pmd = NULL; - if (pud && pud_present(*pud) && !pud_is_leaf(*pud)) + if (pud && pud_present(*pud) && !pud_leaf(*pud)) pmd = pmd_offset(pud, gpa); else if (level <= 1) new_pmd = kvmppc_pmd_alloc(); - if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd))) + if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_leaf(*pmd))) new_ptep = kvmppc_pte_alloc(); /* Check if we might have been invalidated; let the guest retry if so */ @@ -658,7 +658,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, new_pud = NULL; } pud = pud_offset(p4d, gpa); - if (pud_is_leaf(*pud)) { + if (pud_leaf(*pud)) { unsigned long hgpa = gpa & PUD_MASK; /* Check if we raced and someone else has set the same thing */ @@ -709,7 +709,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, new_pmd = NULL; } pmd = pmd_offset(pud, gpa); - if (pmd_is_leaf(*pmd)) { + if (pmd_leaf(*pmd)) { unsigned long lgpa = gpa & PMD_MASK; /* Check if we raced and someone else has set the same thing */ diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index c6a4ac766b2b..1f8db10693e3 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -204,14 +204,14 @@ static void radix__change_memory_range(unsigned long start, unsigned long end, pudp = pud_alloc(&init_mm, p4dp, idx); if (!pudp) continue; - if (pud_is_leaf(*pudp)) { + if (pud_leaf(*pudp)) { ptep = (pte_t *)pudp; goto update_the_pte; } pmdp = pmd_alloc(&init_mm, pudp, idx); if (!pmdp) continue; - if (pmd_is_leaf(*pmdp)) { + if (pmd_leaf(*pmdp)) { ptep = pmdp_ptep(pmdp); goto update_the_pte; } @@ -767,7 +767,7 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, if (!pmd_present(*pmd)) continue; - if (pmd_is_leaf(*pmd)) { + if (pmd_leaf(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE)) { if (!direct) @@ -807,7 +807,7 @@ static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, if (!pud_present(*pud)) continue; - if (pud_is_leaf(*pud)) { + if (pud_leaf(*pud)) { if (!IS_ALIGNED(addr, PUD_SIZE) || !IS_ALIGNED(next, PUD_SIZE)) { WARN_ONCE(1, "%s: unaligned range\n", __func__); @@ -845,7 +845,7 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct, if (!p4d_present(*p4d)) continue; - if (p4d_is_leaf(*p4d)) { + if (p4d_leaf(*p4d)) { if (!IS_ALIGNED(addr, P4D_SIZE) || !IS_ALIGNED(next, P4D_SIZE)) { WARN_ONCE(1, "%s: unaligned range\n", __func__); @@ -1554,7 +1554,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) int pud_clear_huge(pud_t *pud) { - if (pud_is_leaf(*pud)) { + if (pud_leaf(*pud)) { pud_clear(pud); return 1; } @@ -1601,7 +1601,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) int pmd_clear_huge(pmd_t *pmd) { - if (pmd_is_leaf(*pmd)) { + if (pmd_leaf(*pmd)) { pmd_clear(pmd); return 1; } diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 549a440ed7f6..9e7ba9c3851f 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -410,7 +410,7 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, if (p4d_none(p4d)) return NULL; - if (p4d_is_leaf(p4d)) { + if (p4d_leaf(p4d)) { ret_pte = (pte_t *)p4dp; goto out; } @@ -432,7 +432,7 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, if (pud_none(pud)) return NULL; - if (pud_is_leaf(pud)) { + if (pud_leaf(pud)) { ret_pte = (pte_t *)pudp; goto out; } @@ -471,7 +471,7 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, goto out; } - if (pmd_is_leaf(pmd)) { + if (pmd_leaf(pmd)) { ret_pte = (pte_t *)pmdp; goto out; } diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 1b366526f4f2..386c6b06eab7 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -100,7 +100,7 @@ EXPORT_SYMBOL(__pte_frag_size_shift); /* 4 level page table */ struct page *p4d_page(p4d_t p4d) { - if (p4d_is_leaf(p4d)) { + if (p4d_leaf(p4d)) { if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) VM_WARN_ON(!p4d_huge(p4d)); return pte_page(p4d_pte(p4d)); @@ -111,7 +111,7 @@ struct page *p4d_page(p4d_t p4d) struct page *pud_page(pud_t pud) { - if (pud_is_leaf(pud)) { + if (pud_leaf(pud)) { if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) VM_WARN_ON(!pud_huge(pud)); return pte_page(pud_pte(pud)); @@ -125,7 +125,7 @@ struct page *pud_page(pud_t pud) */ struct page *pmd_page(pmd_t pmd) { - if (pmd_is_leaf(pmd)) { + if (pmd_leaf(pmd)) { /* * vmalloc_to_page may be called on any vmap address (not only * vmalloc), and it uses pmd_page() etc., when huge vmap is diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index b3b94cd37713..9669c9925225 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -3342,7 +3342,7 @@ static void show_pte(unsigned long addr) return; } - if (p4d_is_leaf(*p4dp)) { + if (p4d_leaf(*p4dp)) { format_pte(p4dp, p4d_val(*p4dp)); return; } @@ -3356,7 +3356,7 @@ static void show_pte(unsigned long addr) return; } - if (pud_is_leaf(*pudp)) { + if (pud_leaf(*pudp)) { format_pte(pudp, pud_val(*pudp)); return; } @@ -3370,7 +3370,7 @@ static void show_pte(unsigned long addr) return; } - if (pmd_is_leaf(*pmdp)) { + if (pmd_leaf(*pmdp)) { format_pte(pmdp, pmd_val(*pmdp)); return; } From dba8e6f34f07674e7b6e33d74b4ff35c82abb13a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 5 Mar 2024 12:37:43 +0800 Subject: [PATCH 410/435] mm/x86: replace p4d_large() with p4d_leaf() p4d_large() is always defined as p4d_leaf(). Merge their usages. Chose p4d_leaf() because p4d_leaf() is a global API, while p4d_large() is not. Only x86 has p4d_leaf() defined as of now. So it also means after this patch we removed all p4d_large() usages. Link: https://lkml.kernel.org/r/20240305043750.93762-4-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Jason Gunthorpe Reviewed-by: Mike Rapoport (IBM) Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: "Aneesh Kumar K.V" Cc: Christophe Leroy Cc: Dmitry Vyukov Cc: Kirill A. Shutemov Cc: Michael Ellerman Cc: Muchun Song Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Vincenzo Frascino Cc: Yang Shi Signed-off-by: Andrew Morton --- arch/x86/mm/fault.c | 4 ++-- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/pat/set_memory.c | 4 ++-- arch/x86/mm/pti.c | 2 +- arch/x86/power/hibernate.c | 2 +- arch/x86/xen/mmu_pv.c | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 679b09cfe241..8b69ce3f4115 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -368,7 +368,7 @@ static void dump_pagetable(unsigned long address) goto bad; pr_cont("P4D %lx ", p4d_val(*p4d)); - if (!p4d_present(*p4d) || p4d_large(*p4d)) + if (!p4d_present(*p4d) || p4d_leaf(*p4d)) goto out; pud = pud_offset(p4d, address); @@ -1039,7 +1039,7 @@ spurious_kernel_fault(unsigned long error_code, unsigned long address) if (!p4d_present(*p4d)) return 0; - if (p4d_large(*p4d)) + if (p4d_leaf(*p4d)) return spurious_kernel_fault_check(error_code, (pte_t *) p4d); pud = pud_offset(p4d, address); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ebdbcae48011..d691e7992a9a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1197,7 +1197,7 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end, if (!p4d_present(*p4d)) continue; - BUILD_BUG_ON(p4d_large(*p4d)); + BUILD_BUG_ON(p4d_leaf(*p4d)); pud_base = pud_offset(p4d, 0); remove_pud_table(pud_base, addr, next, altmap, direct); diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index e9b448d1b1b7..5359a9c88099 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -676,7 +676,7 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, return NULL; *level = PG_LEVEL_512G; - if (p4d_large(*p4d) || !p4d_present(*p4d)) + if (p4d_leaf(*p4d) || !p4d_present(*p4d)) return (pte_t *)p4d; pud = pud_offset(p4d, address); @@ -739,7 +739,7 @@ pmd_t *lookup_pmd_address(unsigned long address) return NULL; p4d = p4d_offset(pgd, address); - if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d)) + if (p4d_none(*p4d) || p4d_leaf(*p4d) || !p4d_present(*p4d)) return NULL; pud = pud_offset(p4d, address); diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 669ba1c345b3..dc0a81f5f60e 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -206,7 +206,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) if (!p4d) return NULL; - BUILD_BUG_ON(p4d_large(*p4d) != 0); + BUILD_BUG_ON(p4d_leaf(*p4d) != 0); if (p4d_none(*p4d)) { unsigned long new_pud_page = __get_free_page(gfp); if (WARN_ON_ONCE(!new_pud_page)) diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index 6f955eb1e163..28153789f873 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -165,7 +165,7 @@ int relocate_restore_code(void) pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(relocated_restore_code); p4d = p4d_offset(pgd, relocated_restore_code); - if (p4d_large(*p4d)) { + if (p4d_leaf(*p4d)) { set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); goto out; } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index e21974f2cf2d..12a43a4abebf 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1104,7 +1104,7 @@ static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin) pud_t *pud_tbl; int i; - if (p4d_large(*p4d)) { + if (p4d_leaf(*p4d)) { pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK; xen_free_ro_pages(pa, P4D_SIZE); return; From 83ea65da325c643bd1fea078fb81c5415ed7a83a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 5 Mar 2024 12:37:44 +0800 Subject: [PATCH 411/435] mm/x86: replace pgd_large() with pgd_leaf() pgd_leaf() is a global API while pgd_large() is not. Always use the global pgd_leaf(), then drop pgd_large(). Link: https://lkml.kernel.org/r/20240305043750.93762-5-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Jason Gunthorpe Reviewed-by: Mike Rapoport (IBM) Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: "Aneesh Kumar K.V" Cc: Christophe Leroy Cc: Dmitry Vyukov Cc: Kirill A. Shutemov Cc: Michael Ellerman Cc: Muchun Song Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Vincenzo Frascino Cc: Yang Shi Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 4 ++-- arch/x86/mm/pti.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 69ed0ea0641b..d6e993a5659f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1418,8 +1418,8 @@ static inline bool pgdp_maps_userspace(void *__ptr) return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START); } -#define pgd_leaf pgd_large -static inline int pgd_large(pgd_t pgd) { return 0; } +#define pgd_leaf pgd_leaf +static inline int pgd_leaf(pgd_t pgd) { return 0; } #ifdef CONFIG_PAGE_TABLE_ISOLATION /* diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index dc0a81f5f60e..c17aab24c1b3 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -185,7 +185,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); } - BUILD_BUG_ON(pgd_large(*pgd) != 0); + BUILD_BUG_ON(pgd_leaf(*pgd) != 0); return p4d_offset(pgd, address); } From 924bd6a8c96767a05323d575bdefd664631dce73 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 5 Mar 2024 12:37:45 +0800 Subject: [PATCH 412/435] mm/x86: drop two unnecessary pud_leaf() definitions pud_leaf() has a fallback macro defined in include/linux/pgtable.h already. Drop the extra two for x86. Link: https://lkml.kernel.org/r/20240305043750.93762-6-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Jason Gunthorpe Acked-by: Thomas Gleixner Reviewed-by: Mike Rapoport (IBM) Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: "Aneesh Kumar K.V" Cc: Christophe Leroy Cc: Dmitry Vyukov Cc: Kirill A. Shutemov Cc: Michael Ellerman Cc: Muchun Song Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Vincenzo Frascino Cc: Yang Shi Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 1 - include/asm-generic/pgtable-nopmd.h | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index d6e993a5659f..9db7a38a0e9f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1097,7 +1097,6 @@ static inline int pud_bad(pud_t pud) return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; } #else -#define pud_leaf pud_large static inline int pud_large(pud_t pud) { return 0; diff --git a/include/asm-generic/pgtable-nopmd.h b/include/asm-generic/pgtable-nopmd.h index 8ffd64e7a24c..fa27e16bbe1b 100644 --- a/include/asm-generic/pgtable-nopmd.h +++ b/include/asm-generic/pgtable-nopmd.h @@ -31,7 +31,6 @@ static inline int pud_none(pud_t pud) { return 0; } static inline int pud_bad(pud_t pud) { return 0; } static inline int pud_present(pud_t pud) { return 1; } static inline int pud_user(pud_t pud) { return 0; } -static inline int pud_leaf(pud_t pud) { return 0; } static inline void pud_clear(pud_t *pud) { } #define pmd_ERROR(pmd) (pud_ERROR((pmd).pud)) From b6c9d5a93b4ce7ff98281d712a809d73462ff615 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 5 Mar 2024 12:37:46 +0800 Subject: [PATCH 413/435] mm/kasan: use pXd_leaf() in shadow_mapped() There is an old trick in shadow_mapped() to use pXd_bad() to detect huge pages. After commit 93fab1b22ef7 ("mm: add generic p?d_leaf() macros") we have a global API for huge mappings. Use that to replace the trick. Link: https://lkml.kernel.org/r/20240305043750.93762-7-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Rapoport (IBM) Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Vincenzo Frascino Cc: "Aneesh Kumar K.V" Cc: Borislav Petkov Cc: Christophe Leroy Cc: Dave Hansen Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Michael Ellerman Cc: Muchun Song Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Thomas Gleixner Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/kasan/shadow.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 9ef84f31833f..d6210ca48dda 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -199,19 +199,12 @@ static bool shadow_mapped(unsigned long addr) pud = pud_offset(p4d, addr); if (pud_none(*pud)) return false; - - /* - * We can't use pud_large() or pud_huge(), the first one is - * arch-specific, the last one depends on HUGETLB_PAGE. So let's abuse - * pud_bad(), if pud is bad then it's bad because it's huge. - */ - if (pud_bad(*pud)) + if (pud_leaf(*pud)) return true; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return false; - - if (pmd_bad(*pmd)) + if (pmd_leaf(*pmd)) return true; pte = pte_offset_kernel(pmd, addr); return !pte_none(ptep_get(pte)); From 2f709f7bfd3d13f8878070a79e0983b5fac2225f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 5 Mar 2024 12:37:47 +0800 Subject: [PATCH 414/435] mm/treewide: replace pmd_large() with pmd_leaf() pmd_large() is always defined as pmd_leaf(). Merge their usages. Chose pmd_leaf() because pmd_leaf() is a global API, while pmd_large() is not. Link: https://lkml.kernel.org/r/20240305043750.93762-8-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Jason Gunthorpe Reviewed-by: Mike Rapoport (IBM) Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: "Aneesh Kumar K.V" Cc: Borislav Petkov Cc: Christophe Leroy Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Kirill A. Shutemov Cc: Michael Ellerman Cc: Muchun Song Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Thomas Gleixner Cc: Vincenzo Frascino Cc: Yang Shi Signed-off-by: Andrew Morton --- arch/arm/mm/dump.c | 4 ++-- arch/powerpc/mm/book3s64/pgtable.c | 2 +- arch/powerpc/mm/book3s64/radix_pgtable.c | 2 +- arch/powerpc/mm/pgtable_64.c | 2 +- arch/s390/boot/vmem.c | 2 +- arch/s390/include/asm/pgtable.h | 8 ++++---- arch/s390/mm/gmap.c | 12 ++++++------ arch/s390/mm/hugetlbpage.c | 2 +- arch/s390/mm/pageattr.c | 2 +- arch/s390/mm/pgtable.c | 6 +++--- arch/s390/mm/vmem.c | 6 +++--- arch/sparc/mm/init_64.c | 4 ++-- arch/x86/boot/compressed/ident_map_64.c | 2 +- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/mm/fault.c | 8 ++++---- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 8 ++++---- arch/x86/mm/kasan_init_64.c | 2 +- arch/x86/mm/mem_encrypt_identity.c | 4 ++-- arch/x86/mm/pat/set_memory.c | 4 ++-- arch/x86/mm/pgtable.c | 2 +- arch/x86/mm/pti.c | 4 ++-- arch/x86/power/hibernate.c | 2 +- arch/x86/xen/mmu_pv.c | 4 ++-- drivers/misc/sgi-gru/grufault.c | 2 +- 25 files changed, 49 insertions(+), 49 deletions(-) diff --git a/arch/arm/mm/dump.c b/arch/arm/mm/dump.c index a9381095ab36..cd032522d902 100644 --- a/arch/arm/mm/dump.c +++ b/arch/arm/mm/dump.c @@ -349,12 +349,12 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) for (i = 0; i < PTRS_PER_PMD; i++, pmd++) { addr = start + i * PMD_SIZE; domain = get_domain_name(pmd); - if (pmd_none(*pmd) || pmd_large(*pmd) || !pmd_present(*pmd)) + if (pmd_none(*pmd) || pmd_leaf(*pmd) || !pmd_present(*pmd)) note_page(st, addr, 4, pmd_val(*pmd), domain); else walk_pte(st, pmd, addr, domain); - if (SECTION_SIZE < PMD_SIZE && pmd_large(pmd[1])) { + if (SECTION_SIZE < PMD_SIZE && pmd_leaf(pmd[1])) { addr += SECTION_SIZE; pmd++; domain = get_domain_name(pmd); diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 3438ab72c346..45f526547b27 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -113,7 +113,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); assert_spin_locked(pmd_lockptr(mm, pmdp)); - WARN_ON(!(pmd_large(pmd))); + WARN_ON(!(pmd_leaf(pmd))); #endif trace_hugepage_set_pmd(addr, pmd_val(pmd)); return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 1f8db10693e3..5cc4008329be 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -924,7 +924,7 @@ bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, unsigned long addr, unsigned long next) { - int large = pmd_large(*pmdp); + int large = pmd_leaf(*pmdp); if (large) vmemmap_verify(pmdp_ptep(pmdp), node, addr, next); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 386c6b06eab7..9b99113cb51a 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -132,7 +132,7 @@ struct page *pmd_page(pmd_t pmd) * enabled so these checks can't be used. */ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) - VM_WARN_ON(!(pmd_large(pmd) || pmd_huge(pmd))); + VM_WARN_ON(!(pmd_leaf(pmd) || pmd_huge(pmd))); return pte_page(pmd_pte(pmd)); } return virt_to_page(pmd_page_vaddr(pmd)); diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index e3a4500a5a75..348ab02b1028 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -333,7 +333,7 @@ static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long e } pte = boot_pte_alloc(); pmd_populate(&init_mm, pmd, pte); - } else if (pmd_large(*pmd)) { + } else if (pmd_leaf(*pmd)) { continue; } pgtable_pte_populate(pmd, addr, next, mode); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 4b91e65c85d9..431d03d5116b 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -721,7 +721,7 @@ static inline int pmd_large(pmd_t pmd) static inline int pmd_bad(pmd_t pmd) { - if ((pmd_val(pmd) & _SEGMENT_ENTRY_TYPE_MASK) > 0 || pmd_large(pmd)) + if ((pmd_val(pmd) & _SEGMENT_ENTRY_TYPE_MASK) > 0 || pmd_leaf(pmd)) return 1; return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0; } @@ -820,8 +820,8 @@ static inline int pte_protnone(pte_t pte) static inline int pmd_protnone(pmd_t pmd) { - /* pmd_large(pmd) implies pmd_present(pmd) */ - return pmd_large(pmd) && !(pmd_val(pmd) & _SEGMENT_ENTRY_READ); + /* pmd_leaf(pmd) implies pmd_present(pmd) */ + return pmd_leaf(pmd) && !(pmd_val(pmd) & _SEGMENT_ENTRY_READ); } #endif @@ -1385,7 +1385,7 @@ static inline unsigned long pmd_deref(pmd_t pmd) unsigned long origin_mask; origin_mask = _SEGMENT_ENTRY_ORIGIN; - if (pmd_large(pmd)) + if (pmd_leaf(pmd)) origin_mask = _SEGMENT_ENTRY_ORIGIN_LARGE; return (unsigned long)__va(pmd_val(pmd) & origin_mask); } diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index e43a5a3befd4..767c6f077b53 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -603,7 +603,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) pmd = pmd_offset(pud, vmaddr); VM_BUG_ON(pmd_none(*pmd)); /* Are we allowed to use huge pages? */ - if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) + if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) return -EFAULT; /* Link gmap segment table entry location to page table. */ rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); @@ -615,7 +615,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) rc = radix_tree_insert(&gmap->host_to_guest, vmaddr >> PMD_SHIFT, table); if (!rc) { - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { *table = (pmd_val(*pmd) & _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) | _SEGMENT_ENTRY_GMAP_UC; @@ -945,7 +945,7 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) } /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */ - if (!pmd_large(*pmdp)) + if (!pmd_leaf(*pmdp)) spin_unlock(&gmap->guest_table_lock); return pmdp; } @@ -957,7 +957,7 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) */ static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) { - if (pmd_large(*pmdp)) + if (pmd_leaf(*pmdp)) spin_unlock(&gmap->guest_table_lock); } @@ -1068,7 +1068,7 @@ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, rc = -EAGAIN; pmdp = gmap_pmd_op_walk(gmap, gaddr); if (pmdp) { - if (!pmd_large(*pmdp)) { + if (!pmd_leaf(*pmdp)) { rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits); if (!rc) { @@ -2500,7 +2500,7 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], if (!pmdp) return; - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) bitmap_fill(bitmap, _PAGE_ENTRIES); } else { diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 297a6d897d5a..1ccb5b40fe92 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -235,7 +235,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, int pmd_huge(pmd_t pmd) { - return pmd_large(pmd); + return pmd_leaf(pmd); } int pud_huge(pud_t pud) diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 631e3a4ee2de..9f55d5a3210c 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -185,7 +185,7 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, if (pmd_none(*pmdp)) return -EINVAL; next = pmd_addr_end(addr, end); - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { need_split = !!(flags & SET_MEMORY_4K); need_split |= !!(addr & ~PMD_MASK); need_split |= !!(addr + PMD_SIZE > next); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index b71432b15d66..9ac66304d776 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -827,7 +827,7 @@ again: return key ? -EFAULT : 0; } - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { paddr = pmd_val(*pmdp) & HPAGE_MASK; paddr |= addr & ~HPAGE_MASK; /* @@ -938,7 +938,7 @@ again: return 0; } - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { paddr = pmd_val(*pmdp) & HPAGE_MASK; paddr |= addr & ~HPAGE_MASK; cc = page_reset_referenced(paddr); @@ -1002,7 +1002,7 @@ again: return 0; } - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { paddr = pmd_val(*pmdp) & HPAGE_MASK; paddr |= addr & ~HPAGE_MASK; *key = page_get_storage_key(paddr); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index eb100479f7be..afe5edf2a604 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -236,7 +236,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, if (!add) { if (pmd_none(*pmd)) continue; - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE)) { if (!direct) @@ -281,7 +281,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, if (!pte) goto out; pmd_populate(&init_mm, pmd, pte); - } else if (pmd_large(*pmd)) { + } else if (pmd_leaf(*pmd)) { if (!direct) vmemmap_use_sub_pmd(addr, next); continue; @@ -610,7 +610,7 @@ pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) if (!pte) goto out; pmd_populate(&init_mm, pmd, pte); - } else if (WARN_ON_ONCE(pmd_large(*pmd))) { + } else if (WARN_ON_ONCE(pmd_leaf(*pmd))) { goto out; } ptep = pte_offset_kernel(pmd, addr); diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index f83017992eaa..5e067b6a4464 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -1672,7 +1672,7 @@ bool kern_addr_valid(unsigned long addr) if (pmd_none(*pmd)) return false; - if (pmd_large(*pmd)) + if (pmd_leaf(*pmd)) return pfn_valid(pmd_pfn(*pmd)); pte = pte_offset_kernel(pmd, addr); @@ -2968,7 +2968,7 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, struct mm_struct *mm; pmd_t entry = *pmd; - if (!pmd_large(entry) || !pmd_young(entry)) + if (!pmd_leaf(entry) || !pmd_young(entry)) return; pte = pmd_val(entry); diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index d040080d7edb..71c6e2fdcec7 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -284,7 +284,7 @@ static int set_clr_page_flags(struct x86_mapping_info *info, pudp = pud_offset(p4dp, address); pmdp = pmd_offset(pudp, address); - if (pmd_large(*pmdp)) + if (pmd_leaf(*pmdp)) ptep = split_large_pmd(info, pmdp, address); else ptep = pte_offset_kernel(pmdp, address); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2d6cdeab1f8a..c15123248c52 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3135,7 +3135,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, if (pmd_none(pmd) || !pmd_present(pmd)) goto out; - if (pmd_large(pmd)) + if (pmd_leaf(pmd)) level = PG_LEVEL_2M; out: diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8b69ce3f4115..09417f950343 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -250,7 +250,7 @@ static noinline int vmalloc_fault(unsigned long address) if (!pmd_k) return -1; - if (pmd_large(*pmd_k)) + if (pmd_leaf(*pmd_k)) return 0; pte_k = pte_offset_kernel(pmd_k, address); @@ -319,7 +319,7 @@ static void dump_pagetable(unsigned long address) * And let's rather not kmap-atomic the pte, just in case * it's allocated already: */ - if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) + if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_leaf(*pmd)) goto out; pte = pte_offset_kernel(pmd, address); @@ -384,7 +384,7 @@ static void dump_pagetable(unsigned long address) goto bad; pr_cont("PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd) || pmd_large(*pmd)) + if (!pmd_present(*pmd) || pmd_leaf(*pmd)) goto out; pte = pte_offset_kernel(pmd, address); @@ -1053,7 +1053,7 @@ spurious_kernel_fault(unsigned long error_code, unsigned long address) if (!pmd_present(*pmd)) return 0; - if (pmd_large(*pmd)) + if (pmd_leaf(*pmd)) return spurious_kernel_fault_check(error_code, (pte_t *) pmd); pte = pte_offset_kernel(pmd, address); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5c736b707cae..ac41b1e0940d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -463,7 +463,7 @@ void __init native_pagetable_init(void) break; /* should not be large page here */ - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n", pfn, pmd, __pa(pmd)); BUG_ON(1); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d691e7992a9a..2c5490e58f41 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -530,7 +530,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, } if (!pmd_none(*pmd)) { - if (!pmd_large(*pmd)) { + if (!pmd_leaf(*pmd)) { spin_lock(&init_mm.page_table_lock); pte = (pte_t *)pmd_page_vaddr(*pmd); paddr_last = phys_pte_init(pte, paddr, @@ -1114,7 +1114,7 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, if (!pmd_present(*pmd)) continue; - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE)) { if (!direct) @@ -1520,9 +1520,9 @@ void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, int __meminit vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr, unsigned long next) { - int large = pmd_large(*pmd); + int large = pmd_leaf(*pmd); - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { vmemmap_verify((pte_t *)pmd, node, addr, next); vmemmap_use_sub_pmd(addr, next); } diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 0302491d799d..f41d26bc9161 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -95,7 +95,7 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (!pmd_large(*pmd)) + if (!pmd_leaf(*pmd)) kasan_populate_pmd(pmd, addr, next, nid); } while (pmd++, addr = next, addr != end); } diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index d73aeb16417f..bca4fea80579 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -161,7 +161,7 @@ static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) return; pmd = pmd_offset(pud, ppd->vaddr); - if (pmd_large(*pmd)) + if (pmd_leaf(*pmd)) return; set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags)); @@ -185,7 +185,7 @@ static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte))); } - if (pmd_large(*pmd)) + if (pmd_leaf(*pmd)) return; pte = pte_offset_kernel(pmd, ppd->vaddr); diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 5359a9c88099..b4037fe08eed 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -692,7 +692,7 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, return NULL; *level = PG_LEVEL_2M; - if (pmd_large(*pmd) || !pmd_present(*pmd)) + if (pmd_leaf(*pmd) || !pmd_present(*pmd)) return (pte_t *)pmd; *level = PG_LEVEL_4K; @@ -1229,7 +1229,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) * Try to unmap in 2M chunks. */ while (end - start >= PMD_SIZE) { - if (pmd_large(*pmd)) + if (pmd_leaf(*pmd)) pmd_clear(pmd); else __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 0cbc1b8e8e3d..d05dd86ceb41 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -792,7 +792,7 @@ int pud_clear_huge(pud_t *pud) */ int pmd_clear_huge(pmd_t *pmd) { - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { pmd_clear(pmd); return 1; } diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index c17aab24c1b3..0442e8f479a6 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -252,7 +252,7 @@ static pte_t *pti_user_pagetable_walk_pte(unsigned long address) return NULL; /* We can't do anything sensible if we hit a large mapping. */ - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { WARN_ON(1); return NULL; } @@ -341,7 +341,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end, continue; } - if (pmd_large(*pmd) || level == PTI_CLONE_PMD) { + if (pmd_leaf(*pmd) || level == PTI_CLONE_PMD) { target_pmd = pti_user_pagetable_walk_pmd(addr); if (WARN_ON(!target_pmd)) return; diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index 28153789f873..277eaf610e0e 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -175,7 +175,7 @@ int relocate_restore_code(void) goto out; } pmd = pmd_offset(pud, relocated_restore_code); - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); goto out; } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 12a43a4abebf..dde551bbd231 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1059,7 +1059,7 @@ static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin) pte_t *pte_tbl; int i; - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; xen_free_ro_pages(pa, PMD_SIZE); return; @@ -1871,7 +1871,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) if (!pmd_present(pmd)) return 0; pa = pmd_val(pmd) & PTE_PFN_MASK; - if (pmd_large(pmd)) + if (pmd_leaf(pmd)) return pa + (vaddr & ~PMD_MASK); pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) * diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c index 629edb6486de..3557d78ee47a 100644 --- a/drivers/misc/sgi-gru/grufault.c +++ b/drivers/misc/sgi-gru/grufault.c @@ -227,7 +227,7 @@ static int atomic_pte_lookup(struct vm_area_struct *vma, unsigned long vaddr, if (unlikely(pmd_none(*pmdp))) goto err; #ifdef CONFIG_X86_64 - if (unlikely(pmd_large(*pmdp))) + if (unlikely(pmd_leaf(*pmdp))) pte = ptep_get((pte_t *)pmdp); else #endif From 0a845e0f6348ccfa2dcc8c450ffd1c9ffe8c4add Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 5 Mar 2024 12:37:48 +0800 Subject: [PATCH 415/435] mm/treewide: replace pud_large() with pud_leaf() pud_large() is always defined as pud_leaf(). Merge their usages. Chose pud_leaf() because pud_leaf() is a global API, while pud_large() is not. Link: https://lkml.kernel.org/r/20240305043750.93762-9-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Jason Gunthorpe Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: "Aneesh Kumar K.V" Cc: Borislav Petkov Cc: Christophe Leroy Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Kirill A. Shutemov Cc: Michael Ellerman Cc: Muchun Song Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Thomas Gleixner Cc: Vincenzo Frascino Cc: Yang Shi Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/pgtable.c | 2 +- arch/s390/boot/vmem.c | 2 +- arch/s390/include/asm/pgtable.h | 4 ++-- arch/s390/mm/gmap.c | 2 +- arch/s390/mm/hugetlbpage.c | 4 ++-- arch/s390/mm/pageattr.c | 2 +- arch/s390/mm/pgtable.c | 2 +- arch/s390/mm/vmem.c | 6 +++--- arch/sparc/mm/init_64.c | 2 +- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/mm/fault.c | 4 ++-- arch/x86/mm/ident_map.c | 2 +- arch/x86/mm/init_64.c | 4 ++-- arch/x86/mm/kasan_init_64.c | 2 +- arch/x86/mm/mem_encrypt_identity.c | 2 +- arch/x86/mm/pat/set_memory.c | 6 +++--- arch/x86/mm/pgtable.c | 2 +- arch/x86/mm/pti.c | 2 +- arch/x86/power/hibernate.c | 2 +- arch/x86/xen/mmu_pv.c | 4 ++-- 20 files changed, 29 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 45f526547b27..83823db3488b 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -130,7 +130,7 @@ void set_pud_at(struct mm_struct *mm, unsigned long addr, WARN_ON(pte_hw_valid(pud_pte(*pudp))); assert_spin_locked(pud_lockptr(mm, pudp)); - WARN_ON(!(pud_large(pud))); + WARN_ON(!(pud_leaf(pud))); #endif trace_hugepage_set_pud(addr, pud_val(pud)); return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud)); diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index 348ab02b1028..09b10bb6e4d0 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -366,7 +366,7 @@ static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long e } pmd = boot_crst_alloc(_SEGMENT_ENTRY_EMPTY); pud_populate(&init_mm, pud, pmd); - } else if (pud_large(*pud)) { + } else if (pud_leaf(*pud)) { continue; } pgtable_pmd_populate(pud, addr, next, mode); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 431d03d5116b..a5f16a244a64 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -730,7 +730,7 @@ static inline int pud_bad(pud_t pud) { unsigned long type = pud_val(pud) & _REGION_ENTRY_TYPE_MASK; - if (type > _REGION_ENTRY_TYPE_R3 || pud_large(pud)) + if (type > _REGION_ENTRY_TYPE_R3 || pud_leaf(pud)) return 1; if (type < _REGION_ENTRY_TYPE_R3) return 0; @@ -1400,7 +1400,7 @@ static inline unsigned long pud_deref(pud_t pud) unsigned long origin_mask; origin_mask = _REGION_ENTRY_ORIGIN; - if (pud_large(pud)) + if (pud_leaf(pud)) origin_mask = _REGION3_ENTRY_ORIGIN_LARGE; return (unsigned long)__va(pud_val(pud) & origin_mask); } diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 767c6f077b53..094b43b121cd 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -598,7 +598,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) pud = pud_offset(p4d, vmaddr); VM_BUG_ON(pud_none(*pud)); /* large puds cannot yet be handled */ - if (pud_large(*pud)) + if (pud_leaf(*pud)) return -EFAULT; pmd = pmd_offset(pud, vmaddr); VM_BUG_ON(pmd_none(*pmd)); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 1ccb5b40fe92..c2e8242bd15d 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -224,7 +224,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, if (p4d_present(*p4dp)) { pudp = pud_offset(p4dp, addr); if (pud_present(*pudp)) { - if (pud_large(*pudp)) + if (pud_leaf(*pudp)) return (pte_t *) pudp; pmdp = pmd_offset(pudp, addr); } @@ -240,7 +240,7 @@ int pmd_huge(pmd_t pmd) int pud_huge(pud_t pud) { - return pud_large(pud); + return pud_leaf(pud); } bool __init arch_hugetlb_valid_size(unsigned long size) diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 9f55d5a3210c..01bc8fad64d6 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -274,7 +274,7 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end, if (pud_none(*pudp)) return -EINVAL; next = pud_addr_end(addr, end); - if (pud_large(*pudp)) { + if (pud_leaf(*pudp)) { need_split = !!(flags & SET_MEMORY_4K); need_split |= !!(addr & ~PUD_MASK); need_split |= !!(addr + PUD_SIZE > next); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 9ac66304d776..2c944bafb030 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -470,7 +470,7 @@ static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp) return -ENOENT; /* Large PUDs are not supported yet. */ - if (pud_large(*pud)) + if (pud_leaf(*pud)) return -EFAULT; *pmdp = pmd_offset(pud, addr); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index afe5edf2a604..85cddf904cb2 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -329,7 +329,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, if (!add) { if (pud_none(*pud)) continue; - if (pud_large(*pud)) { + if (pud_leaf(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE)) { pud_clear(pud); @@ -350,7 +350,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, if (!pmd) goto out; pud_populate(&init_mm, pud, pmd); - } else if (pud_large(*pud)) { + } else if (pud_leaf(*pud)) { continue; } ret = modify_pmd_table(pud, addr, next, add, direct, altmap); @@ -599,7 +599,7 @@ pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) if (!pmd) goto out; pud_populate(&init_mm, pud, pmd); - } else if (WARN_ON_ONCE(pud_large(*pud))) { + } else if (WARN_ON_ONCE(pud_leaf(*pud))) { goto out; } pmd = pmd_offset(pud, addr); diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 5e067b6a4464..1ca9054d9b97 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -1665,7 +1665,7 @@ bool kern_addr_valid(unsigned long addr) if (pud_none(*pud)) return false; - if (pud_large(*pud)) + if (pud_leaf(*pud)) return pfn_valid(pud_pfn(*pud)); pmd = pmd_offset(pud, addr); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c15123248c52..5cb5bc4a72c4 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3126,7 +3126,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, if (pud_none(pud) || !pud_present(pud)) goto out; - if (pud_large(pud)) { + if (pud_leaf(pud)) { level = PG_LEVEL_1G; goto out; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 09417f950343..2cc6fa5dc561 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -376,7 +376,7 @@ static void dump_pagetable(unsigned long address) goto bad; pr_cont("PUD %lx ", pud_val(*pud)); - if (!pud_present(*pud) || pud_large(*pud)) + if (!pud_present(*pud) || pud_leaf(*pud)) goto out; pmd = pmd_offset(pud, address); @@ -1046,7 +1046,7 @@ spurious_kernel_fault(unsigned long error_code, unsigned long address) if (!pud_present(*pud)) return 0; - if (pud_large(*pud)) + if (pud_leaf(*pud)) return spurious_kernel_fault_check(error_code, (pte_t *) pud); pmd = pmd_offset(pud, address); diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index f50cc210a981..a204a332c71f 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -33,7 +33,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, next = end; /* if this is already a gbpage, this portion is already mapped */ - if (pud_large(*pud)) + if (pud_leaf(*pud)) continue; /* Is using a gbpage allowed? */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 2c5490e58f41..7e177856ee4f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -617,7 +617,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, } if (!pud_none(*pud)) { - if (!pud_large(*pud)) { + if (!pud_leaf(*pud)) { pmd = pmd_offset(pud, 0); paddr_last = phys_pmd_init(pmd, paddr, paddr_end, @@ -1163,7 +1163,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, if (!pud_present(*pud)) continue; - if (pud_large(*pud) && + if (pud_leaf(*pud) && IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE)) { spin_lock(&init_mm.page_table_lock); diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index f41d26bc9161..9dddf19a5571 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -115,7 +115,7 @@ static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr, pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - if (!pud_large(*pud)) + if (!pud_leaf(*pud)) kasan_populate_pud(pud, addr, next, nid); } while (pud++, addr = next, addr != end); } diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index bca4fea80579..7dd30e16294d 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -145,7 +145,7 @@ static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) set_pud(pud, __pud(PUD_FLAGS | __pa(pmd))); } - if (pud_large(*pud)) + if (pud_leaf(*pud)) return NULL; return pud; diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index b4037fe08eed..e3a26f2c7781 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -684,7 +684,7 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, return NULL; *level = PG_LEVEL_1G; - if (pud_large(*pud) || !pud_present(*pud)) + if (pud_leaf(*pud) || !pud_present(*pud)) return (pte_t *)pud; pmd = pmd_offset(pud, address); @@ -743,7 +743,7 @@ pmd_t *lookup_pmd_address(unsigned long address) return NULL; pud = pud_offset(p4d, address); - if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud)) + if (pud_none(*pud) || pud_leaf(*pud) || !pud_present(*pud)) return NULL; return pmd_offset(pud, address); @@ -1274,7 +1274,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) */ while (end - start >= PUD_SIZE) { - if (pud_large(*pud)) + if (pud_leaf(*pud)) pud_clear(pud); else unmap_pmd_range(pud, start, start + PUD_SIZE); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index d05dd86ceb41..ff690ddc2334 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -777,7 +777,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) */ int pud_clear_huge(pud_t *pud) { - if (pud_large(*pud)) { + if (pud_leaf(*pud)) { pud_clear(pud); return 1; } diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 0442e8f479a6..2e69abf4f852 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -217,7 +217,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) pud = pud_offset(p4d, address); /* The user page tables do not use large mappings: */ - if (pud_large(*pud)) { + if (pud_leaf(*pud)) { WARN_ON(1); return NULL; } diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index 277eaf610e0e..5b81d19cd114 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -170,7 +170,7 @@ int relocate_restore_code(void) goto out; } pud = pud_offset(p4d, relocated_restore_code); - if (pud_large(*pud)) { + if (pud_leaf(*pud)) { set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); goto out; } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index dde551bbd231..54e0d311dcc9 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1082,7 +1082,7 @@ static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin) pmd_t *pmd_tbl; int i; - if (pud_large(*pud)) { + if (pud_leaf(*pud)) { pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; xen_free_ro_pages(pa, PUD_SIZE); return; @@ -1863,7 +1863,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) if (!pud_present(pud)) return 0; pa = pud_val(pud) & PTE_PFN_MASK; - if (pud_large(pud)) + if (pud_leaf(pud)) return pa + (vaddr & ~PUD_MASK); pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) * From e72c7c2b88666903f174a82cd5c4e0598601189f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 5 Mar 2024 12:37:49 +0800 Subject: [PATCH 416/435] mm/treewide: drop pXd_large() They're not used anymore, drop all of them. Link: https://lkml.kernel.org/r/20240305043750.93762-10-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Jason Gunthorpe Reviewed-by: Mike Rapoport (IBM) Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: "Aneesh Kumar K.V" Cc: Borislav Petkov Cc: Christophe Leroy Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Kirill A. Shutemov Cc: Michael Ellerman Cc: Muchun Song Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Thomas Gleixner Cc: Vincenzo Frascino Cc: Yang Shi Signed-off-by: Andrew Morton --- arch/arm/include/asm/pgtable-2level.h | 1 - arch/arm/include/asm/pgtable-3level.h | 1 - arch/loongarch/kvm/mmu.c | 2 +- arch/powerpc/include/asm/book3s/64/pgtable.h | 4 +--- arch/powerpc/include/asm/pgtable.h | 4 ---- arch/s390/include/asm/pgtable.h | 8 ++++---- arch/sparc/include/asm/pgtable_64.h | 8 ++++---- arch/x86/include/asm/pgtable.h | 19 +++++++------------ arch/x86/kvm/mmu/mmu.c | 2 +- 9 files changed, 18 insertions(+), 31 deletions(-) diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h index ce543cd9380c..b0a262566eb9 100644 --- a/arch/arm/include/asm/pgtable-2level.h +++ b/arch/arm/include/asm/pgtable-2level.h @@ -213,7 +213,6 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr) #define pmd_pfn(pmd) (__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) -#define pmd_large(pmd) (pmd_val(pmd) & 2) #define pmd_leaf(pmd) (pmd_val(pmd) & 2) #define pmd_bad(pmd) (pmd_val(pmd) & 2) #define pmd_present(pmd) (pmd_val(pmd)) diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 71c3add6417f..4b1d9eb3908a 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -118,7 +118,6 @@ PMD_TYPE_TABLE) #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_SECT) -#define pmd_large(pmd) pmd_sect(pmd) #define pmd_leaf(pmd) pmd_sect(pmd) #define pud_clear(pudp) \ diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index 50a6acd7ffe4..a556cff35740 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -723,7 +723,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, /* * Read each entry once. As above, a non-leaf entry can be promoted to * a huge page _during_ this walk. Re-reading the entry could send the - * walk into the weeks, e.g. p*d_large() returns false (sees the old + * walk into the weeks, e.g. p*d_leaf() returns false (sees the old * value) and then p*d_offset() walks into the target huge page instead * of the old page table (sees the new value). */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 3e99e409774a..df66dce8306f 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1437,17 +1437,15 @@ static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_va } /* - * Like pmd_huge() and pmd_large(), but works regardless of config options + * Like pmd_huge(), but works regardless of config options */ #define pmd_leaf pmd_leaf -#define pmd_large pmd_leaf static inline bool pmd_leaf(pmd_t pmd) { return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE)); } #define pud_leaf pud_leaf -#define pud_large pud_leaf static inline bool pud_leaf(pud_t pud) { return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE)); diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index e6edf1cdbc5b..239709a2f68e 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -101,10 +101,6 @@ void poking_init(void); extern unsigned long ioremap_bot; extern const pgprot_t protection_map[16]; -#ifndef pmd_large -#define pmd_large(pmd) 0 -#endif - /* can we use this in kvm */ unsigned long vmalloc_to_phys(void *vmalloc_addr); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index a5f16a244a64..9e08af5b9247 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -705,16 +705,16 @@ static inline int pud_none(pud_t pud) return pud_val(pud) == _REGION3_ENTRY_EMPTY; } -#define pud_leaf pud_large -static inline int pud_large(pud_t pud) +#define pud_leaf pud_leaf +static inline int pud_leaf(pud_t pud) { if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) != _REGION_ENTRY_TYPE_R3) return 0; return !!(pud_val(pud) & _REGION3_ENTRY_LARGE); } -#define pmd_leaf pmd_large -static inline int pmd_large(pmd_t pmd) +#define pmd_leaf pmd_leaf +static inline int pmd_leaf(pmd_t pmd) { return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0; } diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 652af9d63fa2..6ff0a28d5fd1 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -680,8 +680,8 @@ static inline unsigned long pte_special(pte_t pte) return pte_val(pte) & _PAGE_SPECIAL; } -#define pmd_leaf pmd_large -static inline unsigned long pmd_large(pmd_t pmd) +#define pmd_leaf pmd_leaf +static inline unsigned long pmd_leaf(pmd_t pmd) { pte_t pte = __pte(pmd_val(pmd)); @@ -867,8 +867,8 @@ static inline pmd_t *pud_pgtable(pud_t pud) /* only used by the stubbed out hugetlb gup code, should never be called */ #define p4d_page(p4d) NULL -#define pud_leaf pud_large -static inline unsigned long pud_large(pud_t pud) +#define pud_leaf pud_leaf +static inline unsigned long pud_leaf(pud_t pud) { pte_t pte = __pte(pud_val(pud)); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 9db7a38a0e9f..cfc84c55d0e6 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -251,8 +251,8 @@ static inline unsigned long pgd_pfn(pgd_t pgd) return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT; } -#define p4d_leaf p4d_large -static inline int p4d_large(p4d_t p4d) +#define p4d_leaf p4d_leaf +static inline int p4d_leaf(p4d_t p4d) { /* No 512 GiB pages yet */ return 0; @@ -260,14 +260,14 @@ static inline int p4d_large(p4d_t p4d) #define pte_page(pte) pfn_to_page(pte_pfn(pte)) -#define pmd_leaf pmd_large -static inline int pmd_large(pmd_t pte) +#define pmd_leaf pmd_leaf +static inline int pmd_leaf(pmd_t pte) { return pmd_flags(pte) & _PAGE_PSE; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -/* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_large */ +/* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_leaf */ static inline int pmd_trans_huge(pmd_t pmd) { return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; @@ -1085,8 +1085,8 @@ static inline pmd_t *pud_pgtable(pud_t pud) */ #define pud_page(pud) pfn_to_page(pud_pfn(pud)) -#define pud_leaf pud_large -static inline int pud_large(pud_t pud) +#define pud_leaf pud_leaf +static inline int pud_leaf(pud_t pud) { return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == (_PAGE_PSE | _PAGE_PRESENT); @@ -1096,11 +1096,6 @@ static inline int pud_bad(pud_t pud) { return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; } -#else -static inline int pud_large(pud_t pud) -{ - return 0; -} #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5cb5bc4a72c4..58f5e6b637b4 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3110,7 +3110,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, /* * Read each entry once. As above, a non-leaf entry can be promoted to * a huge page _during_ this walk. Re-reading the entry could send the - * walk into the weeks, e.g. p*d_large() returns false (sees the old + * walk into the weeks, e.g. p*d_leaf() returns false (sees the old * value) and then p*d_offset() walks into the target huge page instead * of the old page table (sees the new value). */ From c05995b7ec2a73bf813a8944978e175f8e4ec3ac Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 5 Mar 2024 12:37:50 +0800 Subject: [PATCH 417/435] mm/treewide: align up pXd_leaf() retval across archs Even if pXd_leaf() API is defined globally, it's not clear on the retval, and there are three types used (bool, int, unsigned log). Always return a boolean for pXd_leaf() APIs. Link: https://lkml.kernel.org/r/20240305043750.93762-11-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Mike Rapoport (IBM) Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: "Aneesh Kumar K.V" Cc: Borislav Petkov Cc: Christophe Leroy Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Kirill A. Shutemov Cc: Michael Ellerman Cc: Muchun Song Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Thomas Gleixner Cc: Vincenzo Frascino Cc: Yang Shi Signed-off-by: Andrew Morton --- arch/riscv/include/asm/pgtable-64.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/s390/include/asm/pgtable.h | 4 ++-- arch/sparc/include/asm/pgtable_64.h | 4 ++-- arch/x86/include/asm/pgtable.h | 8 ++++---- include/linux/pgtable.h | 8 ++++---- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index b42017d76924..2c7e1661db01 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -190,7 +190,7 @@ static inline int pud_bad(pud_t pud) } #define pud_leaf pud_leaf -static inline int pud_leaf(pud_t pud) +static inline bool pud_leaf(pud_t pud) { return pud_present(pud) && (pud_val(pud) & _PAGE_LEAF); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index add5cd30ab34..6839520dbcb1 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -241,7 +241,7 @@ static inline int pmd_bad(pmd_t pmd) } #define pmd_leaf pmd_leaf -static inline int pmd_leaf(pmd_t pmd) +static inline bool pmd_leaf(pmd_t pmd) { return pmd_present(pmd) && (pmd_val(pmd) & _PAGE_LEAF); } diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 9e08af5b9247..60950e7a25f5 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -706,7 +706,7 @@ static inline int pud_none(pud_t pud) } #define pud_leaf pud_leaf -static inline int pud_leaf(pud_t pud) +static inline bool pud_leaf(pud_t pud) { if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) != _REGION_ENTRY_TYPE_R3) return 0; @@ -714,7 +714,7 @@ static inline int pud_leaf(pud_t pud) } #define pmd_leaf pmd_leaf -static inline int pmd_leaf(pmd_t pmd) +static inline bool pmd_leaf(pmd_t pmd) { return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0; } diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 6ff0a28d5fd1..4d1bafaba942 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -681,7 +681,7 @@ static inline unsigned long pte_special(pte_t pte) } #define pmd_leaf pmd_leaf -static inline unsigned long pmd_leaf(pmd_t pmd) +static inline bool pmd_leaf(pmd_t pmd) { pte_t pte = __pte(pmd_val(pmd)); @@ -868,7 +868,7 @@ static inline pmd_t *pud_pgtable(pud_t pud) #define p4d_page(p4d) NULL #define pud_leaf pud_leaf -static inline unsigned long pud_leaf(pud_t pud) +static inline bool pud_leaf(pud_t pud) { pte_t pte = __pte(pud_val(pud)); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index cfc84c55d0e6..7621a5acb13e 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -252,7 +252,7 @@ static inline unsigned long pgd_pfn(pgd_t pgd) } #define p4d_leaf p4d_leaf -static inline int p4d_leaf(p4d_t p4d) +static inline bool p4d_leaf(p4d_t p4d) { /* No 512 GiB pages yet */ return 0; @@ -261,7 +261,7 @@ static inline int p4d_leaf(p4d_t p4d) #define pte_page(pte) pfn_to_page(pte_pfn(pte)) #define pmd_leaf pmd_leaf -static inline int pmd_leaf(pmd_t pte) +static inline bool pmd_leaf(pmd_t pte) { return pmd_flags(pte) & _PAGE_PSE; } @@ -1086,7 +1086,7 @@ static inline pmd_t *pud_pgtable(pud_t pud) #define pud_page(pud) pfn_to_page(pud_pfn(pud)) #define pud_leaf pud_leaf -static inline int pud_leaf(pud_t pud) +static inline bool pud_leaf(pud_t pud) { return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == (_PAGE_PSE | _PAGE_PRESENT); @@ -1413,7 +1413,7 @@ static inline bool pgdp_maps_userspace(void *__ptr) } #define pgd_leaf pgd_leaf -static inline int pgd_leaf(pgd_t pgd) { return 0; } +static inline bool pgd_leaf(pgd_t pgd) { return false; } #ifdef CONFIG_PAGE_TABLE_ISOLATION /* diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index a36cf4e124b0..85fc7554cd52 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1777,16 +1777,16 @@ typedef unsigned int pgtbl_mod_mask; * Only meaningful when called on a valid entry. */ #ifndef pgd_leaf -#define pgd_leaf(x) 0 +#define pgd_leaf(x) false #endif #ifndef p4d_leaf -#define p4d_leaf(x) 0 +#define p4d_leaf(x) false #endif #ifndef pud_leaf -#define pud_leaf(x) 0 +#define pud_leaf(x) false #endif #ifndef pmd_leaf -#define pmd_leaf(x) 0 +#define pmd_leaf(x) false #endif #ifndef pgd_leaf_size From 82b1c07a0af603e3c47b906c8e991dc96f01688e Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 6 Mar 2024 14:03:56 +0000 Subject: [PATCH 418/435] mm: swap: fix race between free_swap_and_cache() and swapoff() There was previously a theoretical window where swapoff() could run and teardown a swap_info_struct while a call to free_swap_and_cache() was running in another thread. This could cause, amongst other bad possibilities, swap_page_trans_huge_swapped() (called by free_swap_and_cache()) to access the freed memory for swap_map. This is a theoretical problem and I haven't been able to provoke it from a test case. But there has been agreement based on code review that this is possible (see link below). Fix it by using get_swap_device()/put_swap_device(), which will stall swapoff(). There was an extra check in _swap_info_get() to confirm that the swap entry was not free. This isn't present in get_swap_device() because it doesn't make sense in general due to the race between getting the reference and swapoff. So I've added an equivalent check directly in free_swap_and_cache(). Details of how to provoke one possible issue (thanks to David Hildenbrand for deriving this): --8<----- __swap_entry_free() might be the last user and result in "count == SWAP_HAS_CACHE". swapoff->try_to_unuse() will stop as soon as soon as si->inuse_pages==0. So the question is: could someone reclaim the folio and turn si->inuse_pages==0, before we completed swap_page_trans_huge_swapped(). Imagine the following: 2 MiB folio in the swapcache. Only 2 subpages are still references by swap entries. Process 1 still references subpage 0 via swap entry. Process 2 still references subpage 1 via swap entry. Process 1 quits. Calls free_swap_and_cache(). -> count == SWAP_HAS_CACHE [then, preempted in the hypervisor etc.] Process 2 quits. Calls free_swap_and_cache(). -> count == SWAP_HAS_CACHE Process 2 goes ahead, passes swap_page_trans_huge_swapped(), and calls __try_to_reclaim_swap(). __try_to_reclaim_swap()->folio_free_swap()->delete_from_swap_cache()-> put_swap_folio()->free_swap_slot()->swapcache_free_entries()-> swap_entry_free()->swap_range_free()-> ... WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); What stops swapoff to succeed after process 2 reclaimed the swap cache but before process1 finished its call to swap_page_trans_huge_swapped()? --8<----- Link: https://lkml.kernel.org/r/20240306140356.3974886-1-ryan.roberts@arm.com Fixes: 7c00bafee87c ("mm/swap: free swap slots in batch") Closes: https://lore.kernel.org/linux-mm/65a66eb9-41f8-4790-8db2-0c70ea15979f@redhat.com/ Signed-off-by: Ryan Roberts Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Signed-off-by: Andrew Morton --- mm/swapfile.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 2b3a2d85e350..1155a6304119 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1232,6 +1232,11 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, * with get_swap_device() and put_swap_device(), unless the swap * functions call get/put_swap_device() by themselves. * + * Note that when only holding the PTL, swapoff might succeed immediately + * after freeing a swap entry. Therefore, immediately after + * __swap_entry_free(), the swap info might become stale and should not + * be touched without a prior get_swap_device(). + * * Check whether swap entry is valid in the swap device. If so, * return pointer to swap_info_struct, and keep the swap entry valid * via preventing the swap device from being swapoff, until @@ -1609,13 +1614,19 @@ int free_swap_and_cache(swp_entry_t entry) if (non_swap_entry(entry)) return 1; - p = _swap_info_get(entry); + p = get_swap_device(entry); if (p) { + if (WARN_ON(data_race(!p->swap_map[swp_offset(entry)]))) { + put_swap_device(p); + return 0; + } + count = __swap_entry_free(p, entry); if (count == SWAP_HAS_CACHE && !swap_page_trans_huge_swapped(p, entry)) __try_to_reclaim_swap(p, swp_offset(entry), TTRS_UNMAPPED | TTRS_FULL); + put_swap_device(p); } return p != NULL; } From 84d6ac31c34f2a6e056f967e6105160b6e2891c4 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Wed, 6 Mar 2024 13:32:16 +0100 Subject: [PATCH 419/435] mm,page_owner: check for null stack_record before bumping its refcount Patch series "page_owner: Fixup and cleanup". This patchset consists of a fixup by an error that was reported by intel robot, where it seems to be that by the time page_owner gets initialized, stackdepot has already depleted its allocation space and returns 0-handles, turning that into null stack_records when trying to retrieve the stack_record. I was not able to reproduce that from the config because it booted fine for me, but when setting e.g: dummy_handle to 0 artificially, I could see the same error that was reported. The second patch is a cleanup that can also lead to a compilation warning. This patch (of 2): Although the retrieval of the stack_records for {dummy,failure}_handle happen when page_owner gets initialized, there seems to be some situations where stackdepot space has been already depleted by then, so we get 0-handles which make stack_records being NULL for those cases. Be careful to 1) only bump stack_records refcount and 2) only access stack_record fields if we actually have a non-null stack_record between hands. Link: https://lkml.kernel.org/r/20240306123217.29774-1-osalvador@suse.de Link: https://lkml.kernel.org/r/20240306123217.29774-2-osalvador@suse.de Fixes: 4bedfb314bdd ("mm,page_owner: implement the tracking of the stacks count") Signed-off-by: Oscar Salvador Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202403051032.e2f865a-lkp@intel.com Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Marco Elver Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_owner.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 033e349f6479..7163a1c44ccf 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -107,8 +107,10 @@ static __init void init_page_owner(void) /* Initialize dummy and failure stacks and link them to stack_list */ dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle); failure_stack.stack_record = __stack_depot_get_stack_record(failure_handle); - refcount_set(&dummy_stack.stack_record->count, 1); - refcount_set(&failure_stack.stack_record->count, 1); + if (dummy_stack.stack_record) + refcount_set(&dummy_stack.stack_record->count, 1); + if (failure_stack.stack_record) + refcount_set(&failure_stack.stack_record->count, 1); dummy_stack.next = &failure_stack; stack_list = &dummy_stack; } @@ -856,6 +858,9 @@ static int stack_print(struct seq_file *m, void *v) unsigned long nr_entries; struct stack_record *stack_record = stack->stack_record; + if (!stack->stack_record) + return 0; + nr_entries = stack_record->size; entries = stack_record->entries; stack_count = refcount_read(&stack_record->count) - 1; From 4839e79c7eae747810c81848729e050eb5440547 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Wed, 6 Mar 2024 13:32:17 +0100 Subject: [PATCH 420/435] mm,page_owner: drop unnecessary check stackdepot only saves stack_records which size is greather than 0, so we cannot possibly have empty stack_records. Drop the check. Link: https://lkml.kernel.org/r/20240306123217.29774-3-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: kernel test robot Cc: Marco Elver Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_owner.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 7163a1c44ccf..e7139952ffd9 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -865,8 +865,7 @@ static int stack_print(struct seq_file *m, void *v) entries = stack_record->entries; stack_count = refcount_read(&stack_record->count) - 1; - if (!nr_entries || nr_entries < 0 || stack_count < 1 || - stack_count < page_owner_stack_threshold) + if (stack_count < 1 || stack_count < page_owner_stack_threshold) return 0; for (i = 0; i < nr_entries; i++) From 58f327f2ce80f9c7b4a70e9cf017ae8810d44a20 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Wed, 6 Mar 2024 16:38:09 +0800 Subject: [PATCH 421/435] filemap: avoid unnecessary major faults in filemap_fault() A major fault occurred when using mlockall(MCL_CURRENT | MCL_FUTURE) in application, which leading to an unexpected issue[1]. This is caused by temporarily cleared PTE during a read+clear/modify/write update of the PTE, eg, do_numa_page()/change_pte_range(). For the data segment of the user-mode program, the global variable area is a private mapping. After the pagecache is loaded, the private anonymous page is generated after the COW is triggered. Mlockall can lock COW pages (anonymous pages), but the original file pages cannot be locked and may be reclaimed. If the global variable (private anon page) is accessed when vmf->pte is zeroed in numa fault, a file page fault will be triggered. At this time, the original private file page may have been reclaimed. If the page cache is not available at this time, a major fault will be triggered and the file will be read, causing additional overhead. This issue affects our traffic analysis service. The inbound traffic is heavy. If a major fault occurs, the I/O schedule is triggered and the original I/O is suspended. Generally, the I/O schedule is 0.7 ms. If other applications are operating disks, the system needs to wait for more than 10 ms. However, the inbound traffic is heavy and the NIC buffer is small. As a result, packet loss occurs. But the traffic analysis service can't tolerate packet loss. Fix this by holding PTL and rechecking the PTE in filemap_fault() before triggering a major fault. We do this check only if vma is VM_LOCKED to reduce the performance impact in common scenarios. In our product environment, there were 7 major faults every 12 hours. After the patch is applied, no major fault have been triggered. Testing file page read and write page fault performance in ext4 and ramdisk using will-it-scale[2] on a x86 physical machine. The data is the average change compared with the mainline after the patch is applied. The test results are within the range of fluctuation. We do this check only if vma is VM_LOCKED, therefore, no performance regressions is caused for most common cases. The test results are as follows: processes processes_idle threads threads_idle ext4 private file write: 0.22% 0.26% 1.21% -0.15% ext4 private file read: 0.03% 1.00% 1.39% 0.34% ext4 shared file write: -0.50% -0.02% -0.14% -0.02% ramdisk private file write: 0.07% 0.02% 0.53% 0.04% ramdisk private file read: 0.01% 1.60% -0.32% -0.02% [1] https://lore.kernel.org/linux-mm/9e62fd9a-bee0-52bf-50a7-498fa17434ee@huawei.com/ [2] https://github.com/antonblanchard/will-it-scale/ Link: https://lkml.kernel.org/r/20240306083809.1236634-1-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Signed-off-by: Kefeng Wang Suggested-by: "Huang, Ying" Suggested-by: David Hildenbrand Reviewed-by: "Huang, Ying" Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/filemap.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/mm/filemap.c b/mm/filemap.c index b4858d89f1b1..31ab455c4537 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3181,6 +3181,48 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, return fpin; } +static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret = 0; + pte_t *ptep; + + /* + * We might have COW'ed a pagecache folio and might now have an mlocked + * anon folio mapped. The original pagecache folio is not mlocked and + * might have been evicted. During a read+clear/modify/write update of + * the PTE, such as done in do_numa_page()/change_pte_range(), we + * temporarily clear the PTE under PT lock and might detect it here as + * "none" when not holding the PT lock. + * + * Not rechecking the PTE under PT lock could result in an unexpected + * major fault in an mlock'ed region. Recheck only for this special + * scenario while holding the PT lock, to not degrade non-mlocked + * scenarios. Recheck the PTE without PT lock firstly, thereby reducing + * the number of times we hold PT lock. + */ + if (!(vma->vm_flags & VM_LOCKED)) + return 0; + + if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) + return 0; + + ptep = pte_offset_map(vmf->pmd, vmf->address); + if (unlikely(!ptep)) + return VM_FAULT_NOPAGE; + + if (unlikely(!pte_none(ptep_get_lockless(ptep)))) { + ret = VM_FAULT_NOPAGE; + } else { + spin_lock(vmf->ptl); + if (unlikely(!pte_none(ptep_get(ptep)))) + ret = VM_FAULT_NOPAGE; + spin_unlock(vmf->ptl); + } + pte_unmap(ptep); + return ret; +} + /** * filemap_fault - read in file data for page fault handling * @vmf: struct vm_fault containing details of the fault @@ -3236,6 +3278,10 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) mapping_locked = true; } } else { + ret = filemap_fault_recheck_pte_none(vmf); + if (unlikely(ret)) + return ret; + /* No page in the page cache at all */ count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); From 47932e7048df9156e96133ee90fb3e9df68dbd15 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 11 Mar 2024 19:18:34 +0000 Subject: [PATCH 422/435] mm: remove folio from deferred split list before uncharging it When freeing a large folio, we must remove it from the deferred split list before we uncharge it as each memcg has its own deferred split list (with associated lock) and removing a folio from the deferred split list while holding the wrong lock will corrupt that list and cause various related problems. Link: https://lore.kernel.org/linux-mm/367a14f7-340e-4b29-90ae-bc3fcefdd5f4@arm.com/ Link: https://lkml.kernel.org/r/20240311191835.312162-1-willy@infradead.org Fixes: f77171d241e3 (mm: allow non-hugetlb large folios to be batch processed) Fixes: 29f3843026cf (mm: free folios directly in move_folios_to_lru()) Fixes: bc2ff4cbc329 (mm: free folios in a batch in shrink_folio_list()) Signed-off-by: Matthew Wilcox (Oracle) Debugged-by: Ryan Roberts Tested-by: Ryan Roberts Signed-off-by: Andrew Morton --- mm/swap.c | 3 +++ mm/vmscan.c | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/mm/swap.c b/mm/swap.c index 6b697d33fa5b..e43a5911b170 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1012,6 +1012,9 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) free_huge_folio(folio); continue; } + if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); __page_cache_release(folio, &lruvec, &flags); diff --git a/mm/vmscan.c b/mm/vmscan.c index e3349b75f15b..61606fa83504 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1413,6 +1413,9 @@ free_it: */ nr_reclaimed += nr_pages; + if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); if (folio_batch_add(&free_folios, folio) == 0) { mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); @@ -1819,6 +1822,9 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, if (unlikely(folio_put_testzero(folio))) { __folio_clear_lru_flags(folio); + if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); if (folio_batch_add(&free_folios, folio) == 0) { spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_folios(&free_folios); From b555895c313511830762dbb2f469587a822c1759 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 6 Mar 2024 21:27:30 +0000 Subject: [PATCH 423/435] mm: fix list corruption in put_pages_list My recent change to put_pages_list() dereferences folio->lru.next after returning the folio to the page allocator. Usually this is now on the pcp list with other free folios, so we try to free an already-free folio. This only happens with lists that have more than 15 entries, so it wasn't immediately discovered. Revert to using list_for_each_safe() so we dereference lru.next before disposing of the folio. Link: https://lkml.kernel.org/r/20240306212749.1823380-1-willy@infradead.org Fixes: 24835f899c01 ("mm: use free_unref_folios() in put_pages_list()") Signed-off-by: Matthew Wilcox (Oracle) Reported-by: "Borah, Chaitanya Kumar" Closes: https://lore.kernel.org/intel-gfx/SJ1PR11MB61292145F3B79DA58ADDDA63B9232@SJ1PR11MB6129.namprd11.prod.outlook.com/ Signed-off-by: Andrew Morton --- mm/swap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index e43a5911b170..500a09a48dfd 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -152,10 +152,10 @@ EXPORT_SYMBOL(__folio_put); void put_pages_list(struct list_head *pages) { struct folio_batch fbatch; - struct folio *folio; + struct folio *folio, *next; folio_batch_init(&fbatch); - list_for_each_entry(folio, pages, lru) { + list_for_each_entry_safe(folio, next, pages, lru) { if (!folio_put_testzero(folio)) continue; if (folio_test_large(folio)) { From b14d1671ddd3463b931fcdc442e0a74f8ae71406 Mon Sep 17 00:00:00 2001 From: James Houghton Date: Thu, 7 Mar 2024 01:02:50 +0000 Subject: [PATCH 424/435] mm: add an explicit smp_wmb() to UFFDIO_CONTINUE Users of UFFDIO_CONTINUE may reasonably assume that a write memory barrier is included as part of UFFDIO_CONTINUE. That is, a user may believe that all writes it has done to a page that it is now UFFDIO_CONTINUE'ing are guaranteed to be visible to anyone subsequently reading the page through the newly mapped virtual memory region. Today, such a user happens to be correct. mmget_not_zero(), for example, is called as part of UFFDIO_CONTINUE (and comes before any PTE updates), and it implicitly gives us a write barrier. To be resilient against future changes, include an explicit smp_wmb(). While we're at it, optimize the smp_wmb() that is already incidentally present for the HugeTLB case. Merely making a syscall does not generally imply the memory ordering constraints that we need (including on x86). Link: https://lkml.kernel.org/r/20240307010250.3847179-1-jthoughton@google.com Signed-off-by: James Houghton Reviewed-by: Peter Xu Cc: Axel Rasmussen Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 17 +++++++++++++---- mm/userfaultfd.c | 9 +++++++++ 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bb17e5c22759..23ef240ba48a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6780,11 +6780,20 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, } /* - * The memory barrier inside __folio_mark_uptodate makes sure that - * preceding stores to the page contents become visible before - * the set_pte_at() write. + * If we just allocated a new page, we need a memory barrier to ensure + * that preceding stores to the page become visible before the + * set_pte_at() write. The memory barrier inside __folio_mark_uptodate + * is what we need. + * + * In the case where we have not allocated a new page (is_continue), + * the page must already be uptodate. UFFDIO_CONTINUE already includes + * an earlier smp_wmb() to ensure that prior stores will be visible + * before the set_pte_at() write. */ - __folio_mark_uptodate(folio); + if (!is_continue) + __folio_mark_uptodate(folio); + else + WARN_ON_ONCE(!folio_test_uptodate(folio)); /* Add shared, newly allocated pages to the page cache. */ if (vm_shared && !is_continue) { diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 4744d6a96f96..a0331ba9ae2a 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -845,6 +845,15 @@ ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len, uffd_flags_t flags) { + + /* + * A caller might reasonably assume that UFFDIO_CONTINUE contains an + * smp_wmb() to ensure that any writes to the about-to-be-mapped page by + * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to + * subsequent loads from the page through the newly mapped address range. + */ + smp_wmb(); + return mfill_atomic(ctx, start, 0, len, uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); } From d221dd5fea6415cc6c7acb7d68c8e5dfd61c3902 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 4 Mar 2024 15:27:37 +0900 Subject: [PATCH 425/435] mm, vmscan: retry kswapd's priority loop with cache_trim_mode off on failure With cache_trim_mode on, reclaim logic doesn't bother reclaiming anon pages. However, it should be more careful to use the mode because it's going to prevent anon pages from being reclaimed even if there are a huge number of anon pages that are cold and should be reclaimed. Even worse, that leads kswapd_failures to reach MAX_RECLAIM_RETRIES and stopping kswapd from functioning until direct reclaim eventually works to resume kswapd. So kswapd needs to retry its scan priority loop with cache_trim_mode off again if the mode doesn't work for reclaim. The problematic behavior can be reproduced by: CONFIG_NUMA_BALANCING enabled sysctl_numa_balancing_mode set to NUMA_BALANCING_MEMORY_TIERING numa node0 (8GB local memory, 16 CPUs) numa node1 (8GB slow tier memory, no CPUs) Sequence: 1) echo 3 > /proc/sys/vm/drop_caches 2) To emulate the system with full of cold memory in local DRAM, run the following dummy program and never touch the region: mmap(0, 8 * 1024 * 1024 * 1024, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_POPULATE, -1, 0); 3) Run any memory intensive work e.g. XSBench. 4) Check if numa balancing is working e.i. promotion/demotion. 5) Iterate 1) ~ 4) until numa balancing stops. With this, you could see that promotion/demotion are not working because kswapd has stopped due to ->kswapd_failures >= MAX_RECLAIM_RETRIES. Interesting vmstat delta's differences between before and after are like: +-----------------------+-------------------------------+ | interesting vmstat | before | after | +-----------------------+-------------------------------+ | nr_inactive_anon | 321935 | 1664772 | | nr_active_anon | 1780700 | 437834 | | nr_inactive_file | 30425 | 40882 | | nr_active_file | 14961 | 3012 | | pgpromote_success | 356 | 1293122 | | pgpromote_candidate | 21953245 | 1824148 | | pgactivate | 1844523 | 3311907 | | pgdeactivate | 50634 | 1554069 | | pgfault | 31100294 | 6518806 | | pgdemote_kswapd | 30856 | 2230821 | | pgscan_kswapd | 1861981 | 7667629 | | pgscan_anon | 1822930 | 7610583 | | pgscan_file | 39051 | 57046 | | pgsteal_anon | 386 | 2192033 | | pgsteal_file | 30470 | 38788 | | pageoutrun | 30 | 412 | | numa_hint_faults | 27418279 | 2875955 | | numa_pages_migrated | 356 | 1293122 | +-----------------------+-------------------------------+ Link: https://lkml.kernel.org/r/20240304082118.20499-1-byungchul@sk.com Signed-off-by: Byungchul Park Cc: Johannes Weiner Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 61606fa83504..741f16929e72 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -108,6 +108,12 @@ struct scan_control { /* Can folios be swapped as part of reclaim? */ unsigned int may_swap:1; + /* Not allow cache_trim_mode to be turned on as part of reclaim? */ + unsigned int no_cache_trim_mode:1; + + /* Has cache_trim_mode failed at least once? */ + unsigned int cache_trim_mode_failed:1; + /* Proactive reclaim invoked by userspace through memory.reclaim */ unsigned int proactive:1; @@ -2271,7 +2277,8 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) * anonymous pages. */ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); - if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) + if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE) && + !sc->no_cache_trim_mode) sc->cache_trim_mode = 1; else sc->cache_trim_mode = 0; @@ -5986,6 +5993,8 @@ again: */ if (reclaimable) pgdat->kswapd_failures = 0; + else if (sc->cache_trim_mode) + sc->cache_trim_mode_failed = 1; } /* @@ -6918,6 +6927,16 @@ restart: sc.priority--; } while (sc.priority >= 1); + /* + * Restart only if it went through the priority loop all the way, + * but cache_trim_mode didn't work. + */ + if (!sc.nr_reclaimed && sc.priority < 1 && + !sc.no_cache_trim_mode && sc.cache_trim_mode_failed) { + sc.no_cache_trim_mode = 1; + goto restart; + } + if (!sc.nr_reclaimed) pgdat->kswapd_failures++; From 1412ecb3d256e5d23c6bd150e110cef6e3736844 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 7 Mar 2024 13:18:53 -0500 Subject: [PATCH 426/435] mm/huge_memory: check new folio order when split a folio A folio can only be split into lower orders. Since there are no new_order checks in debugfs, any new_order can be passed via debugfs into split_huge_page_to_list_to_order(). Check new_order to make sure it is smaller than input folio order. Link: https://lkml.kernel.org/r/20240307181854.138928-1-zi.yan@sent.com Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages") Signed-off-by: Zi Yan Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-mm/7dda9283-b437-4cf8-ab0d-83c330deb9c0@moroto.mountain/ Cc: David Hildenbrand Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/huge_memory.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a81a09236c16..57fca7bffd20 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3052,6 +3052,9 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (new_order >= folio_order(folio)) + return -EINVAL; + /* Cannot split anonymous THP to order-1 */ if (new_order == 1 && folio_test_anon(folio)) { VM_WARN_ONCE(1, "Cannot split to order-1 folio"); From 2394aef616cf38fbf2e797c6845ccd35d76ce256 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Thu, 7 Mar 2024 13:18:54 -0500 Subject: [PATCH 427/435] mm/huge_memory: skip invalid debugfs new_order input for folio split User can put arbitrary new_order via debugfs for folio split test. Although new_order check is added to split_huge_page_to_list_order() in the prior commit, these two additional checks can avoid unnecessary folio locking and split_folio_to_order() calls. Link: https://lkml.kernel.org/r/20240307181854.138928-2-zi.yan@sent.com Signed-off-by: Zi Yan Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-mm/7dda9283-b437-4cf8-ab0d-83c330deb9c0@moroto.mountain/ Cc: David Hildenbrand Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/huge_memory.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 57fca7bffd20..9859aa4f7553 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3486,6 +3486,9 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, if (!is_transparent_hugepage(folio)) goto next; + if (new_order >= folio_order(folio)) + goto next; + total++; /* * For folios with private, split_huge_page_to_list_to_order() @@ -3553,6 +3556,9 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, total++; nr_pages = folio_nr_pages(folio); + if (new_order >= folio_order(folio)) + goto next; + if (!folio_trylock(folio)) goto next; From 2fd570c1d802e7d1b7df2fbc3784875686427f78 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 6 Mar 2024 15:37:12 -0700 Subject: [PATCH 428/435] selftests/mm: dont fail testsuite due to a lack of hugepages Patch series "selftests/mm: Improve Hugepage Test Handling in MM Selftests", v2. This series addresses issues related to hugepage requirements in the MM selftests, ensuring tests are skipped rather than failing when the necessary hugepage count is not met. This adjustment allows for a more graceful handling for systems with insufficient hugepages, preventing unnecessary test failures and improving the overall robustness of the test suite. This patch (of 3): On systems that have large core counts and large page sizes, but limited memory, the userfaultfd test hugepage requirement is too large. Exiting early due to missing one test's requirements is a rather aggressive strategy, and prevents a lot of other tests from running. Remove the early exit to prevent this. Link: https://lkml.kernel.org/r/20240306223714.320681-1-npache@redhat.com Link: https://lkml.kernel.org/r/20240306223714.320681-2-npache@redhat.com Fixes: ee00479d6702 ("selftests: vm: Try harder to allocate huge pages") Signed-off-by: Nico Pache Cc: Ben Hutchings Cc: Muhammad Usama Anjum Cc: Shuah Khan Cc: Muchun Song Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 02dc1c38bd76..a4f2e4d1da10 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -175,7 +175,6 @@ if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then if [ "$freepgs" -lt "$needpgs" ]; then printf "Not enough huge pages available (%d < %d)\n" \ "$freepgs" "$needpgs" - exit 1 fi else echo "no hugetlbfs support in kernel?" From 5a6aa60d18236c2f8ebc438db3e80e61714a8904 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 6 Mar 2024 15:37:13 -0700 Subject: [PATCH 429/435] selftests/mm: skip uffd hugetlb tests with insufficient hugepages Now that run_vmtests.sh does not guarantee that the correct hugepage count is available, add a check inside the userfaultfd hugetlb test to verify the nr_hugepages count before continuing. Link: https://lkml.kernel.org/r/20240306223714.320681-3-npache@redhat.com Signed-off-by: Nico Pache Cc: Ben Hutchings Cc: Muchun Song Cc: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-stress.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index 7e83829bbb33..f78bab0f3d45 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -441,6 +441,12 @@ int main(int argc, char **argv) parse_test_type_arg(argv[1]); bytes = atol(argv[2]) * 1024 * 1024; + if (test_type == TEST_HUGETLB && + get_free_hugepages() < bytes / page_size) { + printf("skip: Skipping userfaultfd... not enough hugepages\n"); + return KSFT_SKIP; + } + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); nr_pages_per_cpu = bytes / page_size / nr_cpus; From 84d147df13346461384ebc0ce2c1783bb9c17d1a Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 6 Mar 2024 15:37:14 -0700 Subject: [PATCH 430/435] selftests/mm: skip the hugetlb-madvise tests on unmet hugepage requirements Now that run_vmtests.sh does not guarantee that the correct hugepage count is available, skip the hugetlb-madvise test if the requirements are not met rather than failing. Link: https://lkml.kernel.org/r/20240306223714.320681-4-npache@redhat.com Signed-off-by: Nico Pache Cc: Ben Hutchings Cc: Muchun Song Cc: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hugetlb-madvise.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c index f32d99565c5e..e74107185324 100644 --- a/tools/testing/selftests/mm/hugetlb-madvise.c +++ b/tools/testing/selftests/mm/hugetlb-madvise.c @@ -19,6 +19,7 @@ #include #include #include "vm_util.h" +#include "../kselftest.h" #define MIN_FREE_PAGES 20 #define NR_HUGE_PAGES 10 /* common number of pages to map/allocate */ @@ -78,7 +79,7 @@ int main(int argc, char **argv) free_hugepages = get_free_hugepages(); if (free_hugepages < MIN_FREE_PAGES) { printf("Not enough free huge pages to test, exiting!\n"); - exit(1); + exit(KSFT_SKIP); } fd = memfd_create(argv[0], MFD_HUGETLB); From c087a5c324e5577f2d5abee5627c9264c2001ed9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 11 Mar 2024 12:10:45 -0400 Subject: [PATCH 431/435] mm: recover pud_leaf() definitions in nopmd case This reverts one change in commit 924bd6a8c967 ("mm/x86: drop two unnecessary pud_leaf() definitions"). One issue with that is it broke nopmd builds for at least both arm64 and riscv (CONFIG_PGTABLE_LEVELS=2). The other issue is it was overlooked that it's a common change rather than x86 specific (relevant to the commit message of the commit). Normally there's no need for empty definition of pXd_leaf() because of the fallback functions, however this logic may not apply to pgtable-nopmd.h, because that's a header that can even be used by arch *pgtable.h headers, which can use the *_leaf() definitions _before_ the fallback functions are defined. Leave it there to pass PGTABLE_LEVELS=2 builds. Link: https://lkml.kernel.org/r/Ze8vFNV9YSdgC2S7@x1n Fixes: 924bd6a8c967 ("mm/x86: drop two unnecessary pud_leaf() definitions") Signed-off-by: Peter Xu Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202403090900.OwPqmRuI-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202403101607.a42gaLOS-lkp@intel.com/ Cc: Jason Gunthorpe Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/asm-generic/pgtable-nopmd.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/asm-generic/pgtable-nopmd.h b/include/asm-generic/pgtable-nopmd.h index fa27e16bbe1b..8ffd64e7a24c 100644 --- a/include/asm-generic/pgtable-nopmd.h +++ b/include/asm-generic/pgtable-nopmd.h @@ -31,6 +31,7 @@ static inline int pud_none(pud_t pud) { return 0; } static inline int pud_bad(pud_t pud) { return 0; } static inline int pud_present(pud_t pud) { return 1; } static inline int pud_user(pud_t pud) { return 0; } +static inline int pud_leaf(pud_t pud) { return 0; } static inline void pud_clear(pud_t *pud) { } #define pmd_ERROR(pmd) (pud_ERROR((pmd).pud)) From cd197c3a2040100fd8668b33e72b07d4790b39d7 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Fri, 8 Mar 2024 22:27:21 +1300 Subject: [PATCH 432/435] mm: prohibit the last subpage from reusing the entire large folio In a Copy-on-Write (CoW) scenario, the last subpage will reuse the entire large folio, resulting in the waste of (nr_pages - 1) pages. This wasted memory remains allocated until it is either unmapped or memory reclamation occurs. The following small program can serve as evidence of this behavior main() { #define SIZE 1024 * 1024 * 1024UL void *p = malloc(SIZE); memset(p, 0x11, SIZE); if (fork() == 0) _exit(0); memset(p, 0x12, SIZE); printf("done\n"); while(1); } For example, using a 1024KiB mTHP by: echo always > /sys/kernel/mm/transparent_hugepage/hugepages-1024kB/enabled (1) w/o the patch, it takes 2GiB, Before running the test program, / # free -m total used free shared buff/cache available Mem: 5754 84 5692 0 17 5669 Swap: 0 0 0 / # /a.out & / # done After running the test program, / # free -m total used free shared buff/cache available Mem: 5754 2149 3627 0 19 3605 Swap: 0 0 0 (2) w/ the patch, it takes 1GiB only, Before running the test program, / # free -m total used free shared buff/cache available Mem: 5754 89 5687 0 17 5664 Swap: 0 0 0 / # /a.out & / # done After running the test program, / # free -m total used free shared buff/cache available Mem: 5754 1122 4655 0 17 4632 Swap: 0 0 0 This patch migrates the last subpage to a small folio and immediately returns the large folio to the system. It benefits both memory availability and anti-fragmentation. Link: https://lkml.kernel.org/r/20240308092721.144735-1-21cnbao@gmail.com Signed-off-by: Barry Song Acked-by: David Hildenbrand Cc: Ryan Roberts Cc: Lance Yang Signed-off-by: Andrew Morton --- mm/memory.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/memory.c b/mm/memory.c index e17669d4f72f..f2bc6dd15eb8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3498,6 +3498,16 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) static bool wp_can_reuse_anon_folio(struct folio *folio, struct vm_area_struct *vma) { + /* + * We could currently only reuse a subpage of a large folio if no + * other subpages of the large folios are still mapped. However, + * let's just consistently not reuse subpages even if we could + * reuse in that scenario, and give back a large folio a bit + * sooner. + */ + if (folio_test_large(folio)) + return false; + /* * We have to verify under folio lock: these early checks are * just an optimization to avoid locking the folio and freeing From 82634d7e24271698e50a3ec811e5f50de790a65f Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Tue, 12 Mar 2024 16:04:23 +0800 Subject: [PATCH 433/435] memtest: use {READ,WRITE}_ONCE in memory scanning memtest failed to find bad memory when compiled with clang. So use {WRITE,READ}_ONCE to access memory to avoid compiler over optimization. Link: https://lkml.kernel.org/r/20240312080422.691222-1-qiang4.zhang@intel.com Signed-off-by: Qiang Zhang Cc: Bill Wendling Cc: Justin Stitt Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Signed-off-by: Andrew Morton --- mm/memtest.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memtest.c b/mm/memtest.c index 32f3e9dda837..c2c609c39119 100644 --- a/mm/memtest.c +++ b/mm/memtest.c @@ -51,10 +51,10 @@ static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size last_bad = 0; for (p = start; p < end; p++) - *p = pattern; + WRITE_ONCE(*p, pattern); for (p = start; p < end; p++, start_phys_aligned += incr) { - if (*p == pattern) + if (READ_ONCE(*p) == pattern) continue; if (start_phys_aligned == last_bad + incr) { last_bad += incr; From 6c303f1af356f5f6847146ab0199dc3fe61d8f46 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 22 Feb 2024 21:11:34 +1300 Subject: [PATCH 434/435] crypto: introduce: acomp_is_async to expose if comp drivers might sleep acomp's users might want to know if acomp is really async to optimize themselves. One typical user which can benefit from exposed async stat is zswap. In zswap, zsmalloc is the most commonly used allocator for (and perhaps the only one). For zsmalloc, we cannot sleep while we map the compressed memory, so we copy it to a temporary buffer. By knowing the alg won't sleep can help zswap to avoid the need for a buffer. This shows noticeable improvement in load/store latency of zswap. Link: https://lkml.kernel.org/r/20240222081135.173040-2-21cnbao@gmail.com Signed-off-by: Barry Song Acked-by: Herbert Xu Acked-by: Chris Li Cc: Chengming Zhou Cc: Dan Streetman Cc: David S. Miller Cc: Johannes Weiner Cc: Nhat Pham Cc: Seth Jennings Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/crypto/acompress.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h index 574cffc90730..80e243611fe2 100644 --- a/include/crypto/acompress.h +++ b/include/crypto/acompress.h @@ -160,6 +160,12 @@ static inline void acomp_request_set_tfm(struct acomp_req *req, req->base.tfm = crypto_acomp_tfm(tfm); } +static inline bool acomp_is_async(struct crypto_acomp *tfm) +{ + return crypto_comp_alg_common(tfm)->base.cra_flags & + CRYPTO_ALG_ASYNC; +} + static inline struct crypto_acomp *crypto_acomp_reqtfm(struct acomp_req *req) { return __crypto_acomp_tfm(req->base.tfm); From 270700dd06ca41a4779c19eb46608f076bb7d40e Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 22 Feb 2024 21:11:35 +1300 Subject: [PATCH 435/435] mm/zswap: remove the memcpy if acomp is not sleepable Most compressors are actually CPU-based and won't sleep during compression and decompression. We should remove the redundant memcpy for them. This patch checks if the algorithm is sleepable by testing the CRYPTO_ALG_ASYNC algorithm flag. Generally speaking, async and sleepable are semantically similar but not equal. But for compress drivers, they are basically equal at least due to the below facts. Firstly, scompress drivers - crypto/deflate.c, lz4.c, zstd.c, lzo.c etc have no sleep. Secondly, zRAM has been using these scompress drivers for years in atomic contexts, and never worried those drivers going to sleep. One exception is that an async driver can sometimes still return synchronously per Herbert's clarification. In this case, we are still having a redundant memcpy. But we can't know if one particular acomp request will sleep or not unless crypto can expose more details for each specific request from offload drivers. Link: https://lkml.kernel.org/r/20240222081135.173040-3-21cnbao@gmail.com Signed-off-by: Barry Song Tested-by: Chengming Zhou Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Acked-by: Chris Li Acked-by: Johannes Weiner Cc: Dan Streetman Cc: David S. Miller Cc: Herbert Xu Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/zswap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 9a3237752082..9dec853647c8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -162,6 +162,7 @@ struct crypto_acomp_ctx { struct crypto_wait wait; u8 *buffer; struct mutex mutex; + bool is_sleepable; }; /* @@ -951,6 +952,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) goto acomp_fail; } acomp_ctx->acomp = acomp; + acomp_ctx->is_sleepable = acomp_is_async(acomp); req = acomp_request_alloc(acomp_ctx->acomp); if (!req) { @@ -1078,7 +1080,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct page *page) mutex_lock(&acomp_ctx->mutex); src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); - if (!zpool_can_sleep_mapped(zpool)) { + if (acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) { memcpy(acomp_ctx->buffer, src, entry->length); src = acomp_ctx->buffer; zpool_unmap_handle(zpool, entry->handle); @@ -1092,7 +1094,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct page *page) BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); mutex_unlock(&acomp_ctx->mutex); - if (zpool_can_sleep_mapped(zpool)) + if (!acomp_ctx->is_sleepable || zpool_can_sleep_mapped(zpool)) zpool_unmap_handle(zpool, entry->handle); }