diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 0dc85556dec5..ec98e526167e 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -145,6 +145,7 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, static const struct mm_walk_ops subpage_walk_ops = { .pmd_entry = subpage_walk_pmd_entry, + .walk_lock = PGWALK_WRLOCK_VERIFY, }; static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c index ea3d61de065b..161d0b34c2cb 100644 --- a/arch/riscv/mm/pageattr.c +++ b/arch/riscv/mm/pageattr.c @@ -102,6 +102,7 @@ static const struct mm_walk_ops pageattr_ops = { .pmd_entry = pageattr_pmd_entry, .pte_entry = pageattr_pte_entry, .pte_hole = pageattr_pte_hole, + .walk_lock = PGWALK_RDLOCK, }; static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask, diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 9c8af31be970..906a7bfc2a78 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2514,6 +2514,7 @@ static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr, static const struct mm_walk_ops thp_split_walk_ops = { .pmd_entry = thp_split_walk_pmd_entry, + .walk_lock = PGWALK_WRLOCK_VERIFY, }; static inline void thp_split_mm(struct mm_struct *mm) @@ -2565,6 +2566,7 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start, static const struct mm_walk_ops zap_zero_walk_ops = { .pmd_entry = __zap_zero_pages, + .walk_lock = PGWALK_WRLOCK, }; /* @@ -2655,6 +2657,7 @@ static const struct mm_walk_ops enable_skey_walk_ops = { .hugetlb_entry = __s390_enable_skey_hugetlb, .pte_entry = __s390_enable_skey_pte, .pmd_entry = __s390_enable_skey_pmd, + .walk_lock = PGWALK_WRLOCK, }; int s390_enable_skey(void) @@ -2692,6 +2695,7 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr, static const struct mm_walk_ops reset_cmma_walk_ops = { .pte_entry = __s390_reset_cmma, + .walk_lock = PGWALK_WRLOCK, }; void s390_reset_cmma(struct mm_struct *mm) @@ -2728,6 +2732,7 @@ static int s390_gather_pages(pte_t *ptep, unsigned long addr, static const struct mm_walk_ops gather_pages_ops = { .pte_entry = s390_gather_pages, + .walk_lock = PGWALK_RDLOCK, }; /* diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 581691e4be49..7ec16879756e 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -725,6 +725,11 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, struct folio *folio = fbatch.folios[i]; folio_lock(folio); + if (unlikely(folio->mapping != mapping)) { + /* Exclude folios removed from the address space */ + folio_unlock(folio); + continue; + } head = folio_buffers(folio); if (!head) { create_empty_buffers(&folio->page, i_blocksize(inode), 0); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 507cd4e59d07..fafff1bd34cd 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -587,8 +587,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, bool migration = false; if (pmd_present(*pmd)) { - /* FOLL_DUMP will return -EFAULT on huge zero page */ - page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + page = vm_normal_page_pmd(vma, addr, *pmd); } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { swp_entry_t entry = pmd_to_swp_entry(*pmd); @@ -758,12 +757,14 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, static const struct mm_walk_ops smaps_walk_ops = { .pmd_entry = smaps_pte_range, .hugetlb_entry = smaps_hugetlb_range, + .walk_lock = PGWALK_RDLOCK, }; static const struct mm_walk_ops smaps_shmem_walk_ops = { .pmd_entry = smaps_pte_range, .hugetlb_entry = smaps_hugetlb_range, .pte_hole = smaps_pte_hole, + .walk_lock = PGWALK_RDLOCK, }; /* @@ -1245,6 +1246,7 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end, static const struct mm_walk_ops clear_refs_walk_ops = { .pmd_entry = clear_refs_pte_range, .test_walk = clear_refs_test_walk, + .walk_lock = PGWALK_WRLOCK, }; static ssize_t clear_refs_write(struct file *file, const char __user *buf, @@ -1622,6 +1624,7 @@ static const struct mm_walk_ops pagemap_ops = { .pmd_entry = pagemap_pmd_range, .pte_hole = pagemap_pte_hole, .hugetlb_entry = pagemap_hugetlb_range, + .walk_lock = PGWALK_RDLOCK, }; /* @@ -1935,6 +1938,7 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, static const struct mm_walk_ops show_numa_ops = { .hugetlb_entry = gather_hugetlb_stats, .pmd_entry = gather_pte_stats, + .walk_lock = PGWALK_RDLOCK, }; /* diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 20284387b841..e718dbe928ba 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -25,9 +25,6 @@ static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) #endif vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf); -struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmd, - unsigned int flags); bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long next); int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, diff --git a/include/linux/mm.h b/include/linux/mm.h index 406ab9ea818f..34f9dba17c1a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3421,15 +3421,24 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) * Indicates whether GUP can follow a PROT_NONE mapped page, or whether * a (NUMA hinting) fault is required. */ -static inline bool gup_can_follow_protnone(unsigned int flags) +static inline bool gup_can_follow_protnone(struct vm_area_struct *vma, + unsigned int flags) { /* - * FOLL_FORCE has to be able to make progress even if the VMA is - * inaccessible. Further, FOLL_FORCE access usually does not represent - * application behaviour and we should avoid triggering NUMA hinting - * faults. + * If callers don't want to honor NUMA hinting faults, no need to + * determine if we would actually have to trigger a NUMA hinting fault. */ - return flags & FOLL_FORCE; + if (!(flags & FOLL_HONOR_NUMA_FAULT)) + return true; + + /* + * NUMA hinting faults don't apply in inaccessible (PROT_NONE) VMAs. + * + * Requiring a fault here even for inaccessible VMAs would mean that + * FOLL_FORCE cannot make any progress, because handle_mm_fault() + * refuses to process NUMA hinting faults in inaccessible VMAs. + */ + return !vma_is_accessible(vma); } typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5e74ce4a28cd..7d30dc4ff0ff 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1286,6 +1286,15 @@ enum { FOLL_PCI_P2PDMA = 1 << 10, /* allow interrupts from generic signals */ FOLL_INTERRUPTIBLE = 1 << 11, + /* + * Always honor (trigger) NUMA hinting faults. + * + * FOLL_WRITE implicitly honors NUMA hinting faults because a + * PROT_NONE-mapped page is not writable (exceptions with FOLL_FORCE + * apply). get_user_pages_fast_only() always implicitly honors NUMA + * hinting faults. + */ + FOLL_HONOR_NUMA_FAULT = 1 << 12, /* See also internal only FOLL flags in mm/internal.h */ }; diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 27a6df448ee5..27cd1e59ccf7 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -6,6 +6,16 @@ struct mm_walk; +/* Locking requirement during a page walk. */ +enum page_walk_lock { + /* mmap_lock should be locked for read to stabilize the vma tree */ + PGWALK_RDLOCK = 0, + /* vma will be write-locked during the walk */ + PGWALK_WRLOCK = 1, + /* vma is expected to be already write-locked during the walk */ + PGWALK_WRLOCK_VERIFY = 2, +}; + /** * struct mm_walk_ops - callbacks for walk_page_range * @pgd_entry: if set, called for each non-empty PGD (top-level) entry @@ -66,6 +76,7 @@ struct mm_walk_ops { int (*pre_vma)(unsigned long start, unsigned long end, struct mm_walk *walk); void (*post_vma)(struct mm_walk *walk); + enum page_walk_lock walk_lock; }; /* diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4dd73cf936a6..f723024e1426 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4265,6 +4265,10 @@ static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) * mas_wr_append: Attempt to append * @wr_mas: the maple write state * + * This is currently unsafe in rcu mode since the end of the node may be cached + * by readers while the node contents may be updated which could result in + * inaccurate information. + * * Return: True if appended, false otherwise */ static inline bool mas_wr_append(struct ma_wr_state *wr_mas) @@ -4274,6 +4278,9 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas) struct ma_state *mas = wr_mas->mas; unsigned char node_pivots = mt_pivots[wr_mas->type]; + if (mt_in_rcu(mas->tree)) + return false; + if (mas->offset != wr_mas->node_end) return false; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 1a31065b2036..976b9bd02a1b 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1136,7 +1136,6 @@ static void set_iter_tags(struct radix_tree_iter *iter, void __rcu **radix_tree_iter_resume(void __rcu **slot, struct radix_tree_iter *iter) { - slot++; iter->index = __radix_tree_iter_add(iter, 1); iter->next_index = iter->index; iter->tags = 0; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 2fcc9731528a..e0e59d420fca 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -386,6 +386,7 @@ out: static const struct mm_walk_ops damon_mkold_ops = { .pmd_entry = damon_mkold_pmd_entry, .hugetlb_entry = damon_mkold_hugetlb_entry, + .walk_lock = PGWALK_RDLOCK, }; static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) @@ -525,6 +526,7 @@ out: static const struct mm_walk_ops damon_young_ops = { .pmd_entry = damon_young_pmd_entry, .hugetlb_entry = damon_young_hugetlb_entry, + .walk_lock = PGWALK_RDLOCK, }; static bool damon_va_young(struct mm_struct *mm, unsigned long addr, diff --git a/mm/gup.c b/mm/gup.c index 76d222ccc3ff..6e2f9e9d6537 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -597,7 +597,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, pte = ptep_get(ptep); if (!pte_present(pte)) goto no_page; - if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) + if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags)) goto no_page; page = vm_normal_page(vma, address, pte); @@ -714,7 +714,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, if (likely(!pmd_trans_huge(pmdval))) return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); - if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags)) + if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags)) return no_page_table(vma, flags); ptl = pmd_lock(mm, pmd); @@ -851,6 +851,10 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, if (WARN_ON_ONCE(foll_flags & FOLL_PIN)) return NULL; + /* + * We never set FOLL_HONOR_NUMA_FAULT because callers don't expect + * to fail on PROT_NONE-mapped pages. + */ page = follow_page_mask(vma, address, foll_flags, &ctx); if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); @@ -2227,6 +2231,13 @@ static bool is_valid_gup_args(struct page **pages, int *locked, gup_flags |= FOLL_UNLOCKABLE; } + /* + * For now, always trigger NUMA hinting faults. Some GUP users like + * KVM require the hint to be as the calling context of GUP is + * functionally similar to a memory reference from task context. + */ + gup_flags |= FOLL_HONOR_NUMA_FAULT; + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) @@ -2551,7 +2562,14 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, struct page *page; struct folio *folio; - if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) + /* + * Always fallback to ordinary GUP on PROT_NONE-mapped pages: + * pte_access_permitted() better should reject these pages + * either way: otherwise, GUP-fast might succeed in + * cases where ordinary GUP would fail due to VMA access + * permissions. + */ + if (pte_protnone(pte)) goto pte_unmap; if (!pte_access_permitted(pte, flags & FOLL_WRITE)) @@ -2970,8 +2988,8 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || pmd_devmap(pmd))) { - if (pmd_protnone(pmd) && - !gup_can_follow_protnone(flags)) + /* See gup_pte_range() */ + if (pmd_protnone(pmd)) return 0; if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, @@ -3151,7 +3169,7 @@ static int internal_get_user_pages_fast(unsigned long start, if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_FORCE | FOLL_PIN | FOLL_GET | FOLL_FAST_ONLY | FOLL_NOFAULT | - FOLL_PCI_P2PDMA))) + FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT))) return -EINVAL; if (gup_flags & FOLL_PIN) diff --git a/mm/hmm.c b/mm/hmm.c index 855e25e59d8f..277ddcab4947 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -562,6 +562,7 @@ static const struct mm_walk_ops hmm_walk_ops = { .pte_hole = hmm_vma_walk_hole, .hugetlb_entry = hmm_vma_walk_hugetlb_entry, .test_walk = hmm_vma_walk_test, + .walk_lock = PGWALK_RDLOCK, }; /** diff --git a/mm/huge_memory.c b/mm/huge_memory.c index eb3678360b97..164d22365bde 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1467,8 +1467,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) return ERR_PTR(-EFAULT); - /* Full NUMA hinting faults to serialise migration in fault paths */ - if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags)) + if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags)) return NULL; if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page)) @@ -1613,7 +1612,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, * If other processes are mapping this folio, we couldn't discard * the folio unless they all do MADV_FREE so let's skip the folio. */ - if (folio_mapcount(folio) != 1) + if (folio_estimated_sharers(folio) != 1) goto out; if (!folio_trylock(folio)) diff --git a/mm/internal.h b/mm/internal.h index a7d9e980429a..8ed127c1c808 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -924,6 +924,13 @@ int migrate_device_coherent_page(struct page *page); struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); int __must_check try_grab_page(struct page *page, unsigned int flags); +/* + * mm/huge_memory.c + */ +struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmd, + unsigned int flags); + enum { /* mark page accessed */ FOLL_TOUCH = 1 << 16, @@ -997,6 +1004,16 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma, if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) smp_rmb(); + /* + * During GUP-fast we might not get called on the head page for a + * hugetlb page that is mapped using cont-PTE, because GUP-fast does + * not work with the abstracted hugetlb PTEs that always point at the + * head page. For hugetlb, PageAnonExclusive only applies on the head + * page (as it cannot be partially COW-shared), so lookup the head page. + */ + if (unlikely(!PageHead(page) && PageHuge(page))) + page = compound_head(page); + /* * Note that PageKsm() pages cannot be exclusive, and consequently, * cannot get pinned. diff --git a/mm/ksm.c b/mm/ksm.c index d20d7662419b..d7b5b95e936e 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -455,6 +455,12 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex static const struct mm_walk_ops break_ksm_ops = { .pmd_entry = break_ksm_pmd_entry, + .walk_lock = PGWALK_RDLOCK, +}; + +static const struct mm_walk_ops break_ksm_lock_vma_ops = { + .pmd_entry = break_ksm_pmd_entry, + .walk_lock = PGWALK_WRLOCK, }; /* @@ -470,16 +476,17 @@ static const struct mm_walk_ops break_ksm_ops = { * of the process that owns 'vma'. We also do not want to enforce * protection keys here anyway. */ -static int break_ksm(struct vm_area_struct *vma, unsigned long addr) +static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma) { vm_fault_t ret = 0; + const struct mm_walk_ops *ops = lock_vma ? + &break_ksm_lock_vma_ops : &break_ksm_ops; do { int ksm_page; cond_resched(); - ksm_page = walk_page_range_vma(vma, addr, addr + 1, - &break_ksm_ops, NULL); + ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL); if (WARN_ON_ONCE(ksm_page < 0)) return ksm_page; if (!ksm_page) @@ -565,7 +572,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item) mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (vma) - break_ksm(vma, addr); + break_ksm(vma, addr, false); mmap_read_unlock(mm); } @@ -871,7 +878,7 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) * in cmp_and_merge_page on one of the rmap_items we would be removing. */ static int unmerge_ksm_pages(struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, bool lock_vma) { unsigned long addr; int err = 0; @@ -882,7 +889,7 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, if (signal_pending(current)) err = -ERESTARTSYS; else - err = break_ksm(vma, addr); + err = break_ksm(vma, addr, lock_vma); } return err; } @@ -1029,7 +1036,7 @@ static int unmerge_and_remove_all_rmap_items(void) if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) continue; err = unmerge_ksm_pages(vma, - vma->vm_start, vma->vm_end); + vma->vm_start, vma->vm_end, false); if (err) goto error; } @@ -2530,7 +2537,7 @@ static int __ksm_del_vma(struct vm_area_struct *vma) return 0; if (vma->anon_vma) { - err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end); + err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true); if (err) return err; } @@ -2668,7 +2675,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, return 0; /* just ignore the advice */ if (vma->anon_vma) { - err = unmerge_ksm_pages(vma, start, end); + err = unmerge_ksm_pages(vma, start, end, true); if (err) return err; } diff --git a/mm/madvise.c b/mm/madvise.c index 886f06066622..ec30f48f8f2e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -233,6 +233,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, static const struct mm_walk_ops swapin_walk_ops = { .pmd_entry = swapin_walk_pmd_entry, + .walk_lock = PGWALK_RDLOCK, }; static void shmem_swapin_range(struct vm_area_struct *vma, @@ -383,7 +384,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, folio = pfn_folio(pmd_pfn(orig_pmd)); /* Do not interfere with other mappings of this folio */ - if (folio_mapcount(folio) != 1) + if (folio_estimated_sharers(folio) != 1) goto huge_unlock; if (pageout_anon_only_filter && !folio_test_anon(folio)) @@ -457,7 +458,7 @@ regular_folio: if (folio_test_large(folio)) { int err; - if (folio_mapcount(folio) != 1) + if (folio_estimated_sharers(folio) != 1) break; if (pageout_anon_only_filter && !folio_test_anon(folio)) break; @@ -534,6 +535,7 @@ regular_folio: static const struct mm_walk_ops cold_walk_ops = { .pmd_entry = madvise_cold_or_pageout_pte_range, + .walk_lock = PGWALK_RDLOCK, }; static void madvise_cold_page_range(struct mmu_gather *tlb, @@ -678,7 +680,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (folio_test_large(folio)) { int err; - if (folio_mapcount(folio) != 1) + if (folio_estimated_sharers(folio) != 1) break; if (!folio_trylock(folio)) break; @@ -757,6 +759,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, static const struct mm_walk_ops madvise_free_walk_ops = { .pmd_entry = madvise_free_pte_range, + .walk_lock = PGWALK_RDLOCK, }; static int madvise_free_single_vma(struct vm_area_struct *vma, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e8ca4bdcb03c..315fd5f45e3c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6024,6 +6024,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, static const struct mm_walk_ops precharge_walk_ops = { .pmd_entry = mem_cgroup_count_precharge_pte_range, + .walk_lock = PGWALK_RDLOCK, }; static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) @@ -6303,6 +6304,7 @@ put: /* get_mctgt_type() gets & locks the page */ static const struct mm_walk_ops charge_walk_ops = { .pmd_entry = mem_cgroup_move_charge_pte_range, + .walk_lock = PGWALK_RDLOCK, }; static void mem_cgroup_move_charge(void) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9a285038d765..fe121fdb05f7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -831,6 +831,7 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, static const struct mm_walk_ops hwp_walk_ops = { .pmd_entry = hwpoison_pte_range, .hugetlb_entry = hwpoison_hugetlb_range, + .walk_lock = PGWALK_RDLOCK, }; /* @@ -2740,10 +2741,13 @@ retry: if (ret > 0) { ret = soft_offline_in_use_page(page); } else if (ret == 0) { - if (!page_handle_poison(page, true, false) && try_again) { - try_again = false; - flags &= ~MF_COUNT_INCREASED; - goto retry; + if (!page_handle_poison(page, true, false)) { + if (try_again) { + try_again = false; + flags &= ~MF_COUNT_INCREASED; + goto retry; + } + ret = -EBUSY; } } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c53f8beeb507..ec2eaceffd74 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -718,6 +718,14 @@ static const struct mm_walk_ops queue_pages_walk_ops = { .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, + .walk_lock = PGWALK_RDLOCK, +}; + +static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { + .hugetlb_entry = queue_folios_hugetlb, + .pmd_entry = queue_folios_pte_range, + .test_walk = queue_pages_test_walk, + .walk_lock = PGWALK_WRLOCK, }; /* @@ -738,7 +746,7 @@ static const struct mm_walk_ops queue_pages_walk_ops = { static int queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + struct list_head *pagelist, bool lock_vma) { int err; struct queue_pages qp = { @@ -749,8 +757,10 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, .end = end, .first = NULL, }; + const struct mm_walk_ops *ops = lock_vma ? + &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; - err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp); + err = walk_page_range(mm, start, end, ops, &qp); if (!qp.first) /* whole range in hole */ @@ -1078,7 +1088,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, vma = find_vma(mm, 0); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, - flags | MPOL_MF_DISCONTIG_OK, &pagelist); + flags | MPOL_MF_DISCONTIG_OK, &pagelist, false); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, @@ -1321,12 +1331,8 @@ static long do_mbind(unsigned long start, unsigned long len, * Lock the VMAs before scanning for pages to migrate, to ensure we don't * miss a concurrently inserted page. */ - vma_iter_init(&vmi, mm, start); - for_each_vma_range(vmi, vma, end) - vma_start_write(vma); - ret = queue_pages_range(mm, start, end, nmask, - flags | MPOL_MF_INVERT, &pagelist); + flags | MPOL_MF_INVERT, &pagelist, true); if (ret < 0) { err = ret; diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 8365158460ed..d5f492356e3e 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -279,6 +279,7 @@ next: static const struct mm_walk_ops migrate_vma_walk_ops = { .pmd_entry = migrate_vma_collect_pmd, .pte_hole = migrate_vma_collect_hole, + .walk_lock = PGWALK_RDLOCK, }; /* diff --git a/mm/mincore.c b/mm/mincore.c index b7f7a516b26c..dad3622cc963 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -176,6 +176,7 @@ static const struct mm_walk_ops mincore_walk_ops = { .pmd_entry = mincore_pte_range, .pte_hole = mincore_unmapped_range, .hugetlb_entry = mincore_hugetlb, + .walk_lock = PGWALK_RDLOCK, }; /* diff --git a/mm/mlock.c b/mm/mlock.c index 0a0c996c5c21..479e09d0994c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -371,6 +371,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, { static const struct mm_walk_ops mlock_walk_ops = { .pmd_entry = mlock_pte_range, + .walk_lock = PGWALK_WRLOCK_VERIFY, }; /* diff --git a/mm/mprotect.c b/mm/mprotect.c index 6f658d483704..3aef1340533a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -568,6 +568,7 @@ static const struct mm_walk_ops prot_none_walk_ops = { .pte_entry = prot_none_pte_entry, .hugetlb_entry = prot_none_hugetlb_entry, .test_walk = prot_none_test, + .walk_lock = PGWALK_WRLOCK, }; int diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 2022333805d3..9b2d23fbf4d3 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -400,6 +400,33 @@ static int __walk_page_range(unsigned long start, unsigned long end, return err; } +static inline void process_mm_walk_lock(struct mm_struct *mm, + enum page_walk_lock walk_lock) +{ + if (walk_lock == PGWALK_RDLOCK) + mmap_assert_locked(mm); + else + mmap_assert_write_locked(mm); +} + +static inline void process_vma_walk_lock(struct vm_area_struct *vma, + enum page_walk_lock walk_lock) +{ +#ifdef CONFIG_PER_VMA_LOCK + switch (walk_lock) { + case PGWALK_WRLOCK: + vma_start_write(vma); + break; + case PGWALK_WRLOCK_VERIFY: + vma_assert_write_locked(vma); + break; + case PGWALK_RDLOCK: + /* PGWALK_RDLOCK is handled by process_mm_walk_lock */ + break; + } +#endif +} + /** * walk_page_range - walk page table with caller specific callbacks * @mm: mm_struct representing the target process of page table walk @@ -459,7 +486,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, if (!walk.mm) return -EINVAL; - mmap_assert_locked(walk.mm); + process_mm_walk_lock(walk.mm, ops->walk_lock); vma = find_vma(walk.mm, start); do { @@ -474,6 +501,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, if (ops->pte_hole) err = ops->pte_hole(start, next, -1, &walk); } else { /* inside vma */ + process_vma_walk_lock(vma, ops->walk_lock); walk.vma = vma; next = min(end, vma->vm_end); vma = find_vma(mm, vma->vm_end); @@ -549,7 +577,8 @@ int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, if (start < vma->vm_start || end > vma->vm_end) return -EINVAL; - mmap_assert_locked(walk.mm); + process_mm_walk_lock(walk.mm, ops->walk_lock); + process_vma_walk_lock(vma, ops->walk_lock); return __walk_page_range(start, end, &walk); } @@ -566,7 +595,8 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, if (!walk.mm) return -EINVAL; - mmap_assert_locked(walk.mm); + process_mm_walk_lock(walk.mm, ops->walk_lock); + process_vma_walk_lock(vma, ops->walk_lock); return __walk_page_range(vma->vm_start, vma->vm_end, &walk); } diff --git a/mm/shmem.c b/mm/shmem.c index f5af4b943e42..d963c747dabc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -806,14 +806,16 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, XA_STATE(xas, &mapping->i_pages, start); struct page *page; unsigned long swapped = 0; + unsigned long max = end - 1; rcu_read_lock(); - xas_for_each(&xas, page, end - 1) { + xas_for_each(&xas, page, max) { if (xas_retry(&xas, page)) continue; if (xa_is_value(page)) swapped++; - + if (xas.xa_index == max) + break; if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 93cf99aba335..228a4a5312f2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2979,6 +2979,10 @@ void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) free_vm_area(area); return NULL; } + + flush_cache_vmap((unsigned long)area->addr, + (unsigned long)area->addr + count * PAGE_SIZE); + return area->addr; } EXPORT_SYMBOL_GPL(vmap_pfn); diff --git a/mm/vmscan.c b/mm/vmscan.c index 1080209a568b..2fe4a11d63f4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4284,6 +4284,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_ static const struct mm_walk_ops mm_walk_ops = { .test_walk = should_skip_vma, .p4d_entry = walk_pud_range, + .walk_lock = PGWALK_RDLOCK, }; int err; @@ -4853,16 +4854,17 @@ void lru_gen_release_memcg(struct mem_cgroup *memcg) spin_lock_irq(&pgdat->memcg_lru.lock); - VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + if (hlist_nulls_unhashed(&lruvec->lrugen.list)) + goto unlock; gen = lruvec->lrugen.gen; - hlist_nulls_del_rcu(&lruvec->lrugen.list); + hlist_nulls_del_init_rcu(&lruvec->lrugen.list); pgdat->memcg_lru.nr_memcgs[gen]--; if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); - +unlock: spin_unlock_irq(&pgdat->memcg_lru.lock); } } @@ -5434,8 +5436,10 @@ restart: rcu_read_lock(); hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { - if (op) + if (op) { lru_gen_rotate_memcg(lruvec, op); + op = 0; + } mem_cgroup_put(memcg); @@ -5443,7 +5447,7 @@ restart: memcg = lruvec_memcg(lruvec); if (!mem_cgroup_tryget(memcg)) { - op = 0; + lru_gen_release_memcg(memcg); memcg = NULL; continue; } diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c index 54d09b820ed4..c037553cfd92 100644 --- a/tools/testing/selftests/cachestat/test_cachestat.c +++ b/tools/testing/selftests/cachestat/test_cachestat.c @@ -4,10 +4,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -15,6 +17,8 @@ #include "../kselftest.h" +#define NR_TESTS 9 + static const char * const dev_files[] = { "/dev/zero", "/dev/null", "/dev/urandom", "/proc/version", "/proc" @@ -90,6 +94,20 @@ out: return ret; } +/* + * fsync() is implemented via noop_fsync() on tmpfs. This makes the fsync() + * test fail below, so we need to check for test file living on a tmpfs. + */ +static bool is_on_tmpfs(int fd) +{ + struct statfs statfs_buf; + + if (fstatfs(fd, &statfs_buf)) + return false; + + return statfs_buf.f_type == TMPFS_MAGIC; +} + /* * Open/create the file at filename, (optionally) write random data to it * (exactly num_pages), then test the cachestat syscall on this file. @@ -97,13 +115,13 @@ out: * If test_fsync == true, fsync the file, then check the number of dirty * pages. */ -bool test_cachestat(const char *filename, bool write_random, bool create, - bool test_fsync, unsigned long num_pages, int open_flags, - mode_t open_mode) +static int test_cachestat(const char *filename, bool write_random, bool create, + bool test_fsync, unsigned long num_pages, + int open_flags, mode_t open_mode) { size_t PS = sysconf(_SC_PAGESIZE); int filesize = num_pages * PS; - bool ret = true; + int ret = KSFT_PASS; long syscall_ret; struct cachestat cs; struct cachestat_range cs_range = { 0, filesize }; @@ -112,7 +130,7 @@ bool test_cachestat(const char *filename, bool write_random, bool create, if (fd == -1) { ksft_print_msg("Unable to create/open file.\n"); - ret = false; + ret = KSFT_FAIL; goto out; } else { ksft_print_msg("Create/open %s\n", filename); @@ -121,7 +139,7 @@ bool test_cachestat(const char *filename, bool write_random, bool create, if (write_random) { if (!write_exactly(fd, filesize)) { ksft_print_msg("Unable to access urandom.\n"); - ret = false; + ret = KSFT_FAIL; goto out1; } } @@ -132,7 +150,7 @@ bool test_cachestat(const char *filename, bool write_random, bool create, if (syscall_ret) { ksft_print_msg("Cachestat returned non-zero.\n"); - ret = false; + ret = KSFT_FAIL; goto out1; } else { @@ -142,15 +160,17 @@ bool test_cachestat(const char *filename, bool write_random, bool create, if (cs.nr_cache + cs.nr_evicted != num_pages) { ksft_print_msg( "Total number of cached and evicted pages is off.\n"); - ret = false; + ret = KSFT_FAIL; } } } if (test_fsync) { - if (fsync(fd)) { + if (is_on_tmpfs(fd)) { + ret = KSFT_SKIP; + } else if (fsync(fd)) { ksft_print_msg("fsync fails.\n"); - ret = false; + ret = KSFT_FAIL; } else { syscall_ret = syscall(cachestat_nr, fd, &cs_range, &cs, 0); @@ -161,13 +181,13 @@ bool test_cachestat(const char *filename, bool write_random, bool create, print_cachestat(&cs); if (cs.nr_dirty) { - ret = false; + ret = KSFT_FAIL; ksft_print_msg( "Number of dirty should be zero after fsync.\n"); } } else { ksft_print_msg("Cachestat (after fsync) returned non-zero.\n"); - ret = false; + ret = KSFT_FAIL; goto out1; } } @@ -236,13 +256,29 @@ out: int main(void) { - int ret = 0; + int ret; + + ksft_print_header(); + + ret = syscall(__NR_cachestat, -1, NULL, NULL, 0); + if (ret == -1 && errno == ENOSYS) + ksft_exit_skip("cachestat syscall not available\n"); + + ksft_set_plan(NR_TESTS); + + if (ret == -1 && errno == EBADF) { + ksft_test_result_pass("bad file descriptor recognized\n"); + ret = 0; + } else { + ksft_test_result_fail("bad file descriptor ignored\n"); + ret = 1; + } for (int i = 0; i < 5; i++) { const char *dev_filename = dev_files[i]; if (test_cachestat(dev_filename, false, false, false, - 4, O_RDONLY, 0400)) + 4, O_RDONLY, 0400) == KSFT_PASS) ksft_test_result_pass("cachestat works with %s\n", dev_filename); else { ksft_test_result_fail("cachestat fails with %s\n", dev_filename); @@ -251,13 +287,27 @@ int main(void) } if (test_cachestat("tmpfilecachestat", true, true, - true, 4, O_CREAT | O_RDWR, 0400 | 0600)) + false, 4, O_CREAT | O_RDWR, 0600) == KSFT_PASS) ksft_test_result_pass("cachestat works with a normal file\n"); else { ksft_test_result_fail("cachestat fails with normal file\n"); ret = 1; } + switch (test_cachestat("tmpfilecachestat", true, true, + true, 4, O_CREAT | O_RDWR, 0600)) { + case KSFT_FAIL: + ksft_test_result_fail("cachestat fsync fails with normal file\n"); + ret = KSFT_FAIL; + break; + case KSFT_PASS: + ksft_test_result_pass("cachestat fsync works with a normal file\n"); + break; + case KSFT_SKIP: + ksft_test_result_skip("tmpfilecachestat is on tmpfs\n"); + break; + } + if (test_cachestat_shmem()) ksft_test_result_pass("cachestat works with a shmem file\n"); else { diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index 1b2cec9d18a4..ed2e50bb1e76 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -75,11 +75,11 @@ static int test_kmem_basic(const char *root) sleep(1); slab1 = cg_read_key_long(cg, "memory.stat", "slab "); - if (slab1 <= 0) + if (slab1 < 0) goto cleanup; current = cg_read_long(cg, "memory.current"); - if (current <= 0) + if (current < 0) goto cleanup; if (slab1 < slab0 / 2 && current < slab0 / 2) diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index 4adaad1b822f..20294553a5dd 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -57,9 +57,14 @@ enum { #define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) /* Just the flags we need, copied from mm.h: */ -#define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite */ +#ifndef FOLL_WRITE +#define FOLL_WRITE 0x01 /* check pte is writable */ +#endif + +#ifndef FOLL_LONGTERM +#define FOLL_LONGTERM 0x100 /* mapping lifetime is indefinite */ +#endif FIXTURE(hmm) { int fd;