diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index f4f4077b97aa..53750224e176 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1003,6 +1003,35 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define arch_start_context_switch(prev) do {} while (0) #endif +/* + * When replacing an anonymous page by a real (!non) swap entry, we clear + * PG_anon_exclusive from the page and instead remember whether the flag was + * set in the swp pte. During fork(), we have to mark the entry as !exclusive + * (possibly shared). On swapin, we use that information to restore + * PG_anon_exclusive, which is very helpful in cases where we might have + * additional (e.g., FOLL_GET) references on a page and wouldn't be able to + * detect exclusivity. + * + * These functions don't apply to non-swap entries (e.g., migration, hwpoison, + * ...). + */ +#ifndef __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return pte; +} + +static inline int pte_swp_exclusive(pte_t pte) +{ + return false; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return pte; +} +#endif + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index e476a8fed537..d1b728904d4e 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -26,6 +26,8 @@ /* Clear all flags but only keep swp_entry_t related information */ static inline pte_t pte_swp_clear_flags(pte_t pte) { + if (pte_swp_exclusive(pte)) + pte = pte_swp_clear_exclusive(pte); if (pte_swp_soft_dirty(pte)) pte = pte_swp_clear_soft_dirty(pte); if (pte_swp_uffd_wp(pte)) diff --git a/mm/memory.c b/mm/memory.c index a75040a47fcc..bca60092b4d5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -792,6 +792,11 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, &src_mm->mmlist); spin_unlock(&mmlist_lock); } + /* Mark the swap entry as shared. */ + if (pte_swp_exclusive(*src_pte)) { + pte = pte_swp_clear_exclusive(*src_pte); + set_pte_at(src_mm, addr, src_pte, pte); + } rss[MM_SWAPENTS]++; } else if (is_migration_entry(entry)) { page = pfn_swap_entry_to_page(entry); @@ -3563,6 +3568,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) struct page *page = NULL, *swapcache; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; + bool exclusive = false; swp_entry_t entry; pte_t pte; int locked; @@ -3728,6 +3734,46 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) BUG_ON(!PageAnon(page) && PageMappedToDisk(page)); BUG_ON(PageAnon(page) && PageAnonExclusive(page)); + /* + * Check under PT lock (to protect against concurrent fork() sharing + * the swap entry concurrently) for certainly exclusive pages. + */ + if (!PageKsm(page)) { + /* + * Note that pte_swp_exclusive() == false for architectures + * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE. + */ + exclusive = pte_swp_exclusive(vmf->orig_pte); + if (page != swapcache) { + /* + * We have a fresh page that is not exposed to the + * swapcache -> certainly exclusive. + */ + exclusive = true; + } else if (exclusive && PageWriteback(page) && + (swp_swap_info(entry)->flags & SWP_STABLE_WRITES)) { + /* + * This is tricky: not all swap backends support + * concurrent page modifications while under writeback. + * + * So if we stumble over such a page in the swapcache + * we must not set the page exclusive, otherwise we can + * map it writable without further checks and modify it + * while still under writeback. + * + * For these problematic swap backends, simply drop the + * exclusive marker: this is perfectly fine as we start + * writeback only if we fully unmapped the page and + * there are no unexpected references on the page after + * unmapping succeeded. After fully unmapped, no + * further GUP references (FOLL_GET and FOLL_PIN) can + * appear, so dropping the exclusive marker and mapping + * it only R/O is fine. + */ + exclusive = false; + } + } + /* * Remove the swap entry and conditionally try to free up the swapcache. * We're already holding a reference on the page but haven't mapped it @@ -3742,11 +3788,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) pte = mk_pte(page, vma->vm_page_prot); /* - * Same logic as in do_wp_page(); however, optimize for fresh pages - * that are certainly not shared because we just allocated them without - * exposing them to the swapcache. + * Same logic as in do_wp_page(); however, optimize for pages that are + * certainly not shared either because we just allocated them without + * exposing them to the swapcache or because the swap entry indicates + * exclusivity. */ - if (!PageKsm(page) && (page != swapcache || page_count(page) == 1)) { + if (!PageKsm(page) && (exclusive || page_count(page) == 1)) { if (vmf->flags & FAULT_FLAG_WRITE) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; diff --git a/mm/rmap.c b/mm/rmap.c index f96cc7eb23ec..86ed2865210d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1698,14 +1698,15 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, break; } /* - * Note: We *don't* remember yet if the page was mapped - * exclusively in the swap entry, so swapin code has - * to re-determine that manually and might detect the - * page as possibly shared, for example, if there are - * other references on the page or if the page is under - * writeback. We made sure that there are no GUP pins - * on the page that would rely on it, so for GUP pins - * this is fine. + * Note: We *don't* remember if the page was mapped + * exclusively in the swap pte if the architecture + * doesn't support __HAVE_ARCH_PTE_SWP_EXCLUSIVE. In + * that case, swapin code has to re-determine that + * manually and might detect the page as possibly + * shared, for example, if there are other references on + * the page or if the page is under writeback. We made + * sure that there are no GUP pins on the page that + * would rely on it, so for GUP pins this is fine. */ if (list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); @@ -1716,6 +1717,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, dec_mm_counter(mm, MM_ANONPAGES); inc_mm_counter(mm, MM_SWAPENTS); swp_pte = swp_entry_to_pte(entry); + if (anon_exclusive) + swp_pte = pte_swp_mkexclusive(swp_pte); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_uffd_wp(pteval)) diff --git a/mm/swapfile.c b/mm/swapfile.c index a7847324d476..7279b2d2d71d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1804,7 +1804,18 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); if (page == swapcache) { - page_add_anon_rmap(page, vma, addr, RMAP_NONE); + rmap_t rmap_flags = RMAP_NONE; + + /* + * See do_swap_page(): PageWriteback() would be problematic. + * However, we do a wait_on_page_writeback() just before this + * call and have the page locked. + */ + VM_BUG_ON_PAGE(PageWriteback(page), page); + if (pte_swp_exclusive(*pte)) + rmap_flags |= RMAP_EXCLUSIVE; + + page_add_anon_rmap(page, vma, addr, rmap_flags); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr); lru_cache_add_inactive_or_unevictable(page, vma);