diff --git a/MAINTAINERS b/MAINTAINERS index 7a2f487ea49a..a55d4ef91b0b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5179,6 +5179,7 @@ T: git git://people.freedesktop.org/~thomash/linux S: Supported F: drivers/gpu/drm/vmwgfx/ F: include/uapi/drm/vmwgfx_drm.h +F: mm/as_dirty_helpers.c DRM DRIVERS M: David Airlie diff --git a/include/linux/mm.h b/include/linux/mm.h index 3d06ce2a64af..a0bc2a82917e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2685,7 +2685,14 @@ struct pfn_range_apply { }; extern int apply_to_pfn_range(struct pfn_range_apply *closure, unsigned long address, unsigned long size); - +unsigned long apply_as_wrprotect(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr); +unsigned long apply_as_clean(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr, + pgoff_t bitmap_pgoff, + unsigned long *bitmap, + pgoff_t *start, + pgoff_t *end); #ifdef CONFIG_PAGE_POISONING extern bool page_poisoning_enabled(void); extern void kernel_poison_pages(struct page *page, int numpages, int enable); diff --git a/mm/Kconfig b/mm/Kconfig index f0c76ba47695..5006d0e6a5c7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -765,4 +765,7 @@ config GUP_BENCHMARK config ARCH_HAS_PTE_SPECIAL bool +config AS_DIRTY_HELPERS + bool + endmenu diff --git a/mm/Makefile b/mm/Makefile index ac5e5ba78874..f5d412bbc2f7 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -104,3 +104,4 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_HMM) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o +obj-$(CONFIG_AS_DIRTY_HELPERS) += as_dirty_helpers.o diff --git a/mm/as_dirty_helpers.c b/mm/as_dirty_helpers.c new file mode 100644 index 000000000000..f600e31534fb --- /dev/null +++ b/mm/as_dirty_helpers.c @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +/** + * struct apply_as - Closure structure for apply_as_range + * @base: struct pfn_range_apply we derive from + * @start: Address of first modified pte + * @end: Address of last modified pte + 1 + * @total: Total number of modified ptes + * @vma: Pointer to the struct vm_area_struct we're currently operating on + */ +struct apply_as { + struct pfn_range_apply base; + unsigned long start; + unsigned long end; + unsigned long total; + struct vm_area_struct *vma; +}; + +/** + * apply_pt_wrprotect - Leaf pte callback to write-protect a pte + * @pte: Pointer to the pte + * @token: Page table token, see apply_to_pfn_range() + * @addr: The virtual page address + * @closure: Pointer to a struct pfn_range_apply embedded in a + * struct apply_as + * + * The function write-protects a pte and records the range in + * virtual address space of touched ptes for efficient range TLB flushes. + * + * Return: Always zero. + */ +static int apply_pt_wrprotect(pte_t *pte, pgtable_t token, + unsigned long addr, + struct pfn_range_apply *closure) +{ + struct apply_as *aas = container_of(closure, typeof(*aas), base); + pte_t ptent = *pte; + + if (pte_write(ptent)) { + pte_t old_pte = ptep_modify_prot_start(aas->vma, addr, pte); + + ptent = pte_wrprotect(old_pte); + ptep_modify_prot_commit(aas->vma, addr, pte, old_pte, ptent); + aas->total++; + aas->start = min(aas->start, addr); + aas->end = max(aas->end, addr + PAGE_SIZE); + } + + return 0; +} + +/** + * struct apply_as_clean - Closure structure for apply_as_clean + * @base: struct apply_as we derive from + * @bitmap_pgoff: Address_space Page offset of the first bit in @bitmap + * @bitmap: Bitmap with one bit for each page offset in the address_space range + * covered. + * @start: Address_space page offset of first modified pte relative + * to @bitmap_pgoff + * @end: Address_space page offset of last modified pte relative + * to @bitmap_pgoff + */ +struct apply_as_clean { + struct apply_as base; + pgoff_t bitmap_pgoff; + unsigned long *bitmap; + pgoff_t start; + pgoff_t end; +}; + +/** + * apply_pt_clean - Leaf pte callback to clean a pte + * @pte: Pointer to the pte + * @token: Page table token, see apply_to_pfn_range() + * @addr: The virtual page address + * @closure: Pointer to a struct pfn_range_apply embedded in a + * struct apply_as_clean + * + * The function cleans a pte and records the range in + * virtual address space of touched ptes for efficient TLB flushes. + * It also records dirty ptes in a bitmap representing page offsets + * in the address_space, as well as the first and last of the bits + * touched. + * + * Return: Always zero. + */ +static int apply_pt_clean(pte_t *pte, pgtable_t token, + unsigned long addr, + struct pfn_range_apply *closure) +{ + struct apply_as *aas = container_of(closure, typeof(*aas), base); + struct apply_as_clean *clean = container_of(aas, typeof(*clean), base); + pte_t ptent = *pte; + + if (pte_dirty(ptent)) { + pgoff_t pgoff = ((addr - aas->vma->vm_start) >> PAGE_SHIFT) + + aas->vma->vm_pgoff - clean->bitmap_pgoff; + pte_t old_pte = ptep_modify_prot_start(aas->vma, addr, pte); + + ptent = pte_mkclean(old_pte); + ptep_modify_prot_commit(aas->vma, addr, pte, old_pte, ptent); + + aas->total++; + aas->start = min(aas->start, addr); + aas->end = max(aas->end, addr + PAGE_SIZE); + + __set_bit(pgoff, clean->bitmap); + clean->start = min(clean->start, pgoff); + clean->end = max(clean->end, pgoff + 1); + } + + return 0; +} + +/** + * apply_as_range - Apply a pte callback to all PTEs pointing into a range + * of an address_space. + * @mapping: Pointer to the struct address_space + * @aas: Closure structure + * @first_index: First page offset in the address_space + * @nr: Number of incremental page offsets to cover + * + * Return: Number of ptes touched. Note that this number might be larger + * than @nr if there are overlapping vmas + */ +static unsigned long apply_as_range(struct address_space *mapping, + struct apply_as *aas, + pgoff_t first_index, pgoff_t nr) +{ + struct vm_area_struct *vma; + pgoff_t vba, vea, cba, cea; + unsigned long start_addr, end_addr; + struct mmu_notifier_range range; + + i_mmap_lock_read(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index, + first_index + nr - 1) { + unsigned long vm_flags = READ_ONCE(vma->vm_flags); + + /* + * We can only do advisory flag tests below, since we can't + * require the vm's mmap_sem to be held to protect the flags. + * Therefore, callers that strictly depend on specific mmap + * flags to remain constant throughout the operation must + * either ensure those flags are immutable for all relevant + * vmas or can't use this function. Fixing this properly would + * require the vma::vm_flags to be protected by a separate + * lock taken after the i_mmap_lock + */ + + /* Skip non-applicable VMAs */ + if ((vm_flags & (VM_SHARED | VM_WRITE)) != + (VM_SHARED | VM_WRITE)) + continue; + + /* Warn on and skip VMAs whose flags indicate illegal usage */ + if (WARN_ON((vm_flags & (VM_HUGETLB | VM_IO)) != VM_IO)) + continue; + + /* Clip to the vma */ + vba = vma->vm_pgoff; + vea = vba + vma_pages(vma); + cba = first_index; + cba = max(cba, vba); + cea = first_index + nr; + cea = min(cea, vea); + + /* Translate to virtual address */ + start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start; + end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start; + if (start_addr >= end_addr) + continue; + + aas->base.mm = vma->vm_mm; + aas->vma = vma; + aas->start = end_addr; + aas->end = start_addr; + + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, + vma, vma->vm_mm, start_addr, end_addr); + mmu_notifier_invalidate_range_start(&range); + + /* Needed when we only change protection? */ + flush_cache_range(vma, start_addr, end_addr); + + /* + * We're not using tlb_gather_mmu() since typically + * only a small subrange of PTEs are affected. + */ + inc_tlb_flush_pending(vma->vm_mm); + + /* Should not error since aas->base.alloc == 0 */ + WARN_ON(apply_to_pfn_range(&aas->base, start_addr, + end_addr - start_addr)); + if (aas->end > aas->start) + flush_tlb_range(vma, aas->start, aas->end); + + mmu_notifier_invalidate_range_end(&range); + dec_tlb_flush_pending(vma->vm_mm); + } + i_mmap_unlock_read(mapping); + + return aas->total; +} + +/** + * apply_as_wrprotect - Write-protect all ptes in an address_space range + * @mapping: The address_space we want to write protect + * @first_index: The first page offset in the range + * @nr: Number of incremental page offsets to cover + * + * WARNING: This function should only be used for address spaces whose + * vmas are marked VM_IO and that do not contain huge pages. + * To avoid interference with COW'd pages, vmas not marked VM_SHARED are + * simply skipped. + * + * Return: The number of ptes actually write-protected. Note that + * already write-protected ptes are not counted. + */ +unsigned long apply_as_wrprotect(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr) +{ + struct apply_as aas = { + .base = { + .alloc = 0, + .ptefn = apply_pt_wrprotect, + }, + .total = 0, + }; + + return apply_as_range(mapping, &aas, first_index, nr); +} +EXPORT_SYMBOL_GPL(apply_as_wrprotect); + +/** + * apply_as_clean - Clean all ptes in an address_space range + * @mapping: The address_space we want to clean + * @first_index: The first page offset in the range + * @nr: Number of incremental page offsets to cover + * @bitmap_pgoff: The page offset of the first bit in @bitmap + * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to + * cover the whole range @first_index..@first_index + @nr. + * @start: Pointer to number of the first set bit in @bitmap. + * is modified as new bits are set by the function. + * @end: Pointer to the number of the last set bit in @bitmap. + * none set. The value is modified as new bits are set by the function. + * + * Note: When this function returns there is no guarantee that a CPU has + * not already dirtied new ptes. However it will not clean any ptes not + * reported in the bitmap. + * + * If a caller needs to make sure all dirty ptes are picked up and none + * additional are added, it first needs to write-protect the address-space + * range and make sure new writers are blocked in page_mkwrite() or + * pfn_mkwrite(). And then after a TLB flush following the write-protection + * pick up all dirty bits. + * + * WARNING: This function should only be used for address spaces whose + * vmas are marked VM_IO and that do not contain huge pages. + * To avoid interference with COW'd pages, vmas not marked VM_SHARED are + * simply skipped. + * + * Return: The number of dirty ptes actually cleaned. + */ +unsigned long apply_as_clean(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr, + pgoff_t bitmap_pgoff, + unsigned long *bitmap, + pgoff_t *start, + pgoff_t *end) +{ + bool none_set = (*start >= *end); + struct apply_as_clean clean = { + .base = { + .base = { + .alloc = 0, + .ptefn = apply_pt_clean, + }, + .total = 0, + }, + .bitmap_pgoff = bitmap_pgoff, + .bitmap = bitmap, + .start = none_set ? nr : *start, + .end = none_set ? 0 : *end, + }; + unsigned long ret = apply_as_range(mapping, &clean.base, first_index, + nr); + + *start = clean.start; + *end = clean.end; + return ret; +} +EXPORT_SYMBOL_GPL(apply_as_clean);