diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index a18ec7f63888..56b4ee5a6c9e 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -71,6 +71,9 @@ #define MADV_COLD 20 /* deactivate these pages */ #define MADV_PAGEOUT 21 /* reclaim these pages */ +#define MADV_POPULATE_READ 22 /* populate (prefault) page tables readable */ +#define MADV_POPULATE_WRITE 23 /* populate (prefault) page tables writable */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 57dc2ac4f8bd..40b210c65a5a 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -98,6 +98,9 @@ #define MADV_COLD 20 /* deactivate these pages */ #define MADV_PAGEOUT 21 /* reclaim these pages */ +#define MADV_POPULATE_READ 22 /* populate (prefault) page tables readable */ +#define MADV_POPULATE_WRITE 23 /* populate (prefault) page tables writable */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index ab78cba446ed..9e3c010c0f61 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -52,6 +52,9 @@ #define MADV_COLD 20 /* deactivate these pages */ #define MADV_PAGEOUT 21 /* reclaim these pages */ +#define MADV_POPULATE_READ 22 /* populate (prefault) page tables readable */ +#define MADV_POPULATE_WRITE 23 /* populate (prefault) page tables writable */ + #define MADV_MERGEABLE 65 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index e5e643752947..b3a22095371b 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -106,6 +106,9 @@ #define MADV_COLD 20 /* deactivate these pages */ #define MADV_PAGEOUT 21 /* reclaim these pages */ +#define MADV_POPULATE_READ 22 /* populate (prefault) page tables readable */ +#define MADV_POPULATE_WRITE 23 /* populate (prefault) page tables writable */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index f94f65d429be..1567a3294c3d 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -72,6 +72,9 @@ #define MADV_COLD 20 /* deactivate these pages */ #define MADV_PAGEOUT 21 /* reclaim these pages */ +#define MADV_POPULATE_READ 22 /* populate (prefault) page tables readable */ +#define MADV_POPULATE_WRITE 23 /* populate (prefault) page tables writable */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/mm/gup.c b/mm/gup.c index 8651309f8ec3..728d996767cb 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1500,6 +1500,64 @@ long populate_vma_page_range(struct vm_area_struct *vma, NULL, NULL, locked); } +/* + * faultin_vma_page_range() - populate (prefault) page tables inside the + * given VMA range readable/writable + * + * This takes care of mlocking the pages, too, if VM_LOCKED is set. + * + * @vma: target vma + * @start: start address + * @end: end address + * @write: whether to prefault readable or writable + * @locked: whether the mmap_lock is still held + * + * Returns either number of processed pages in the vma, or a negative error + * code on error (see __get_user_pages()). + * + * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and + * covered by the VMA. + * + * If @locked is NULL, it may be held for read or write and will be unperturbed. + * + * If @locked is non-NULL, it must held for read only and may be released. If + * it's released, *@locked will be set to 0. + */ +long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end, bool write, int *locked) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long nr_pages = (end - start) / PAGE_SIZE; + int gup_flags; + + VM_BUG_ON(!PAGE_ALIGNED(start)); + VM_BUG_ON(!PAGE_ALIGNED(end)); + VM_BUG_ON_VMA(start < vma->vm_start, vma); + VM_BUG_ON_VMA(end > vma->vm_end, vma); + mmap_assert_locked(mm); + + /* + * FOLL_TOUCH: Mark page accessed and thereby young; will also mark + * the page dirty with FOLL_WRITE -- which doesn't make a + * difference with !FOLL_FORCE, because the page is writable + * in the page table. + * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit + * a poisoned page. + * FOLL_POPULATE: Always populate memory with VM_LOCKONFAULT. + * !FOLL_FORCE: Require proper access permissions. + */ + gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK | FOLL_HWPOISON; + if (write) + gup_flags |= FOLL_WRITE; + + /* + * See check_vma_flags(): Will return -EFAULT on incompatible mappings + * or with insufficient permissions. + */ + return __get_user_pages(mm, start, nr_pages, gup_flags, + NULL, NULL, locked); +} + /* * __mm_populate - populate and/or mlock pages within a range of address space. * diff --git a/mm/internal.h b/mm/internal.h index d9c2b33cd2ab..f38bc5d75ab7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -345,6 +345,9 @@ void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma); #ifdef CONFIG_MMU extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); +extern long faultin_vma_page_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + bool write, int *locked); extern void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); static inline void munlock_vma_pages_all(struct vm_area_struct *vma) diff --git a/mm/madvise.c b/mm/madvise.c index 63e489e5bfdb..6d3d348b17f4 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -53,6 +53,8 @@ static int madvise_need_mmap_write(int behavior) case MADV_COLD: case MADV_PAGEOUT: case MADV_FREE: + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ @@ -822,6 +824,61 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, return -EINVAL; } +static long madvise_populate(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + int behavior) +{ + const bool write = behavior == MADV_POPULATE_WRITE; + struct mm_struct *mm = vma->vm_mm; + unsigned long tmp_end; + int locked = 1; + long pages; + + *prev = vma; + + while (start < end) { + /* + * We might have temporarily dropped the lock. For example, + * our VMA might have been split. + */ + if (!vma || start >= vma->vm_end) { + vma = find_vma(mm, start); + if (!vma || start < vma->vm_start) + return -ENOMEM; + } + + tmp_end = min_t(unsigned long, end, vma->vm_end); + /* Populate (prefault) page tables readable/writable. */ + pages = faultin_vma_page_range(vma, start, tmp_end, write, + &locked); + if (!locked) { + mmap_read_lock(mm); + locked = 1; + *prev = NULL; + vma = NULL; + } + if (pages < 0) { + switch (pages) { + case -EINTR: + return -EINTR; + case -EFAULT: /* Incompatible mappings / permissions. */ + return -EINVAL; + case -EHWPOISON: + return -EHWPOISON; + default: + pr_warn_once("%s: unhandled return value: %ld\n", + __func__, pages); + fallthrough; + case -ENOMEM: + return -ENOMEM; + } + } + start += pages * PAGE_SIZE; + } + return 0; +} + /* * Application wants to free up the pages and associated backing store. * This is effectively punching a hole into the middle of a file. @@ -935,6 +992,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, case MADV_FREE: case MADV_DONTNEED: return madvise_dontneed_free(vma, prev, start, end, behavior); + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + return madvise_populate(vma, prev, start, end, behavior); default: return madvise_behavior(vma, prev, start, end, behavior); } @@ -955,6 +1015,8 @@ madvise_behavior_valid(int behavior) case MADV_FREE: case MADV_COLD: case MADV_PAGEOUT: + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: @@ -1042,6 +1104,10 @@ process_madvise_behavior_valid(int behavior) * easily if memory pressure happens. * MADV_PAGEOUT - the application is not expected to use this memory soon, * page out the pages in this range immediately. + * MADV_POPULATE_READ - populate (prefault) page tables readable by + * triggering read faults if required + * MADV_POPULATE_WRITE - populate (prefault) page tables writable by + * triggering write faults if required * * return values: * zero - success