diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e619ac10cd23..9104952d323d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -821,6 +821,7 @@ asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); +asmlinkage long sys_mseal(unsigned long start, size_t len, unsigned long flags); asmlinkage long sys_mbind(unsigned long start, unsigned long len, unsigned long mode, const unsigned long __user *nmask, diff --git a/mm/Makefile b/mm/Makefile index 85f29ef7bedd..8fb85acda1b1 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -43,6 +43,10 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o endif +ifdef CONFIG_64BIT +mmu-$(CONFIG_MMU) += mseal.o +endif + obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page-writeback.o folio-compat.o \ readahead.o swap.o truncate.o vmscan.o shrinker.o \ diff --git a/mm/internal.h b/mm/internal.h index 2adabe369403..b2c75b12014e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1435,6 +1435,43 @@ void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); +#ifdef CONFIG_64BIT +/* VM is sealed, in vm_flags */ +#define VM_SEALED _BITUL(63) +#endif + +#ifdef CONFIG_64BIT +static inline int can_do_mseal(unsigned long flags) +{ + if (flags) + return -EINVAL; + + return 0; +} + +bool can_modify_mm(struct mm_struct *mm, unsigned long start, + unsigned long end); +bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, + unsigned long end, int behavior); +#else +static inline int can_do_mseal(unsigned long flags) +{ + return -EPERM; +} + +static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + return true; +} + +static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, + unsigned long end, int behavior) +{ + return true; +} +#endif + #ifdef CONFIG_SHRINKER_DEBUG static inline __printf(2, 0) int shrinker_debugfs_name_alloc( struct shrinker *shrinker, const char *fmt, va_list ap) diff --git a/mm/madvise.c b/mm/madvise.c index c8ba3f3eb54d..a77893462b92 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1401,6 +1401,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, * -EIO - an I/O error occurred while paging in data. * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. + * -EPERM - memory is sealed. */ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { @@ -1444,6 +1445,15 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh start = untagged_addr_remote(mm, start); end = start + len; + /* + * Check if the address range is sealed for do_madvise(). + * can_modify_mm_madv assumes we have acquired the lock on MM. + */ + if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) { + error = -EPERM; + goto out; + } + blk_start_plug(&plug); switch (behavior) { case MADV_POPULATE_READ: @@ -1456,6 +1466,8 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh break; } blk_finish_plug(&plug); + +out: if (write) mmap_write_unlock(mm); else diff --git a/mm/mmap.c b/mm/mmap.c index d6d8ab119b72..83b4682ec85c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1255,6 +1255,16 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (mm->map_count > sysctl_max_map_count) return -ENOMEM; + /* + * addr is returned from get_unmapped_area, + * There are two cases: + * 1> MAP_FIXED == false + * unallocated memory, no need to check sealing. + * 1> MAP_FIXED == true + * sealing is checked inside mmap_region when + * do_vmi_munmap is called. + */ + if (prot == PROT_EXEC) { pkey = execute_only_pkey(mm); if (pkey < 0) @@ -2727,6 +2737,14 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, if (end == start) return -EINVAL; + /* + * Check if memory is sealed before arch_unmap. + * Prevent unmapping a sealed VMA. + * can_modify_mm assumes we have acquired the lock on MM. + */ + if (unlikely(!can_modify_mm(mm, start, end))) + return -EPERM; + /* arch_unmap() might do unmaps itself. */ arch_unmap(mm, start, end); @@ -2789,7 +2807,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } /* Unmap any existing mapping in the area */ - if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) + error = do_vmi_munmap(&vmi, mm, addr, len, uf, false); + if (error == -EPERM) + return error; + else if (error) return -ENOMEM; /* @@ -3139,6 +3160,14 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; + /* + * Check if memory is sealed before arch_unmap. + * Prevent unmapping a sealed VMA. + * can_modify_mm assumes we have acquired the lock on MM. + */ + if (unlikely(!can_modify_mm(mm, start, end))) + return -EPERM; + arch_unmap(mm, start, end); return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); } diff --git a/mm/mprotect.c b/mm/mprotect.c index 94878c39ee32..8c6cd8825273 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -744,6 +745,15 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } } + /* + * checking if memory is sealed. + * can_modify_mm assumes we have acquired the lock on MM. + */ + if (unlikely(!can_modify_mm(current->mm, start, end))) { + error = -EPERM; + goto out; + } + prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; diff --git a/mm/mremap.c b/mm/mremap.c index f5aba752d35f..5f96bc5ee918 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -902,7 +902,25 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if ((mm->map_count + 2) >= sysctl_max_map_count - 3) return -ENOMEM; + /* + * In mremap_to(). + * Move a VMA to another location, check if src addr is sealed. + * + * Place can_modify_mm here because mremap_to() + * does its own checking for address range, and we only + * check the sealing after passing those checks. + * + * can_modify_mm assumes we have acquired the lock on MM. + */ + if (unlikely(!can_modify_mm(mm, addr, addr + old_len))) + return -EPERM; + if (flags & MREMAP_FIXED) { + /* + * In mremap_to(). + * VMA is moved to dst address, and munmap dst first. + * do_munmap will check if dst is sealed. + */ ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); if (ret) goto out; @@ -1061,6 +1079,19 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, goto out; } + /* + * Below is shrink/expand case (not mremap_to()) + * Check if src address is sealed, if so, reject. + * In other words, prevent shrinking or expanding a sealed VMA. + * + * Place can_modify_mm here so we can keep the logic related to + * shrink/expand together. + */ + if (unlikely(!can_modify_mm(mm, addr, addr + old_len))) { + ret = -EPERM; + goto out; + } + /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. diff --git a/mm/mseal.c b/mm/mseal.c new file mode 100644 index 000000000000..bf783bba8ed0 --- /dev/null +++ b/mm/mseal.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implement mseal() syscall. + * + * Copyright (c) 2023,2024 Google, Inc. + * + * Author: Jeff Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static inline bool vma_is_sealed(struct vm_area_struct *vma) +{ + return (vma->vm_flags & VM_SEALED); +} + +static inline void set_vma_sealed(struct vm_area_struct *vma) +{ + vm_flags_set(vma, VM_SEALED); +} + +/* + * check if a vma is sealed for modification. + * return true, if modification is allowed. + */ +static bool can_modify_vma(struct vm_area_struct *vma) +{ + if (unlikely(vma_is_sealed(vma))) + return false; + + return true; +} + +static bool is_madv_discard(int behavior) +{ + return behavior & + (MADV_FREE | MADV_DONTNEED | MADV_DONTNEED_LOCKED | + MADV_REMOVE | MADV_DONTFORK | MADV_WIPEONFORK); +} + +static bool is_ro_anon(struct vm_area_struct *vma) +{ + /* check anonymous mapping. */ + if (vma->vm_file || vma->vm_flags & VM_SHARED) + return false; + + /* + * check for non-writable: + * PROT=RO or PKRU is not writeable. + */ + if (!(vma->vm_flags & VM_WRITE) || + !arch_vma_access_permitted(vma, true, false, false)) + return true; + + return false; +} + +/* + * Check if the vmas of a memory range are allowed to be modified. + * the memory ranger can have a gap (unallocated memory). + * return true, if it is allowed. + */ +bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end) +{ + struct vm_area_struct *vma; + + VMA_ITERATOR(vmi, mm, start); + + /* going through each vma to check. */ + for_each_vma_range(vmi, vma, end) { + if (unlikely(!can_modify_vma(vma))) + return false; + } + + /* Allow by default. */ + return true; +} + +/* + * Check if the vmas of a memory range are allowed to be modified by madvise. + * the memory ranger can have a gap (unallocated memory). + * return true, if it is allowed. + */ +bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end, + int behavior) +{ + struct vm_area_struct *vma; + + VMA_ITERATOR(vmi, mm, start); + + if (!is_madv_discard(behavior)) + return true; + + /* going through each vma to check. */ + for_each_vma_range(vmi, vma, end) + if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma))) + return false; + + /* Allow by default. */ + return true; +} + +static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, vm_flags_t newflags) +{ + int ret = 0; + vm_flags_t oldflags = vma->vm_flags; + + if (newflags == oldflags) + goto out; + + vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto out; + } + + set_vma_sealed(vma); +out: + *prev = vma; + return ret; +} + +/* + * Check for do_mseal: + * 1> start is part of a valid vma. + * 2> end is part of a valid vma. + * 3> No gap (unallocated address) between start and end. + * 4> map is sealable. + */ +static int check_mm_seal(unsigned long start, unsigned long end) +{ + struct vm_area_struct *vma; + unsigned long nstart = start; + + VMA_ITERATOR(vmi, current->mm, start); + + /* going through each vma to check. */ + for_each_vma_range(vmi, vma, end) { + if (vma->vm_start > nstart) + /* unallocated memory found. */ + return -ENOMEM; + + if (vma->vm_end >= end) + return 0; + + nstart = vma->vm_end; + } + + return -ENOMEM; +} + +/* + * Apply sealing. + */ +static int apply_mm_seal(unsigned long start, unsigned long end) +{ + unsigned long nstart; + struct vm_area_struct *vma, *prev; + + VMA_ITERATOR(vmi, current->mm, start); + + vma = vma_iter_load(&vmi); + /* + * Note: check_mm_seal should already checked ENOMEM case. + * so vma should not be null, same for the other ENOMEM cases. + */ + prev = vma_prev(&vmi); + if (start > vma->vm_start) + prev = vma; + + nstart = start; + for_each_vma_range(vmi, vma, end) { + int error; + unsigned long tmp; + vm_flags_t newflags; + + newflags = vma->vm_flags | VM_SEALED; + tmp = vma->vm_end; + if (tmp > end) + tmp = end; + error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags); + if (error) + return error; + nstart = vma_iter_end(&vmi); + } + + return 0; +} + +/* + * mseal(2) seals the VM's meta data from + * selected syscalls. + * + * addr/len: VM address range. + * + * The address range by addr/len must meet: + * start (addr) must be in a valid VMA. + * end (addr + len) must be in a valid VMA. + * no gap (unallocated memory) between start and end. + * start (addr) must be page aligned. + * + * len: len will be page aligned implicitly. + * + * Below VMA operations are blocked after sealing. + * 1> Unmapping, moving to another location, and shrinking + * the size, via munmap() and mremap(), can leave an empty + * space, therefore can be replaced with a VMA with a new + * set of attributes. + * 2> Moving or expanding a different vma into the current location, + * via mremap(). + * 3> Modifying a VMA via mmap(MAP_FIXED). + * 4> Size expansion, via mremap(), does not appear to pose any + * specific risks to sealed VMAs. It is included anyway because + * the use case is unclear. In any case, users can rely on + * merging to expand a sealed VMA. + * 5> mprotect and pkey_mprotect. + * 6> Some destructive madvice() behavior (e.g. MADV_DONTNEED) + * for anonymous memory, when users don't have write permission to the + * memory. Those behaviors can alter region contents by discarding pages, + * effectively a memset(0) for anonymous memory. + * + * flags: reserved. + * + * return values: + * zero: success. + * -EINVAL: + * invalid input flags. + * start address is not page aligned. + * Address arange (start + len) overflow. + * -ENOMEM: + * addr is not a valid address (not allocated). + * end (start + len) is not a valid address. + * a gap (unallocated memory) between start and end. + * -EPERM: + * - In 32 bit architecture, sealing is not supported. + * Note: + * user can call mseal(2) multiple times, adding a seal on an + * already sealed memory is a no-action (no error). + * + * unseal() is not supported. + */ +static int do_mseal(unsigned long start, size_t len_in, unsigned long flags) +{ + size_t len; + int ret = 0; + unsigned long end; + struct mm_struct *mm = current->mm; + + ret = can_do_mseal(flags); + if (ret) + return ret; + + start = untagged_addr(start); + if (!PAGE_ALIGNED(start)) + return -EINVAL; + + len = PAGE_ALIGN(len_in); + /* Check to see whether len was rounded up from small -ve to zero. */ + if (len_in && !len) + return -EINVAL; + + end = start + len; + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + /* + * First pass, this helps to avoid + * partial sealing in case of error in input address range, + * e.g. ENOMEM error. + */ + ret = check_mm_seal(start, end); + if (ret) + goto out; + + /* + * Second pass, this should success, unless there are errors + * from vma_modify_flags, e.g. merge/split error, or process + * reaching the max supported VMAs, however, those cases shall + * be rare. + */ + ret = apply_mm_seal(start, end); + +out: + mmap_write_unlock(current->mm); + return ret; +} + +SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long, + flags) +{ + return do_mseal(start, len, flags); +}