diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f6c7ff316daf..30e7709a5121 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3992,6 +3992,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, unsigned long src_addr, struct page **pagep) { + int vm_shared = dst_vma->vm_flags & VM_SHARED; struct hstate *h = hstate_vma(dst_vma); pte_t _dst_pte; spinlock_t *ptl; @@ -4028,6 +4029,18 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, __SetPageUptodate(page); set_page_huge_active(page); + /* + * If shared, add to page cache + */ + if (vm_shared) { + struct address_space *mapping = dst_vma->vm_file->f_mapping; + pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); + + ret = huge_add_to_page_cache(page, mapping, idx); + if (ret) + goto out_release_nounlock; + } + ptl = huge_pte_lockptr(h, dst_mm, dst_pte); spin_lock(ptl); @@ -4035,8 +4048,12 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (!huge_pte_none(huge_ptep_get(dst_pte))) goto out_release_unlock; - ClearPagePrivate(page); - hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + if (vm_shared) { + page_dup_rmap(page, true); + } else { + ClearPagePrivate(page); + hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + } _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); if (dst_vma->vm_flags & VM_WRITE) @@ -4053,11 +4070,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); + if (vm_shared) + unlock_page(page); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); +out_release_nounlock: + if (vm_shared) + unlock_page(page); put_page(page); goto out; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index a0817cc470b0..1e5c2f94e8a3 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -154,6 +154,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, unsigned long len, bool zeropage) { + int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED; + int vm_shared = 
dst_vma->vm_flags & VM_SHARED; ssize_t err; pte_t *dst_pte; unsigned long src_addr, dst_addr; @@ -204,14 +206,14 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, goto out_unlock; /* - * Make sure the vma is not shared, that the remaining dst - * range is both valid and fully within a single existing vma. + * Make sure the remaining dst range is both valid and + * fully within a single existing vma. */ - if (dst_vma->vm_flags & VM_SHARED) - goto out_unlock; if (dst_start < dst_vma->vm_start || dst_start + len > dst_vma->vm_end) goto out_unlock; + + vm_shared = dst_vma->vm_flags & VM_SHARED; } if (WARN_ON(dst_addr & (vma_hpagesize - 1) || @@ -225,11 +227,13 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, goto out_unlock; /* - * Ensure the dst_vma has a anon_vma. + * If not shared, ensure the dst_vma has a anon_vma. */ err = -ENOMEM; - if (unlikely(anon_vma_prepare(dst_vma))) - goto out_unlock; + if (!vm_shared) { + if (unlikely(anon_vma_prepare(dst_vma))) + goto out_unlock; + } h = hstate_vma(dst_vma); @@ -266,6 +270,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, dst_addr, src_addr, &page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); + vm_alloc_shared = vm_shared; cond_resched(); @@ -305,18 +310,49 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, if (page) { /* * We encountered an error and are about to free a newly - * allocated huge page. It is possible that there was a - * reservation associated with the page that has been - * consumed. See the routine restore_reserve_on_error - * for details. Unfortunately, we can not call - * restore_reserve_on_error now as it would require holding - * mmap_sem. Clear the PagePrivate flag so that the global + * allocated huge page. + * + * Reservation handling is very subtle, and is different for + * private and shared mappings. See the routine + * restore_reserve_on_error for details. 
Unfortunately, we + * can not call restore_reserve_on_error now as it would + * require holding mmap_sem. + * + * If a reservation for the page existed in the reservation + * map of a private mapping, the map was modified to indicate + * the reservation was consumed when the page was allocated. + * We clear the PagePrivate flag now so that the global * reserve count will not be incremented in free_huge_page. * The reservation map will still indicate the reservation * was consumed and possibly prevent later page allocation. - * This is better than leaking a global reservation. + * This is better than leaking a global reservation. If no + * reservation existed, it is still safe to clear PagePrivate + * as no adjustments to reservation counts were made during + * allocation. + * + * The reservation map for shared mappings indicates which + * pages have reservations. When a huge page is allocated + * for an address with a reservation, no change is made to + * the reserve map. In this case PagePrivate will be set + * to indicate that the global reservation count should be + * incremented when the page is freed. This is the desired + * behavior. However, when a huge page is allocated for an + * address without a reservation a reservation entry is added + * to the reservation map, and PagePrivate will not be set. + * When the page is freed, the global reserve count will NOT + * be incremented and it will appear as though we have leaked a + * reserved page. In this case, set PagePrivate so that the + * global reserve count will be incremented to match the + * reservation map entry which was created. + * + * Note that vm_alloc_shared is based on the flags of the vma + * for which the page was originally allocated. dst_vma could + * be different or NULL on error.
*/ - ClearPagePrivate(page); + if (vm_alloc_shared) + SetPagePrivate(page); + else + ClearPagePrivate(page); put_page(page); } BUG_ON(copied < 0); @@ -372,8 +408,14 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, dst_vma = find_vma(dst_mm, dst_start); if (!dst_vma) goto out_unlock; - if (!vma_is_shmem(dst_vma) && dst_vma->vm_flags & VM_SHARED) + /* + * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but + * it will overwrite vm_ops, so vma_is_anonymous must return false. + */ + if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && + dst_vma->vm_flags & VM_SHARED)) goto out_unlock; + if (dst_start < dst_vma->vm_start || dst_start + len > dst_vma->vm_end) goto out_unlock;