From 61e21cf2d2c3cc5e60e8d0a62a77e250fccda62c Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 27 Sep 2023 17:44:01 +0800 Subject: [PATCH 01/20] mm/page_alloc: correct start page when guard page debug is enabled When guard page debug is enabled and set_page_guard returns success, we miss to forward page to point to start of next split range and we will do split unexpectedly in page range without target page. Move start page update before set_page_guard to fix this. As we split to wrong target page, then splited pages are not able to merge back to original order when target page is put back and splited pages except target page is not usable. To be specific: Consider target page is the third page in buddy page with order 2. | buddy-2 | Page | Target | Page | After break down to target page, we will only set first page to Guard because of bug. | Guard | Page | Target | Page | When we try put_page_back_buddy with target page, the buddy page of target if neither guard nor buddy, Then it's not able to construct original page with order 2 | Guard | Page | buddy-0 | Page | All pages except target page is not in free list and is not usable. Link: https://lkml.kernel.org/r/20230927094401.68205-1-shikemeng@huaweicloud.com Fixes: 06be6ff3d2ec ("mm,hwpoison: rework soft offline for free pages") Signed-off-by: Kemeng Shi Acked-by: Naoya Horiguchi Cc: Matthew Wilcox (Oracle) Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 95546f376302..85741403948f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6475,6 +6475,7 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, next_page = page; current_buddy = page + size; } + page = next_page; if (set_page_guard(zone, current_buddy, high, migratetype)) continue; @@ -6482,7 +6483,6 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, if (current_buddy != target) { add_to_free_list(current_buddy, zone, high, migratetype); set_buddy_order(current_buddy, high); - page = next_page; } } } From 51f625377561e5b167da2db5aafb7ee268f691c5 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 28 Sep 2023 13:24:32 -0400 Subject: [PATCH 02/20] mm/mempolicy: fix set_mempolicy_home_node() previous VMA pointer The two users of mbind_range() are expecting that mbind_range() will update the pointer to the previous VMA, or return an error. However, set_mempolicy_home_node() does not call mbind_range() if there is no VMA policy. The fix is to update the pointer to the previous VMA prior to continuing iterating the VMAs when there is no policy. Users may experience a WARN_ON() during VMA policy updates when updating a range of VMAs on the home node. Link: https://lkml.kernel.org/r/20230928172432.2246534-1-Liam.Howlett@oracle.com Link: https://lore.kernel.org/linux-mm/CALcu4rbT+fMVNaO_F2izaCT+e7jzcAciFkOvk21HGJsmLcUuwQ@mail.gmail.com/ Fixes: f4e9e0e69468 ("mm/mempolicy: fix use-after-free of VMA iterator") Signed-off-by: Liam R. Howlett Reported-by: Yikebaer Aizezi Closes: https://lore.kernel.org/linux-mm/CALcu4rbT+fMVNaO_F2izaCT+e7jzcAciFkOvk21HGJsmLcUuwQ@mail.gmail.com/ Reviewed-by: Lorenzo Stoakes Cc: Signed-off-by: Andrew Morton --- mm/mempolicy.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f1b00d6ac7ee..29ebf1e7898c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1543,8 +1543,10 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le * the home node for vmas we already updated before. */ old = vma_policy(vma); - if (!old) + if (!old) { + prev = vma; continue; + } if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) { err = -EOPNOTSUPP; break; From e0f81ab1e4f42ffece6440dc78f583eb352b9a71 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Fri, 29 Sep 2023 10:19:41 -0700 Subject: [PATCH 03/20] mm: fix vm_brk_flags() to not bail out while holding lock Calling vm_brk_flags() with flags set other than VM_EXEC will exit the function without releasing the mmap_write_lock. Just do the sanity check before the lock is acquired. This doesn't fix an actual issue since no caller sets a flag other than VM_EXEC. Link: https://lkml.kernel.org/r/20230929171937.work.697-kees@kernel.org Fixes: 2e7ce7d354f2 ("mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()") Signed-off-by: Sebastian Ott Signed-off-by: Kees Cook Reviewed-by: Liam R. Howlett Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/mmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index b56a7f0c9f85..7ed286662839 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3143,13 +3143,13 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (!len) return 0; - if (mmap_write_lock_killable(mm)) - return -EINTR; - /* Until we need other flags, refuse anything except VM_EXEC. */ if ((flags & (~VM_EXEC)) != 0) return -EINVAL; + if (mmap_write_lock_killable(mm)) + return -EINTR; + ret = check_brk_limits(addr, len); if (ret) goto limits_failed; From 1419430c8abb5a00590169068590dd54d86590ba Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 29 Sep 2023 14:30:39 -0400 Subject: [PATCH 04/20] mmap: fix vma_iterator in error path of vma_merge() During the error path, the vma iterator may not be correctly positioned or set to the correct range. Undo the vma_prev() call by resetting to the passed in address. Re-walking to the same range will fix the range to the area previously passed in. Users would notice increased cycles as vma_merge() would be called an extra time with vma == prev, and thus would fail to merge and return. Link: https://lore.kernel.org/linux-mm/CAG48ez12VN1JAOtTNMY+Y2YnsU45yL5giS-Qn=ejtiHpgJAbdQ@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230929183041.2835469-2-Liam.Howlett@oracle.com Fixes: 18b098af2890 ("vma_merge: set vma iterator to correct position.") Signed-off-by: Liam R. Howlett Reported-by: Jann Horn Closes: https://lore.kernel.org/linux-mm/CAG48ez12VN1JAOtTNMY+Y2YnsU45yL5giS-Qn=ejtiHpgJAbdQ@mail.gmail.com/ Reviewed-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- mm/mmap.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 7ed286662839..a0917ed26057 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -975,7 +975,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, /* Error in anon_vma clone. */ if (err) - return NULL; + goto anon_vma_fail; if (vma_start < vma->vm_start || vma_end > vma->vm_end) vma_expanded = true; @@ -988,7 +988,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, } if (vma_iter_prealloc(vmi, vma)) - return NULL; + goto prealloc_fail; init_multi_vma_prep(&vp, vma, adjust, remove, remove2); VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && @@ -1016,6 +1016,12 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_complete(&vp, vmi, mm); khugepaged_enter_vma(res, vm_flags); return res; + +prealloc_fail: +anon_vma_fail: + vma_iter_set(vmi, addr); + vma_iter_load(vmi); + return NULL; } /* From 824135c46b00df7fb369ec7f1f8607427bbebeb0 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 29 Sep 2023 14:30:40 -0400 Subject: [PATCH 05/20] mmap: fix error paths with dup_anon_vma() When the calling function fails after the dup_anon_vma(), the duplication of the anon_vma is not being undone. Add the necessary unlink_anon_vma() call to the error paths that are missing them. This issue showed up during inspection of the error path in vma_merge() for an unrelated vma iterator issue. Users may experience increased memory usage, which may be problematic as the failure would likely be caused by a low memory situation. Link: https://lkml.kernel.org/r/20230929183041.2835469-3-Liam.Howlett@oracle.com Fixes: d4af56c5c7c6 ("mm: start tracking VMAs with maple tree") Signed-off-by: Liam R. Howlett Reviewed-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Cc: Jann Horn Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- mm/mmap.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index a0917ed26057..9e018d8dd7d6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -583,11 +583,12 @@ again: * dup_anon_vma() - Helper function to duplicate anon_vma * @dst: The destination VMA * @src: The source VMA + * @dup: Pointer to the destination VMA when successful. * * Returns: 0 on success. */ static inline int dup_anon_vma(struct vm_area_struct *dst, - struct vm_area_struct *src) + struct vm_area_struct *src, struct vm_area_struct **dup) { /* * Easily overlooked: when mprotect shifts the boundary, make sure the @@ -595,9 +596,15 @@ static inline int dup_anon_vma(struct vm_area_struct *dst, * anon pages imported. */ if (src->anon_vma && !dst->anon_vma) { + int ret; + vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; - return anon_vma_clone(dst, src); + ret = anon_vma_clone(dst, src); + if (ret) + return ret; + + *dup = dst; } return 0; @@ -624,6 +631,7 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *next) { + struct vm_area_struct *anon_dup = NULL; bool remove_next = false; struct vma_prepare vp; @@ -633,7 +641,7 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, remove_next = true; vma_start_write(next); - ret = dup_anon_vma(vma, next); + ret = dup_anon_vma(vma, next, &anon_dup); if (ret) return ret; } @@ -661,6 +669,8 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, return 0; nomem: + if (anon_dup) + unlink_anon_vmas(anon_dup); return -ENOMEM; } @@ -860,6 +870,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, { struct vm_area_struct *curr, *next, *res; struct vm_area_struct *vma, *adjust, *remove, *remove2; + struct vm_area_struct *anon_dup = NULL; struct vma_prepare vp; pgoff_t vma_pgoff; int err = 0; @@ -927,18 +938,18 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_start_write(next); remove = next; /* case 1 */ vma_end = next->vm_end; - err = dup_anon_vma(prev, next); + err = dup_anon_vma(prev, next, &anon_dup); if (curr) { /* case 6 */ vma_start_write(curr); remove = curr; remove2 = next; if (!next->anon_vma) - err = dup_anon_vma(prev, curr); + err = dup_anon_vma(prev, curr, &anon_dup); } } else if (merge_prev) { /* case 2 */ if (curr) { vma_start_write(curr); - err = dup_anon_vma(prev, curr); + err = dup_anon_vma(prev, curr, &anon_dup); if (end == curr->vm_end) { /* case 7 */ remove = curr; } else { /* case 5 */ @@ -954,7 +965,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_end = addr; adjust = next; adj_start = -(prev->vm_end - addr); - err = dup_anon_vma(next, prev); + err = dup_anon_vma(next, prev, &anon_dup); } else { /* * Note that cases 3 and 8 are the ONLY ones where prev @@ -968,7 +979,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_pgoff = curr->vm_pgoff; vma_start_write(curr); remove = curr; - err = dup_anon_vma(next, curr); + err = dup_anon_vma(next, curr, &anon_dup); } } } @@ -1018,6 +1029,9 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, return res; prealloc_fail: + if (anon_dup) + unlink_anon_vmas(anon_dup); + anon_vma_fail: vma_iter_set(vmi, addr); vma_iter_load(vmi); From 117b1bb0cbc7f5feab4fd251737869958987808c Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Thu, 28 Sep 2023 17:18:45 +0200 Subject: [PATCH 06/20] riscv: handle VM_FAULT_[HWPOISON|HWPOISON_LARGE] faults instead of panicking Patch series "Fix set_huge_pte_at()". A recent report [1] from Ryan for arm64 revealed that we do not handle swap entries when setting a hugepage backed by a NAPOT region (the contpte riscv equivalent). As explained in [1], the issue was discovered by a new test in kselftest which uses poison entries, but the symptoms are different from arm64 though: - the riscv kernel bugs because we do not handle VM_FAULT_HWPOISON*, this is fixed by patch 1, - after that, the test passes because the first pte_napot() fails (the poison entry does not have the N bit set), and then we only set the first page table entry covering the NAPOT hugepage, which is enough for hugetlb_fault() to correctly raise a VM_FAULT_HWPOISON wherever we write in this mapping since only this first page table entry is checked (see https://elixir.bootlin.com/linux/v6.6-rc3/source/mm/hugetlb.c#L6071). But this seems fragile so patch 2 sets all page table entries of a NAPOT mapping. [1]: https://lore.kernel.org/linux-arm-kernel/20230922115804.2043771-1-ryan.roberts@arm.com/ This patch (of 2): We used to panic when such faults were encountered but we should handle those faults gracefully for userspace by sending a SIGBUS to the process, like most architectures do. Link: https://lkml.kernel.org/r/20230928151846.8229-1-alexghiti@rivosinc.com Link: https://lkml.kernel.org/r/20230928151846.8229-2-alexghiti@rivosinc.com Signed-off-by: Alexandre Ghiti Acked-by: Palmer Dabbelt Cc: Albert Ou Cc: Andrew Jones Cc: Conor Dooley Cc: Paul Walmsley Cc: Qinglin Pan Signed-off-by: Andrew Morton --- arch/riscv/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 6115d7514972..90d4ba36d1d0 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -72,7 +72,7 @@ static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_f } pagefault_out_of_memory(); return; - } else if (fault & VM_FAULT_SIGBUS) { + } else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) { /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs)) { no_context(regs, addr); From 1de195dd0e05d9cba43dec16f83d4ee32af94dd2 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Thu, 28 Sep 2023 17:18:46 +0200 Subject: [PATCH 07/20] riscv: fix set_huge_pte_at() for NAPOT mappings when a swap entry is set We used to determine the number of page table entries to set for a NAPOT hugepage by using the pte value which actually fails when the pte to set is a swap entry. So take advantage of a recent fix for arm64 reported in [1] which introduces the size of the mapping as an argument of set_huge_pte_at(): we can then use this size to compute the number of page table entries to set for a NAPOT region. Link: https://lkml.kernel.org/r/20230928151846.8229-3-alexghiti@rivosinc.com Fixes: 82a1a1f3bfb6 ("riscv: mm: support Svnapot in hugetlb page") Signed-off-by: Alexandre Ghiti Reported-by: Ryan Roberts Closes: https://lore.kernel.org/linux-arm-kernel/20230922115804.2043771-1-ryan.roberts@arm.com/ [1] Reviewed-by: Andrew Jones Cc: Albert Ou Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Qinglin Pan Cc: Conor Dooley Signed-off-by: Andrew Morton --- arch/riscv/mm/hugetlbpage.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index e4a2ace92dbe..b52f0210481f 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -183,15 +183,22 @@ void set_huge_pte_at(struct mm_struct *mm, pte_t pte, unsigned long sz) { + unsigned long hugepage_shift; int i, pte_num; - if (!pte_napot(pte)) { - set_pte_at(mm, addr, ptep, pte); - return; - } + if (sz >= PGDIR_SIZE) + hugepage_shift = PGDIR_SHIFT; + else if (sz >= P4D_SIZE) + hugepage_shift = P4D_SHIFT; + else if (sz >= PUD_SIZE) + hugepage_shift = PUD_SHIFT; + else if (sz >= PMD_SIZE) + hugepage_shift = PMD_SHIFT; + else + hugepage_shift = PAGE_SHIFT; - pte_num = napot_pte_num(napot_cont_order(pte)); - for (i = 0; i < pte_num; i++, ptep++, addr += PAGE_SIZE) + pte_num = sz >> hugepage_shift; + for (i = 0; i < pte_num; i++, ptep++, addr += (1 << hugepage_shift)) set_pte_at(mm, addr, ptep, pte); } From 229e2253766c7cdfe024f1fe280020cc4711087c Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Tue, 3 Oct 2023 10:48:56 -0400 Subject: [PATCH 08/20] mm/migrate: fix do_pages_move for compat pointers do_pages_move does not handle compat pointers for the page list. correctly. Add in_compat_syscall check and appropriate get_user fetch when iterating the page list. It makes the syscall in compat mode (32-bit userspace, 64-bit kernel) work the same way as the native 32-bit syscall again, restoring the behavior before my broken commit 5b1b561ba73c ("mm: simplify compat_sys_move_pages"). More specifically, my patch moved the parsing of the 'pages' array from the main entry point into do_pages_stat(), which left the syscall working correctly for the 'stat' operation (nodes = NULL), while the 'move' operation (nodes != NULL) is now missing the conversion and interprets 'pages' as an array of 64-bit pointers instead of the intended 32-bit userspace pointers. It is possible that nobody noticed this bug because the few applications that actually call move_pages are unlikely to run in compat mode because of their large memory requirements, but this clearly fixes a user-visible regression and should have been caught by ltp. Link: https://lkml.kernel.org/r/20231003144857.752952-1-gregory.price@memverge.com Fixes: 5b1b561ba73c ("mm: simplify compat_sys_move_pages") Signed-off-by: Gregory Price Reported-by: Arnd Bergmann Co-developed-by: Arnd Bergmann Cc: Jonathan Cameron Cc: Signed-off-by: Andrew Morton --- mm/migrate.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 2053b54556ca..06086dc9da28 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2162,6 +2162,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, const int __user *nodes, int __user *status, int flags) { + compat_uptr_t __user *compat_pages = (void __user *)pages; int current_node = NUMA_NO_NODE; LIST_HEAD(pagelist); int start, i; @@ -2174,8 +2175,17 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, int node; err = -EFAULT; - if (get_user(p, pages + i)) - goto out_flush; + if (in_compat_syscall()) { + compat_uptr_t cp; + + if (get_user(cp, compat_pages + i)) + goto out_flush; + + p = compat_ptr(cp); + } else { + if (get_user(p, pages + i)) + goto out_flush; + } if (get_user(node, nodes + i)) goto out_flush; From 969d63e1af3b3abe35a49b08218f3125131ac32f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 6 Oct 2023 12:00:24 -0400 Subject: [PATCH 09/20] mm: zswap: fix pool refcount bug around shrink_worker() When a zswap store fails due to the limit, it acquires a pool reference and queues the shrinker. When the shrinker runs, it drops the reference. However, there can be multiple store attempts before the shrinker wakes up and runs once. This results in reference leaks and eventual saturation warnings for the pool refcount. Fix this by dropping the reference again when the shrinker is already queued. This ensures one reference per shrinker run. Link: https://lkml.kernel.org/r/20231006160024.170748-1-hannes@cmpxchg.org Fixes: 45190f01dd40 ("mm/zswap.c: add allocation hysteresis if pool limit is hit") Signed-off-by: Johannes Weiner Reported-by: Chris Mason Acked-by: Nhat Pham Cc: Vitaly Wool Cc: Domenico Cerasuolo Cc: [5.6+] Signed-off-by: Andrew Morton --- mm/zswap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 083c693602b8..37d2b1cb2ecb 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1383,8 +1383,8 @@ reject: shrink: pool = zswap_pool_last_get(); - if (pool) - queue_work(shrink_wq, &pool->shrink_work); + if (pool && !queue_work(shrink_wq, &pool->shrink_work)) + zswap_pool_put(pool); goto reject; } From 92fe9dcbe4e109a7ce6bab3e452210a35b0ab493 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 5 Oct 2023 23:59:06 -0400 Subject: [PATCH 10/20] hugetlbfs: clear resv_map pointer if mmap fails Patch series "hugetlbfs: close race between MADV_DONTNEED and page fault", v7. Malloc libraries, like jemalloc and tcalloc, take decisions on when to call madvise independently from the code in the main application. This sometimes results in the application page faulting on an address, right after the malloc library has shot down the backing memory with MADV_DONTNEED. Usually this is harmless, because we always have some 4kB pages sitting around to satisfy a page fault. However, with hugetlbfs systems often allocate only the exact number of huge pages that the application wants. Due to TLB batching, hugetlbfs MADV_DONTNEED will free pages outside of any lock taken on the page fault path, which can open up the following race condition: CPU 1 CPU 2 MADV_DONTNEED unmap page shoot down TLB entry page fault fail to allocate a huge page killed with SIGBUS free page Fix that race by extending the hugetlb_vma_lock locking scheme to also cover private hugetlb mappings (with resv_map), and pulling the locking from __unmap_hugepage_final_range into helper functions called from zap_page_range_single. This ensures page faults stay locked out of the MADV_DONTNEED VMA until the huge pages have actually been freed. This patch (of 3): Hugetlbfs leaves a dangling pointer in the VMA if mmap fails. This has not been a problem so far, but other code in this patch series tries to follow that pointer. Link: https://lkml.kernel.org/r/20231006040020.3677377-1-riel@surriel.com Link: https://lkml.kernel.org/r/20231006040020.3677377-2-riel@surriel.com Fixes: 04ada095dcfc ("hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing") Signed-off-by: Mike Kravetz Signed-off-by: Rik van Riel Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 52d26072dfda..467fedd42453 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1138,8 +1138,7 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); - set_vma_private_data(vma, (get_vma_private_data(vma) & - HPAGE_RESV_MASK) | (unsigned long)map); + set_vma_private_data(vma, (unsigned long)map); } static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) @@ -6811,8 +6810,10 @@ out_err: */ if (chg >= 0 && add < 0) region_abort(resv_map, from, to, regions_needed); - if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { kref_put(&resv_map->refs, resv_map_release); + set_vma_resv_map(vma, NULL); + } return false; } From bf4916922c60f43efaa329744b3eef539aa6a2b2 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 5 Oct 2023 23:59:07 -0400 Subject: [PATCH 11/20] hugetlbfs: extend hugetlb_vma_lock to private VMAs Extend the locking scheme used to protect shared hugetlb mappings from truncate vs page fault races, in order to protect private hugetlb mappings (with resv_map) against MADV_DONTNEED. Add a read-write semaphore to the resv_map data structure, and use that from the hugetlb_vma_(un)lock_* functions, in preparation for closing the race between MADV_DONTNEED and page faults. Link: https://lkml.kernel.org/r/20231006040020.3677377-3-riel@surriel.com Fixes: 04ada095dcfc ("hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing") Signed-off-by: Rik van Riel Reviewed-by: Mike Kravetz Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 ++++++ mm/hugetlb.c | 41 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a30686e649f7..065ec022cbb0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -60,6 +60,7 @@ struct resv_map { long adds_in_progress; struct list_head region_cache; long region_cache_count; + struct rw_semaphore rw_sema; #ifdef CONFIG_CGROUP_HUGETLB /* * On private mappings, the counter to uncharge reservations is stored @@ -1233,6 +1234,11 @@ static inline bool __vma_shareable_lock(struct vm_area_struct *vma) return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data; } +static inline bool __vma_private_lock(struct vm_area_struct *vma) +{ + return (!(vma->vm_flags & VM_MAYSHARE)) && vma->vm_private_data; +} + /* * Safe version of huge_pte_offset() to check the locks. See comments * above huge_pte_offset(). diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 467fedd42453..d218ee2b6a99 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -97,6 +97,7 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end); +static struct resv_map *vma_resv_map(struct vm_area_struct *vma); static inline bool subpool_is_free(struct hugepage_subpool *spool) { @@ -267,6 +268,10 @@ void hugetlb_vma_lock_read(struct vm_area_struct *vma) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_read(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + down_read(&resv_map->rw_sema); } } @@ -276,6 +281,10 @@ void hugetlb_vma_unlock_read(struct vm_area_struct *vma) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; up_read(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + up_read(&resv_map->rw_sema); } } @@ -285,6 +294,10 @@ void hugetlb_vma_lock_write(struct vm_area_struct *vma) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_write(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + down_write(&resv_map->rw_sema); } } @@ -294,17 +307,27 @@ void hugetlb_vma_unlock_write(struct vm_area_struct *vma) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; up_write(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + up_write(&resv_map->rw_sema); } } int hugetlb_vma_trylock_write(struct vm_area_struct *vma) { - struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; - if (!__vma_shareable_lock(vma)) - return 1; + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; - return down_write_trylock(&vma_lock->rw_sema); + return down_write_trylock(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + return down_write_trylock(&resv_map->rw_sema); + } + + return 1; } void hugetlb_vma_assert_locked(struct vm_area_struct *vma) @@ -313,6 +336,10 @@ void hugetlb_vma_assert_locked(struct vm_area_struct *vma) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; lockdep_assert_held(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + lockdep_assert_held(&resv_map->rw_sema); } } @@ -345,6 +372,11 @@ static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; __hugetlb_vma_unlock_write_put(vma_lock); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + /* no free for anon vmas, but still need to unlock */ + up_write(&resv_map->rw_sema); } } @@ -1068,6 +1100,7 @@ struct resv_map *resv_map_alloc(void) kref_init(&resv_map->refs); spin_lock_init(&resv_map->lock); INIT_LIST_HEAD(&resv_map->regions); + init_rwsem(&resv_map->rw_sema); resv_map->adds_in_progress = 0; /* From 2820b0f09be99f6406784b03a22dfc83e858449d Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 5 Oct 2023 23:59:08 -0400 Subject: [PATCH 12/20] hugetlbfs: close race between MADV_DONTNEED and page fault Malloc libraries, like jemalloc and tcalloc, take decisions on when to call madvise independently from the code in the main application. This sometimes results in the application page faulting on an address, right after the malloc library has shot down the backing memory with MADV_DONTNEED. Usually this is harmless, because we always have some 4kB pages sitting around to satisfy a page fault. However, with hugetlbfs systems often allocate only the exact number of huge pages that the application wants. Due to TLB batching, hugetlbfs MADV_DONTNEED will free pages outside of any lock taken on the page fault path, which can open up the following race condition: CPU 1 CPU 2 MADV_DONTNEED unmap page shoot down TLB entry page fault fail to allocate a huge page killed with SIGBUS free page Fix that race by pulling the locking from __unmap_hugepage_final_range into helper functions called from zap_page_range_single. This ensures page faults stay locked out of the MADV_DONTNEED VMA until the huge pages have actually been freed. Link: https://lkml.kernel.org/r/20231006040020.3677377-4-riel@surriel.com Fixes: 04ada095dcfc ("hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing") Signed-off-by: Rik van Riel Reviewed-by: Mike Kravetz Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 35 +++++++++++++++++++++++++++++++++-- mm/hugetlb.c | 36 +++++++++++++++++++++++------------- mm/memory.c | 13 ++++++++----- 3 files changed, 64 insertions(+), 20 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 065ec022cbb0..47d25a5e1933 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -139,7 +139,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *, zap_flags_t); -void __unmap_hugepage_range_final(struct mmu_gather *tlb, +void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags); @@ -246,6 +246,25 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end); +extern void __hugetlb_zap_begin(struct vm_area_struct *vma, + unsigned long *begin, unsigned long *end); +extern void __hugetlb_zap_end(struct vm_area_struct *vma, + struct zap_details *details); + +static inline void hugetlb_zap_begin(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ + if (is_vm_hugetlb_page(vma)) + __hugetlb_zap_begin(vma, start, end); +} + +static inline void hugetlb_zap_end(struct vm_area_struct *vma, + struct zap_details *details) +{ + if (is_vm_hugetlb_page(vma)) + __hugetlb_zap_end(vma, details); +} + void hugetlb_vma_lock_read(struct vm_area_struct *vma); void hugetlb_vma_unlock_read(struct vm_area_struct *vma); void hugetlb_vma_lock_write(struct vm_area_struct *vma); @@ -297,6 +316,18 @@ static inline void adjust_range_if_pmd_sharing_possible( { } +static inline void hugetlb_zap_begin( + struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ +} + +static inline void hugetlb_zap_end( + struct vm_area_struct *vma, + struct zap_details *details) +{ +} + static inline struct page *hugetlb_follow_page_mask( struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned int *page_mask) @@ -442,7 +473,7 @@ static inline long hugetlb_change_protection( return 0; } -static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, +static inline void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d218ee2b6a99..1301ba7b2c9a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5306,9 +5306,9 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, return len + old_addr - old_end; } -static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct page *ref_page, zap_flags_t zap_flags) +void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct page *ref_page, zap_flags_t zap_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -5437,16 +5437,25 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct tlb_flush_mmu_tlbonly(tlb); } -void __unmap_hugepage_range_final(struct mmu_gather *tlb, - struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page, - zap_flags_t zap_flags) +void __hugetlb_zap_begin(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) { - hugetlb_vma_lock_write(vma); - i_mmap_lock_write(vma->vm_file->f_mapping); + if (!vma->vm_file) /* hugetlbfs_file_mmap error */ + return; - /* mmu notification performed in caller */ - __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags); + adjust_range_if_pmd_sharing_possible(vma, start, end); + hugetlb_vma_lock_write(vma); + if (vma->vm_file) + i_mmap_lock_write(vma->vm_file->f_mapping); +} + +void __hugetlb_zap_end(struct vm_area_struct *vma, + struct zap_details *details) +{ + zap_flags_t zap_flags = details ? details->zap_flags : 0; + + if (!vma->vm_file) /* hugetlbfs_file_mmap error */ + return; if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */ /* @@ -5459,11 +5468,12 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, * someone else. */ __hugetlb_vma_unlock_write_free(vma); - i_mmap_unlock_write(vma->vm_file->f_mapping); } else { - i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); } + + if (vma->vm_file) + i_mmap_unlock_write(vma->vm_file->f_mapping); } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, diff --git a/mm/memory.c b/mm/memory.c index 6c264d2f969c..517221f01303 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1683,7 +1683,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) { zap_flags_t zap_flags = details ? details->zap_flags : 0; - __unmap_hugepage_range_final(tlb, vma, start, end, + __unmap_hugepage_range(tlb, vma, start, end, NULL, zap_flags); } } else @@ -1728,8 +1728,12 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); do { - unmap_single_vma(tlb, vma, start_addr, end_addr, &details, + unsigned long start = start_addr; + unsigned long end = end_addr; + hugetlb_zap_begin(vma, &start, &end); + unmap_single_vma(tlb, vma, start, end, &details, mm_wr_locked); + hugetlb_zap_end(vma, &details); } while ((vma = mas_find(mas, tree_end - 1)) != NULL); mmu_notifier_invalidate_range_end(&range); } @@ -1753,9 +1757,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); - if (is_vm_hugetlb_page(vma)) - adjust_range_if_pmd_sharing_possible(vma, &range.start, - &range.end); + hugetlb_zap_begin(vma, &range.start, &range.end); tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); @@ -1766,6 +1768,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unmap_single_vma(&tlb, vma, address, end, details, false); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); + hugetlb_zap_end(vma, details); } /** From babddbfb7d7d70ae7f10fedd75a45d8ad75fdddf Mon Sep 17 00:00:00 2001 From: Haibo Li Date: Mon, 9 Oct 2023 15:37:48 +0800 Subject: [PATCH 13/20] kasan: print the original fault addr when access invalid shadow when the checked address is illegal,the corresponding shadow address from kasan_mem_to_shadow may have no mapping in mmu table. Access such shadow address causes kernel oops. Here is a sample about oops on arm64(VA 39bit) with KASAN_SW_TAGS and KASAN_OUTLINE on: [ffffffb80aaaaaaa] pgd=000000005d3ce003, p4d=000000005d3ce003, pud=000000005d3ce003, pmd=0000000000000000 Internal error: Oops: 0000000096000006 [#1] PREEMPT SMP Modules linked in: CPU: 3 PID: 100 Comm: sh Not tainted 6.6.0-rc1-dirty #43 Hardware name: linux,dummy-virt (DT) pstate: 80000005 (Nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : __hwasan_load8_noabort+0x5c/0x90 lr : do_ib_ob+0xf4/0x110 ffffffb80aaaaaaa is the shadow address for efffff80aaaaaaaa. The problem is reading invalid shadow in kasan_check_range. The generic kasan also has similar oops. It only reports the shadow address which causes oops but not the original address. Commit 2f004eea0fc8("x86/kasan: Print original address on #GP") introduce to kasan_non_canonical_hook but limit it to KASAN_INLINE. This patch extends it to KASAN_OUTLINE mode. Link: https://lkml.kernel.org/r/20231009073748.159228-1-haibo.li@mediatek.com Fixes: 2f004eea0fc8("x86/kasan: Print original address on #GP") Signed-off-by: Haibo Li Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: AngeloGioacchino Del Regno Cc: Dmitry Vyukov Cc: Haibo Li Cc: Matthias Brugger Cc: Vincenzo Frascino Cc: Arnd Bergmann Cc: Kees Cook Signed-off-by: Andrew Morton --- include/linux/kasan.h | 6 +++--- mm/kasan/report.c | 4 +--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 842623d708c2..6dbb5ded3cf9 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -466,10 +466,10 @@ static inline void kasan_free_module_shadow(const struct vm_struct *vm) {} #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ -#ifdef CONFIG_KASAN_INLINE +#ifdef CONFIG_KASAN void kasan_non_canonical_hook(unsigned long addr); -#else /* CONFIG_KASAN_INLINE */ +#else /* CONFIG_KASAN */ static inline void kasan_non_canonical_hook(unsigned long addr) { } -#endif /* CONFIG_KASAN_INLINE */ +#endif /* CONFIG_KASAN */ #endif /* LINUX_KASAN_H */ diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ca4b6ff080a6..3974e4549c3e 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -621,9 +621,8 @@ void kasan_report_async(void) } #endif /* CONFIG_KASAN_HW_TAGS */ -#ifdef CONFIG_KASAN_INLINE /* - * With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high + * With CONFIG_KASAN, accesses to bogus pointers (outside the high * canonical half of the address space) cause out-of-bounds shadow memory reads * before the actual access. For addresses in the low canonical half of the * address space, as well as most non-canonical addresses, that out-of-bounds @@ -659,4 +658,3 @@ void kasan_non_canonical_hook(unsigned long addr) pr_alert("KASAN: %s in range [0x%016lx-0x%016lx]\n", bug_type, orig_addr, orig_addr + KASAN_GRANULE_SIZE - 1); } -#endif From 17c17567fe510857b18fe01b7a88027600e76ac6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 16 Oct 2023 22:08:38 +0200 Subject: [PATCH 14/20] kasan: disable kasan_non_canonical_hook() for HW tags On arm64, building with CONFIG_KASAN_HW_TAGS now causes a compile-time error: mm/kasan/report.c: In function 'kasan_non_canonical_hook': mm/kasan/report.c:637:20: error: 'KASAN_SHADOW_OFFSET' undeclared (first use in this function) 637 | if (addr < KASAN_SHADOW_OFFSET) | ^~~~~~~~~~~~~~~~~~~ mm/kasan/report.c:637:20: note: each undeclared identifier is reported only once for each function it appears in mm/kasan/report.c:640:77: error: expected expression before ';' token 640 | orig_addr = (addr - KASAN_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT; This was caused by removing the dependency on CONFIG_KASAN_INLINE that used to prevent this from happening. Use the more specific dependency on KASAN_SW_TAGS || KASAN_GENERIC to only ignore the function for hwasan mode. Link: https://lkml.kernel.org/r/20231016200925.984439-1-arnd@kernel.org Fixes: 12ec6a919b0f ("kasan: print the original fault addr when access invalid shadow") Signed-off-by: Arnd Bergmann Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Haibo Li Cc: Kees Cook Cc: Vincenzo Frascino Cc: AngeloGioacchino Del Regno Cc: Matthias Brugger Signed-off-by: Andrew Morton --- include/linux/kasan.h | 6 +++--- mm/kasan/report.c | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 6dbb5ded3cf9..71fa9a40fb5a 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -466,10 +466,10 @@ static inline void kasan_free_module_shadow(const struct vm_struct *vm) {} #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_non_canonical_hook(unsigned long addr); -#else /* CONFIG_KASAN */ +#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ static inline void kasan_non_canonical_hook(unsigned long addr) { } -#endif /* CONFIG_KASAN */ +#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ #endif /* LINUX_KASAN_H */ diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 3974e4549c3e..6e3cb118d20e 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -621,8 +621,9 @@ void kasan_report_async(void) } #endif /* CONFIG_KASAN_HW_TAGS */ +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) /* - * With CONFIG_KASAN, accesses to bogus pointers (outside the high + * With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high * canonical half of the address space) cause out-of-bounds shadow memory reads * before the actual access. For addresses in the low canonical half of the * address space, as well as most non-canonical addresses, that out-of-bounds @@ -658,3 +659,4 @@ void kasan_non_canonical_hook(unsigned long addr) pr_alert("KASAN: %s in range [0x%016lx-0x%016lx]\n", bug_type, orig_addr, orig_addr + KASAN_GRANULE_SIZE - 1); } +#endif From c5155d4ef4b2ac03cb5838b4060ab2ceb7dbda09 Mon Sep 17 00:00:00 2001 From: Ondrej Jirman Date: Sun, 8 Oct 2023 12:58:07 +0200 Subject: [PATCH 15/20] MAINTAINERS: Ondrej has moved Update my email-address in MAINTAINERS to . Also add .mailmap entries to map my old, now blocked, email address. Link: https://lkml.kernel.org/r/20231008105812.1084226-1-megi@xff.cz Signed-off-by: Ondrej Jirman Cc: Bjorn Andersson Cc: Heiko Stuebner Cc: Jakub Kicinski Cc: Jens Axboe Cc: Konrad Dybcio # qcom Cc: Mark Brown Cc: Qais Yousef Cc: Stephen Hemminger Signed-off-by: Andrew Morton --- .mailmap | 1 + MAINTAINERS | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index a0a6efe87186..2084bf8ec2ab 100644 --- a/.mailmap +++ b/.mailmap @@ -452,6 +452,7 @@ Oleksij Rempel Oleksij Rempel Oleksij Rempel Oliver Upton +Ondřej Jirman Oza Pawandeep Pali Rohár Paolo 'Blaisorblade' Giarrusso diff --git a/MAINTAINERS b/MAINTAINERS index 35977b269d5e..68ff5b62a1d2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6748,7 +6748,7 @@ F: drivers/gpu/drm/panel/panel-sitronix-st7701.c DRM DRIVER FOR SITRONIX ST7703 PANELS M: Guido Günther R: Purism Kernel Team -R: Ondrej Jirman +R: Ondrej Jirman S: Maintained F: Documentation/devicetree/bindings/display/panel/rocktech,jh057n00900.yaml F: drivers/gpu/drm/panel/panel-sitronix-st7703.c From 76b7069bcc89dec33f03eb08abee165d0306b754 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 7 Oct 2023 20:04:32 +0000 Subject: [PATCH 16/20] mm/damon/sysfs: check DAMOS regions update progress from before_terminate() DAMON_SYSFS can receive DAMOS tried regions update request while kdamond is already out of the main loop and before_terminate callback (damon_sysfs_before_terminate() in this case) is not yet called. And damon_sysfs_handle_cmd() can further be finished before the callback is invoked. Then, damon_sysfs_before_terminate() unlocks damon_sysfs_lock, which is not locked by anyone. This happens because the callback function assumes damon_sysfs_cmd_request_callback() should be called before it. Check if the assumption was true before doing the unlock, to avoid this problem. Link: https://lkml.kernel.org/r/20231007200432.3110-1-sj@kernel.org Fixes: f1d13cacabe1 ("mm/damon/sysfs: implement DAMOS tried regions update command") Signed-off-by: SeongJae Park Cc: [6.2.x] Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index b86ba7b0a921..f60e56150feb 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1208,6 +1208,8 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, return 0; } +static bool damon_sysfs_schemes_regions_updating; + static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; @@ -1219,8 +1221,10 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) cmd = damon_sysfs_cmd_request.cmd; if (kdamond && ctx == kdamond->damon_ctx && (cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS || - cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES)) { + cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES) && + damon_sysfs_schemes_regions_updating) { damon_sysfs_schemes_update_regions_stop(ctx); + damon_sysfs_schemes_regions_updating = false; mutex_unlock(&damon_sysfs_lock); } @@ -1340,7 +1344,6 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) { struct damon_sysfs_kdamond *kdamond; - static bool damon_sysfs_schemes_regions_updating; bool total_bytes_only = false; int err = 0; From 002e39e9ece5589140236558a23c63ca4da45b68 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 11 Oct 2023 17:01:04 +0200 Subject: [PATCH 17/20] mailmap: map Bartosz's old address to the current one I no longer work for BayLibre but many DT bindings have my BL address in the maintainers entries. Map it to the email address I use for kernel development. Link: https://lkml.kernel.org/r/20231011150104.73863-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski Suggested-by: Conor Dooley Cc: Bartosz Golaszewski Cc: Bjorn Andersson Cc: Heiko Stuebner Cc: Jakub Kicinski Cc: Jens Axboe Cc: Konrad Dybcio # qcom Cc: Qais Yousef Cc: Stephen Hemminger Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 2084bf8ec2ab..b0c07e7fbd2b 100644 --- a/.mailmap +++ b/.mailmap @@ -87,6 +87,7 @@ Baolin Wang Baolin Wang Bart Van Assche Bart Van Assche +Bartosz Golaszewski Ben Dooks Ben Dooks Ben Gardner From d2313c77592661e5ba259cdae3a120fb391054ff Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 11 Oct 2023 13:25:19 +0200 Subject: [PATCH 18/20] mailmap: correct email aliasing for Oleksij Rempel Ensure the current work email addresses for Oleksij Rempel are preserved and not overridden by private address. Alias the alternate work email to the primary work email address. Link: https://lkml.kernel.org/r/20231011112519.1427077-1-o.rempel@pengutronix.de Signed-off-by: Oleksij Rempel Cc: Jakub Kicinski Cc: Konrad Dybcio # qcom Cc: Mark Brown Cc: Qais Yousef Signed-off-by: Andrew Morton --- .mailmap | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index b0c07e7fbd2b..f56aafbc75be 100644 --- a/.mailmap +++ b/.mailmap @@ -450,8 +450,8 @@ Oleksandr Natalenko Oleksij Rempel Oleksij Rempel Oleksij Rempel -Oleksij Rempel -Oleksij Rempel +Oleksij Rempel +Oleksij Rempel Oliver Upton Ondřej Jirman Oza Pawandeep From e2de156b0d918b5ebe975577d25f9ef92379a756 Mon Sep 17 00:00:00 2001 From: Samasth Norway Ananda Date: Thu, 12 Oct 2023 08:52:57 -0700 Subject: [PATCH 19/20] selftests/mm: include mman header to access MREMAP_DONTUNMAP identifier Definition for MREMAP_DONTUNMAP is not present in glibc older than 2.32 thus throwing an undeclared error when running make on mm. Including linux/mman.h solves the build error for people having older glibc. Link: https://lkml.kernel.org/r/20231012155257.891776-1-samasth.norway.ananda@oracle.com Fixes: 0183d777c29a ("selftests: mm: remove duplicate unneeded defines") Signed-off-by: Samasth Norway Ananda Reported-by: Linux Kernel Functional Testing Closes: https://lore.kernel.org/linux-mm/CA+G9fYvV-71XqpCr_jhdDfEtN701fBdG3q+=bafaZiGwUXy_aA@mail.gmail.com/ Tested-by: Muhammad Usama Anjum Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mremap_dontunmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/mm/mremap_dontunmap.c b/tools/testing/selftests/mm/mremap_dontunmap.c index ca2359835e75..a06e73ec8568 100644 --- a/tools/testing/selftests/mm/mremap_dontunmap.c +++ b/tools/testing/selftests/mm/mremap_dontunmap.c @@ -7,6 +7,7 @@ */ #define _GNU_SOURCE #include +#include #include #include #include From 099d7439ce03d0e7bc8f0c3d7878b562f3a48d3d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 12 Oct 2023 11:52:33 -0400 Subject: [PATCH 20/20] maple_tree: add GFP_KERNEL to allocations in mas_expected_entries() Users complained about OOM errors during fork without triggering compaction. This can be fixed by modifying the flags used in mas_expected_entries() so that the compaction will be triggered in low memory situations. Since mas_expected_entries() is only used during fork, the extra argument does not need to be passed through. Additionally, the two test_maple_tree test cases and one benchmark test were altered to use the correct locking type so that allocations would not trigger sleeping and thus fail. Testing was completed with lockdep atomic sleep detection. The additional locking change requires rwsem support additions to the tools/ directory through the use of pthreads pthread_rwlock_t. With this change test_maple_tree works in userspace, as a module, and in-kernel. Users may notice that the system gave up early on attempting to start new processes instead of attempting to reclaim memory. Link: https://lkml.kernel.org/r/20230915093243epcms1p46fa00bbac1ab7b7dca94acb66c44c456@epcms1p4 Link: https://lkml.kernel.org/r/20231012155233.2272446-1-Liam.Howlett@oracle.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. Howlett Reviewed-by: Peng Zhang Cc: Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- lib/test_maple_tree.c | 35 ++++++++++++++++++++++---------- tools/include/linux/rwsem.h | 40 +++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 12 deletions(-) create mode 100644 tools/include/linux/rwsem.h diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 0e00a84e8e8f..bb24d84a4922 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5627,7 +5627,7 @@ int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries) /* Internal nodes */ nr_nodes += DIV_ROUND_UP(nr_nodes, nonleaf_cap); /* Add working room for split (2 nodes) + new parents */ - mas_node_count(mas, nr_nodes + 3); + mas_node_count_gfp(mas, nr_nodes + 3, GFP_KERNEL); /* Detect if allocations run out */ mas->mas_flags |= MA_STATE_PREALLOC; diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 06959165e2f9..464eeb90d5ad 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -9,6 +9,7 @@ #include #include +#include #define MTREE_ALLOC_MAX 0x2000000000000Ul #define CONFIG_MAPLE_SEARCH @@ -1841,17 +1842,21 @@ static noinline void __init check_forking(struct maple_tree *mt) void *val; MA_STATE(mas, mt, 0, 0); MA_STATE(newmas, mt, 0, 0); + struct rw_semaphore newmt_lock; + + init_rwsem(&newmt_lock); for (i = 0; i <= nr_entries; i++) mtree_store_range(mt, i*10, i*10 + 5, xa_mk_value(i), GFP_KERNEL); mt_set_non_kernel(99999); - mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE); + mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&newmt, &newmt_lock); newmas.tree = &newmt; mas_reset(&newmas); mas_reset(&mas); - mas_lock(&newmas); + down_write(&newmt_lock); mas.index = 0; mas.last = 0; if (mas_expected_entries(&newmas, nr_entries)) { @@ -1866,10 +1871,10 @@ static noinline void __init check_forking(struct maple_tree *mt) } rcu_read_unlock(); mas_destroy(&newmas); - mas_unlock(&newmas); mt_validate(&newmt); mt_set_non_kernel(0); - mtree_destroy(&newmt); + __mt_destroy(&newmt); + up_write(&newmt_lock); } static noinline void __init check_iteration(struct maple_tree *mt) @@ -1980,6 +1985,10 @@ static noinline void __init bench_forking(struct maple_tree *mt) void *val; MA_STATE(mas, mt, 0, 0); MA_STATE(newmas, mt, 0, 0); + struct rw_semaphore newmt_lock; + + init_rwsem(&newmt_lock); + mt_set_external_lock(&newmt, &newmt_lock); for (i = 0; i <= nr_entries; i++) mtree_store_range(mt, i*10, i*10 + 5, @@ -1994,7 +2003,7 @@ static noinline void __init bench_forking(struct maple_tree *mt) mas.index = 0; mas.last = 0; rcu_read_lock(); - mas_lock(&newmas); + down_write(&newmt_lock); if (mas_expected_entries(&newmas, nr_entries)) { printk("OOM!"); BUG_ON(1); @@ -2005,11 +2014,11 @@ static noinline void __init bench_forking(struct maple_tree *mt) mas_store(&newmas, val); } mas_destroy(&newmas); - mas_unlock(&newmas); rcu_read_unlock(); mt_validate(&newmt); mt_set_non_kernel(0); - mtree_destroy(&newmt); + __mt_destroy(&newmt); + up_write(&newmt_lock); } } #endif @@ -2616,6 +2625,10 @@ static noinline void __init check_dup_gaps(struct maple_tree *mt, void *tmp; MA_STATE(mas, mt, 0, 0); MA_STATE(newmas, &newmt, 0, 0); + struct rw_semaphore newmt_lock; + + init_rwsem(&newmt_lock); + mt_set_external_lock(&newmt, &newmt_lock); if (!zero_start) i = 1; @@ -2625,9 +2638,9 @@ static noinline void __init check_dup_gaps(struct maple_tree *mt, mtree_store_range(mt, i*10, (i+1)*10 - gap, xa_mk_value(i), GFP_KERNEL); - mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE); + mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); mt_set_non_kernel(99999); - mas_lock(&newmas); + down_write(&newmt_lock); ret = mas_expected_entries(&newmas, nr_entries); mt_set_non_kernel(0); MT_BUG_ON(mt, ret != 0); @@ -2640,9 +2653,9 @@ static noinline void __init check_dup_gaps(struct maple_tree *mt, } rcu_read_unlock(); mas_destroy(&newmas); - mas_unlock(&newmas); - mtree_destroy(&newmt); + __mt_destroy(&newmt); + up_write(&newmt_lock); } /* Duplicate many sizes of trees. Mainly to test expected entry values */ diff --git a/tools/include/linux/rwsem.h b/tools/include/linux/rwsem.h new file mode 100644 index 000000000000..83971b3cbfce --- /dev/null +++ b/tools/include/linux/rwsem.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _TOOLS__RWSEM_H +#define _TOOLS__RWSEM_H + +#include + +struct rw_semaphore { + pthread_rwlock_t lock; +}; + +static inline int init_rwsem(struct rw_semaphore *sem) +{ + return pthread_rwlock_init(&sem->lock, NULL); +} + +static inline int exit_rwsem(struct rw_semaphore *sem) +{ + return pthread_rwlock_destroy(&sem->lock); +} + +static inline int down_read(struct rw_semaphore *sem) +{ + return pthread_rwlock_rdlock(&sem->lock); +} + +static inline int up_read(struct rw_semaphore *sem) +{ + return pthread_rwlock_unlock(&sem->lock); +} + +static inline int down_write(struct rw_semaphore *sem) +{ + return pthread_rwlock_wrlock(&sem->lock); +} + +static inline int up_write(struct rw_semaphore *sem) +{ + return pthread_rwlock_unlock(&sem->lock); +} +#endif /* _TOOLS_RWSEM_H */