linux-stable/include/linux/mmap_lock.h
Jann Horn 90717566f8 mm: don't drop VMA locks in mm_drop_all_locks()
Despite its name, mm_drop_all_locks() does not drop _all_ locks; the mmap
lock is held write-locked by the caller, and the caller is responsible for
dropping the mmap lock at a later point (which will also release the VMA
locks).

Calling vma_end_write_all() here is dangerous because the caller might
have write-locked a VMA with the expectation that it will stay
write-locked until the mmap_lock is released, as usual.

This _almost_ becomes a problem in the following scenario:

An anonymous VMA A and an SGX VMA B are mapped adjacent to each other. 
Userspace calls munmap() on a range starting at the start address of A and
ending in the middle of B.

Hypothetical call graph with additional notes in brackets:

do_vmi_align_munmap
  [begin first for_each_vma_range loop]
  vma_start_write [on VMA A]
  vma_mark_detached [on VMA A]
  __split_vma [on VMA B]
    sgx_vma_open [== new->vm_ops->open]
      sgx_encl_mm_add
        __mmu_notifier_register [luckily THIS CAN'T ACTUALLY HAPPEN]
          mm_take_all_locks
          mm_drop_all_locks
            vma_end_write_all [drops VMA lock taken on VMA A before]
  vma_start_write [on VMA B]
  vma_mark_detached [on VMA B]
  [end first for_each_vma_range loop]
  vma_iter_clear_gfp [removes VMAs from maple tree]
  mmap_write_downgrade
  unmap_region
  mmap_read_unlock

In this hypothetical scenario, while do_vmi_align_munmap() thinks it still
holds a VMA write lock on VMA A, the VMA write lock has actually been
invalidated inside __split_vma().

The call from sgx_encl_mm_add() to __mmu_notifier_register() can't
actually happen here, as far as I understand, because we are duplicating
an existing SGX VMA, but sgx_encl_mm_add() only calls
__mmu_notifier_register() for the first SGX VMA created in a given
process.  So this could only happen in fork(), not on munmap().  But in my
view it is just pure luck that this can't happen.

Also, we wouldn't actually have any bad consequences from this in
do_vmi_align_munmap(), because by the time the bug drops the lock on VMA
A, we've already marked VMA A as detached, which makes it completely
ineligible for any VMA-locked page faults.  But again, that's just pure
luck.

So remove the vma_end_write_all(), so that VMA write locks are only ever
released on mmap_write_unlock() or mmap_write_downgrade().

Also add comments to document the locking rules established by this patch.

Link: https://lkml.kernel.org/r/20230720193436.454247-1-jannh@google.com
Fixes: eeff9a5d47 ("mm/mmap: prevent pagefault handler from racing with mmu_notifier registration")
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-08-18 10:12:46 -07:00

187 lines
5 KiB
C

#ifndef _LINUX_MMAP_LOCK_H
#define _LINUX_MMAP_LOCK_H
#include <linux/lockdep.h>
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/rwsem.h>
#include <linux/tracepoint-defs.h>
#include <linux/types.h>
#define MMAP_LOCK_INITIALIZER(name) \
.mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),
DECLARE_TRACEPOINT(mmap_lock_start_locking);
DECLARE_TRACEPOINT(mmap_lock_acquire_returned);
DECLARE_TRACEPOINT(mmap_lock_released);
#ifdef CONFIG_TRACING
void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write);
void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
bool success);
void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write);
static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
bool write)
{
if (tracepoint_enabled(mmap_lock_start_locking))
__mmap_lock_do_trace_start_locking(mm, write);
}
static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
bool write, bool success)
{
if (tracepoint_enabled(mmap_lock_acquire_returned))
__mmap_lock_do_trace_acquire_returned(mm, write, success);
}
static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
if (tracepoint_enabled(mmap_lock_released))
__mmap_lock_do_trace_released(mm, write);
}
#else /* !CONFIG_TRACING */
static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
bool write)
{
}
static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
bool write, bool success)
{
}
static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
}
#endif /* CONFIG_TRACING */
static inline void mmap_assert_locked(struct mm_struct *mm)
{
lockdep_assert_held(&mm->mmap_lock);
VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
}
static inline void mmap_assert_write_locked(struct mm_struct *mm)
{
lockdep_assert_held_write(&mm->mmap_lock);
VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
}
#ifdef CONFIG_PER_VMA_LOCK
/*
* Drop all currently-held per-VMA locks.
* This is called from the mmap_lock implementation directly before releasing
* a write-locked mmap_lock (or downgrading it to read-locked).
* This should normally NOT be called manually from other places.
* If you want to call this manually anyway, keep in mind that this will release
* *all* VMA write locks, including ones from further up the stack.
*/
static inline void vma_end_write_all(struct mm_struct *mm)
{
mmap_assert_write_locked(mm);
/*
* Nobody can concurrently modify mm->mm_lock_seq due to exclusive
* mmap_lock being held.
* We need RELEASE semantics here to ensure that preceding stores into
* the VMA take effect before we unlock it with this store.
* Pairs with ACQUIRE semantics in vma_start_read().
*/
smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1);
}
#else
static inline void vma_end_write_all(struct mm_struct *mm) {}
#endif
static inline void mmap_init_lock(struct mm_struct *mm)
{
init_rwsem(&mm->mmap_lock);
}
static inline void mmap_write_lock(struct mm_struct *mm)
{
__mmap_lock_trace_start_locking(mm, true);
down_write(&mm->mmap_lock);
__mmap_lock_trace_acquire_returned(mm, true, true);
}
static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
{
__mmap_lock_trace_start_locking(mm, true);
down_write_nested(&mm->mmap_lock, subclass);
__mmap_lock_trace_acquire_returned(mm, true, true);
}
static inline int mmap_write_lock_killable(struct mm_struct *mm)
{
int ret;
__mmap_lock_trace_start_locking(mm, true);
ret = down_write_killable(&mm->mmap_lock);
__mmap_lock_trace_acquire_returned(mm, true, ret == 0);
return ret;
}
static inline void mmap_write_unlock(struct mm_struct *mm)
{
__mmap_lock_trace_released(mm, true);
vma_end_write_all(mm);
up_write(&mm->mmap_lock);
}
static inline void mmap_write_downgrade(struct mm_struct *mm)
{
__mmap_lock_trace_acquire_returned(mm, false, true);
vma_end_write_all(mm);
downgrade_write(&mm->mmap_lock);
}
static inline void mmap_read_lock(struct mm_struct *mm)
{
__mmap_lock_trace_start_locking(mm, false);
down_read(&mm->mmap_lock);
__mmap_lock_trace_acquire_returned(mm, false, true);
}
static inline int mmap_read_lock_killable(struct mm_struct *mm)
{
int ret;
__mmap_lock_trace_start_locking(mm, false);
ret = down_read_killable(&mm->mmap_lock);
__mmap_lock_trace_acquire_returned(mm, false, ret == 0);
return ret;
}
static inline bool mmap_read_trylock(struct mm_struct *mm)
{
bool ret;
__mmap_lock_trace_start_locking(mm, false);
ret = down_read_trylock(&mm->mmap_lock) != 0;
__mmap_lock_trace_acquire_returned(mm, false, ret);
return ret;
}
static inline void mmap_read_unlock(struct mm_struct *mm)
{
__mmap_lock_trace_released(mm, false);
up_read(&mm->mmap_lock);
}
static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
{
__mmap_lock_trace_released(mm, false);
up_read_non_owner(&mm->mmap_lock);
}
static inline int mmap_lock_is_contended(struct mm_struct *mm)
{
return rwsem_is_contended(&mm->mmap_lock);
}
#endif /* _LINUX_MMAP_LOCK_H */