Two regression fixes for reader writer semaphores:

- Plug a race in the lock handoff which is caused by inconsistency of the
    reader and writer path and can lead to corruption of the underlying
    counter.
 
  - down_read_trylock() is suboptimal when the lock is contended and
    multiple readers trylock concurrently. That's due to the initial value
    being read non-atomically which results in at least two compare exchange
    loops. Making the initial readout atomic reduces this significantly.
    With 40 readers, by 11% in a benchmark which enforces contention on
    mmap_sem.
 -----BEGIN PGP SIGNATURE-----
 
 iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAmGjrRITHHRnbHhAbGlu
 dXRyb25peC5kZQAKCRCmGPVMDXSYodsdEACRDUU5tkNVIgNTsGrO4IUhNW9fxyfG
 3dCAzcQx9w1UjjBn23/B0c6rPsVqEv6hKouBGXqdOHj0kLx6Xn0IPMTvqycPL+mp
 OyDzx+t773BlvTZyaYFa6vBiWbEVGzedDp6uLsYaBNo//4yN1WZY3mevTwzKVceX
 WOoobHjsoh5Wfwr1XmNw+7HVhPaY0E50DaIuRQrJjNj1zsUhzJsjr/M1NpiqCaSm
 PleDum3Dg0PD/pxdWtm34teuGQur0QknqPc2I6sZGnX0UMsCozeZAuH/MGnwwXec
 fsweMXBVyDngOIZbFX/tPbVTocOpfxkYgJKXwIrlmVwHzFeT6KFfpEPXxVhUj6ao
 3KNqD+V5VL2zdMF11WB2lVQaX2/48WIXz23ppiUA5R7tJTPr+yAIYIUzT2GFkMTr
 u//41pxnoXlm9RCjANrbzGSl049exf01mMFVzm6zGt6PZqTE/kaBuklRy6Vibk/C
 cSB7Iy/iVaySunmF6X5RuBT7HsKrIN6SgYRCHZ7BI9aelQpHztJuy4LZAbgRPZZU
 /VKB2BKLx1KeRNfn6ScvF1uSSLmXoFVs0PP7HwMrPs3AdI+KaHmYLqZf+Bf4W1q2
 5bAfj2x5qWwvMrV4RnwLltWAASw1G/o5fs8WhPA6cZkG9iZCB5EBCnHv4B0pm+oq
 xw8RPYImZFzK8w==
 =dKz+
 -----END PGP SIGNATURE-----

Merge tag 'locking-urgent-2021-11-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking fixes from Thomas Gleixner:
 "Two regression fixes for reader writer semaphores:

   - Plug a race in the lock handoff which is caused by inconsistency of
     the reader and writer path and can lead to corruption of the
     underlying counter.

   - down_read_trylock() is suboptimal when the lock is contended and
     multiple readers trylock concurrently. That's due to the initial
     value being read non-atomically which results in at least two
     compare exchange loops. Making the initial readout atomic reduces
     this significantly. With 40 readers, by 11% in a benchmark which
     enforces contention on mmap_sem"

* tag 'locking-urgent-2021-11-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  locking/rwsem: Optimize down_read_trylock() under highly contended case
  locking/rwsem: Make handoff bit handling more consistent
This commit is contained in:
Linus Torvalds 2021-11-28 09:04:41 -08:00
commit d039f38801
1 changed files with 91 additions and 95 deletions

View File

@ -105,9 +105,9 @@
* atomic_long_cmpxchg() will be used to obtain writer lock.
*
* There are three places where the lock handoff bit may be set or cleared.
* 1) rwsem_mark_wake() for readers.
* 2) rwsem_try_write_lock() for writers.
* 3) Error path of rwsem_down_write_slowpath().
* 1) rwsem_mark_wake() for readers -- set, clear
* 2) rwsem_try_write_lock() for writers -- set, clear
* 3) rwsem_del_waiter() -- clear
*
* For all the above cases, wait_lock will be held. A writer must also
* be the first one in the wait_list to be eligible for setting the handoff
@ -334,6 +334,9 @@ struct rwsem_waiter {
struct task_struct *task;
enum rwsem_waiter_type type;
unsigned long timeout;
/* Writer only, not initialized in reader */
bool handoff_set;
};
#define rwsem_first_waiter(sem) \
list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
@ -344,12 +347,6 @@ enum rwsem_wake_type {
RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
};
/*
 * Position of a waiting writer relative to the head of the wait list, and
 * whether a lock handoff is needed for it.
 */
enum writer_wait_state {
WRITER_NOT_FIRST, /* Writer is not first in wait list */
WRITER_FIRST, /* Writer is first in wait list */
WRITER_HANDOFF /* Writer is first & handoff needed */
};
/*
* The typical HZ value is either 250 or 1000. So set the minimum waiting
* time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
@ -365,6 +362,31 @@ enum writer_wait_state {
*/
#define MAX_READERS_WAKEUP 0x100
/*
 * Append a waiter to the tail of sem->wait_list.
 *
 * sem->wait_lock must be held by the caller (asserted via lockdep).
 * sem->count is not modified here; setting RWSEM_FLAG_WAITERS is left to
 * the caller.
 */
static inline void
rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
{
lockdep_assert_held(&sem->wait_lock);
list_add_tail(&waiter->list, &sem->wait_list);
/* caller will set RWSEM_FLAG_WAITERS */
}
/*
 * Remove a waiter from the wait_list and clear flags.
 *
 * If the wait_list is empty after the removal, RWSEM_FLAG_HANDOFF and
 * RWSEM_FLAG_WAITERS are both cleared from sem->count; otherwise
 * sem->count is left untouched. sem->wait_lock must be held by the caller
 * (asserted via lockdep).
 *
 * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
 * this function. Modify with care.
 */
static inline void
rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
{
lockdep_assert_held(&sem->wait_lock);
list_del(&waiter->list);
/* Other waiters are still queued: leave sem->count alone. */
if (likely(!list_empty(&sem->wait_list)))
return;
/* That was the last waiter: drop both flags in a single atomic operation. */
atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
}
/*
* handle the lock release when processes blocked on it that can now run
* - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
@ -376,6 +398,8 @@ enum writer_wait_state {
* preferably when the wait_lock is released
* - woken process blocks are discarded from the list after having task zeroed
* - writers are only marked woken if downgrading is false
*
* Implies rwsem_del_waiter() for all woken readers.
*/
static void rwsem_mark_wake(struct rw_semaphore *sem,
enum rwsem_wake_type wake_type,
@ -490,17 +514,24 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
adjustment = woken * RWSEM_READER_BIAS - adjustment;
lockevent_cond_inc(rwsem_wake_reader, woken);
if (list_empty(&sem->wait_list)) {
/* hit end of list above */
adjustment -= RWSEM_FLAG_WAITERS;
}
/*
* When we've woken a reader, we no longer need to force writers
* to give up the lock and we can clear HANDOFF.
*/
if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
adjustment -= RWSEM_FLAG_HANDOFF;
oldcount = atomic_long_read(&sem->count);
if (list_empty(&sem->wait_list)) {
/*
* Combined with list_move_tail() above, this implies
* rwsem_del_waiter().
*/
adjustment -= RWSEM_FLAG_WAITERS;
if (oldcount & RWSEM_FLAG_HANDOFF)
adjustment -= RWSEM_FLAG_HANDOFF;
} else if (woken) {
/*
* When we've woken a reader, we no longer need to force
* writers to give up the lock and we can clear HANDOFF.
*/
if (oldcount & RWSEM_FLAG_HANDOFF)
adjustment -= RWSEM_FLAG_HANDOFF;
}
if (adjustment)
atomic_long_add(adjustment, &sem->count);
@ -532,12 +563,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
* race conditions between checking the rwsem wait list and setting the
* sem->count accordingly.
*
* If wstate is WRITER_HANDOFF, it will make sure that either the handoff
* bit is set or the lock is acquired with handoff bit cleared.
* Implies rwsem_del_waiter() on success.
*/
static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
enum writer_wait_state wstate)
struct rwsem_waiter *waiter)
{
bool first = rwsem_first_waiter(sem) == waiter;
long count, new;
lockdep_assert_held(&sem->wait_lock);
@ -546,13 +577,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
do {
bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
if (has_handoff && wstate == WRITER_NOT_FIRST)
return false;
if (has_handoff) {
if (!first)
return false;
/* First waiter inherits a previously set handoff bit */
waiter->handoff_set = true;
}
new = count;
if (count & RWSEM_LOCK_MASK) {
if (has_handoff || (wstate != WRITER_HANDOFF))
if (has_handoff || (!rt_task(waiter->task) &&
!time_after(jiffies, waiter->timeout)))
return false;
new |= RWSEM_FLAG_HANDOFF;
@ -569,9 +606,17 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
* We have either acquired the lock with handoff bit cleared or
* set the handoff bit.
*/
if (new & RWSEM_FLAG_HANDOFF)
if (new & RWSEM_FLAG_HANDOFF) {
waiter->handoff_set = true;
lockevent_inc(rwsem_wlock_handoff);
return false;
}
/*
* Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
* success.
*/
list_del(&waiter->list);
rwsem_set_owner(sem);
return true;
}
@ -956,7 +1001,7 @@ queue:
}
adjustment += RWSEM_FLAG_WAITERS;
}
list_add_tail(&waiter.list, &sem->wait_list);
rwsem_add_waiter(sem, &waiter);
/* we're now waiting on the lock, but no longer actively locking */
count = atomic_long_add_return(adjustment, &sem->count);
@ -1002,11 +1047,7 @@ queue:
return sem;
out_nolock:
list_del(&waiter.list);
if (list_empty(&sem->wait_list)) {
atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
&sem->count);
}
rwsem_del_waiter(sem, &waiter);
raw_spin_unlock_irq(&sem->wait_lock);
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock_fail);
@ -1020,9 +1061,7 @@ static struct rw_semaphore *
rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
{
long count;
enum writer_wait_state wstate;
struct rwsem_waiter waiter;
struct rw_semaphore *ret = sem;
DEFINE_WAKE_Q(wake_q);
/* do optimistic spinning and steal lock if possible */
@ -1038,16 +1077,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_WRITE;
waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
waiter.handoff_set = false;
raw_spin_lock_irq(&sem->wait_lock);
/* account for this before adding a new element to the list */
wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
list_add_tail(&waiter.list, &sem->wait_list);
rwsem_add_waiter(sem, &waiter);
/* we're now waiting on the lock */
if (wstate == WRITER_NOT_FIRST) {
if (rwsem_first_waiter(sem) != &waiter) {
count = atomic_long_read(&sem->count);
/*
@ -1083,13 +1119,16 @@ wait:
/* wait until we successfully acquire the lock */
set_current_state(state);
for (;;) {
if (rwsem_try_write_lock(sem, wstate)) {
if (rwsem_try_write_lock(sem, &waiter)) {
/* rwsem_try_write_lock() implies ACQUIRE on success */
break;
}
raw_spin_unlock_irq(&sem->wait_lock);
if (signal_pending_state(state, current))
goto out_nolock;
/*
* After setting the handoff bit and failing to acquire
* the lock, attempt to spin on owner to accelerate lock
@ -1098,7 +1137,7 @@ wait:
* In this case, we attempt to acquire the lock again
* without sleeping.
*/
if (wstate == WRITER_HANDOFF) {
if (waiter.handoff_set) {
enum owner_state owner_state;
preempt_disable();
@ -1109,66 +1148,26 @@ wait:
goto trylock_again;
}
/* Block until there are no active lockers. */
for (;;) {
if (signal_pending_state(state, current))
goto out_nolock;
schedule();
lockevent_inc(rwsem_sleep_writer);
set_current_state(state);
/*
* If HANDOFF bit is set, unconditionally do
* a trylock.
*/
if (wstate == WRITER_HANDOFF)
break;
if ((wstate == WRITER_NOT_FIRST) &&
(rwsem_first_waiter(sem) == &waiter))
wstate = WRITER_FIRST;
count = atomic_long_read(&sem->count);
if (!(count & RWSEM_LOCK_MASK))
break;
/*
* The setting of the handoff bit is deferred
* until rwsem_try_write_lock() is called.
*/
if ((wstate == WRITER_FIRST) && (rt_task(current) ||
time_after(jiffies, waiter.timeout))) {
wstate = WRITER_HANDOFF;
lockevent_inc(rwsem_wlock_handoff);
break;
}
}
schedule();
lockevent_inc(rwsem_sleep_writer);
set_current_state(state);
trylock_again:
raw_spin_lock_irq(&sem->wait_lock);
}
__set_current_state(TASK_RUNNING);
list_del(&waiter.list);
raw_spin_unlock_irq(&sem->wait_lock);
lockevent_inc(rwsem_wlock);
return ret;
return sem;
out_nolock:
__set_current_state(TASK_RUNNING);
raw_spin_lock_irq(&sem->wait_lock);
list_del(&waiter.list);
if (unlikely(wstate == WRITER_HANDOFF))
atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);
if (list_empty(&sem->wait_list))
atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
else
rwsem_del_waiter(sem, &waiter);
if (!list_empty(&sem->wait_list))
rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
wake_up_q(&wake_q);
lockevent_inc(rwsem_wlock_fail);
return ERR_PTR(-EINTR);
}
@ -1249,17 +1248,14 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
/*
* Optimize for the case when the rwsem is not locked at all.
*/
tmp = RWSEM_UNLOCKED_VALUE;
do {
tmp = atomic_long_read(&sem->count);
while (!(tmp & RWSEM_READ_FAILED_MASK)) {
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
tmp + RWSEM_READER_BIAS)) {
tmp + RWSEM_READER_BIAS)) {
rwsem_set_reader_owned(sem);
return 1;
}
} while (!(tmp & RWSEM_READ_FAILED_MASK));
}
return 0;
}