Merge tag 'locking-urgent-2021-11-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking fixes from Thomas Gleixner:
 "Two regression fixes for reader/writer semaphores:

   - Plug a race in the lock handoff which is caused by an inconsistency
     between the reader and the writer path and can lead to corruption of
     the underlying counter.

   - down_read_trylock() is suboptimal when the lock is contended and
     multiple readers trylock concurrently. That's due to the initial
     value being read non-atomically, which results in at least two
     compare-exchange loops. Making the initial readout atomic reduces
     this significantly: with 40 readers, by 11% in a benchmark which
     enforces contention on mmap_sem"

* tag 'locking-urgent-2021-11-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  locking/rwsem: Optimize down_read_trylock() under highly contended case
  locking/rwsem: Make handoff bit handling more consistent
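The down_read_trylock() change boils down to seeding the compare-exchange loop with an atomically read snapshot of the counter instead of assuming the unlocked value, so a contended trylock typically needs one compare-exchange instead of two. Below is a minimal user-space sketch of that pattern using plain C11 atomics; the constants and names are made up for illustration and this is not the kernel implementation (the real change is the __down_read_trylock() hunk at the end of the diff).

/*
 * Illustrative sketch only -- hypothetical constants, not the kernel's
 * rwsem code.  The point: start from an atomic snapshot of the counter
 * rather than from an assumed "unlocked" value.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define READER_BIAS		(1L << 8)	/* hypothetical reader increment */
#define READ_FAILED_MASK	0xffL		/* hypothetical writer/handoff bits */

static bool sketch_read_trylock(atomic_long *count)
{
	long tmp = atomic_load(count);		/* atomic initial readout */

	while (!(tmp & READ_FAILED_MASK)) {
		/* On failure, tmp is refreshed with the current value. */
		if (atomic_compare_exchange_weak(count, &tmp,
						 tmp + READER_BIAS))
			return true;		/* got the read side */
	}
	return false;				/* writer or handoff active */
}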
commit d039f38801
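The handoff fix is the bulk of the diff that follows: the HANDOFF bit is now set and cleared only in rwsem_mark_wake(), rwsem_try_write_lock() and the new rwsem_del_waiter(), all under wait_lock, and a first waiter that inherits the bit records that in waiter->handoff_set instead of tracking a separate wstate. As a rough sketch of the waiter-removal rule (hypothetical types, not the kernel helper): the WAITERS and HANDOFF flags are dropped in one place, only when the last waiter leaves, and only while the wait lock is held.

/*
 * Simplified restatement of the rwsem_del_waiter() rule, with made-up
 * types; not the kernel implementation.
 */
#include <stdatomic.h>
#include <pthread.h>

#define FLAG_WAITERS	(1L << 1)	/* hypothetical flag bits */
#define FLAG_HANDOFF	(1L << 2)

struct sketch_sem {
	atomic_long	count;		/* flag bits live in the counter */
	pthread_mutex_t	wait_lock;
	int		nr_waiters;	/* stands in for the wait_list */
};

/* Caller must hold sem->wait_lock (mirrors lockdep_assert_held()). */
static void sketch_del_waiter(struct sketch_sem *sem)
{
	if (--sem->nr_waiters > 0)
		return;

	/* Last waiter gone: clear both flags while still under the lock. */
	atomic_fetch_and(&sem->count, ~(FLAG_WAITERS | FLAG_HANDOFF));
}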
@@ -105,9 +105,9 @@
  * atomic_long_cmpxchg() will be used to obtain writer lock.
  *
  * There are three places where the lock handoff bit may be set or cleared.
- * 1) rwsem_mark_wake() for readers.
- * 2) rwsem_try_write_lock() for writers.
- * 3) Error path of rwsem_down_write_slowpath().
+ * 1) rwsem_mark_wake() for readers		-- set, clear
+ * 2) rwsem_try_write_lock() for writers	-- set, clear
+ * 3) rwsem_del_waiter()			-- clear
  *
  * For all the above cases, wait_lock will be held. A writer must also
  * be the first one in the wait_list to be eligible for setting the handoff
@@ -334,6 +334,9 @@ struct rwsem_waiter {
 	struct task_struct *task;
 	enum rwsem_waiter_type type;
 	unsigned long timeout;
+
+	/* Writer only, not initialized in reader */
+	bool handoff_set;
 };
 #define rwsem_first_waiter(sem) \
 	list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
@@ -344,12 +347,6 @@ enum rwsem_wake_type {
 	RWSEM_WAKE_READ_OWNED	/* Waker thread holds the read lock */
 };
 
-enum writer_wait_state {
-	WRITER_NOT_FIRST,	/* Writer is not first in wait list */
-	WRITER_FIRST,		/* Writer is first in wait list     */
-	WRITER_HANDOFF		/* Writer is first & handoff needed */
-};
-
 /*
  * The typical HZ value is either 250 or 1000. So set the minimum waiting
  * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
@@ -365,6 +362,31 @@ enum writer_wait_state {
  */
 #define MAX_READERS_WAKEUP	0x100
 
+static inline void
+rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+	lockdep_assert_held(&sem->wait_lock);
+	list_add_tail(&waiter->list, &sem->wait_list);
+	/* caller will set RWSEM_FLAG_WAITERS */
+}
+
+/*
+ * Remove a waiter from the wait_list and clear flags.
+ *
+ * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
+ * this function. Modify with care.
+ */
+static inline void
+rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+	lockdep_assert_held(&sem->wait_lock);
+	list_del(&waiter->list);
+	if (likely(!list_empty(&sem->wait_list)))
+		return;
+
+	atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
+}
+
 /*
  * handle the lock release when processes blocked on it that can now run
  * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
@@ -376,6 +398,8 @@ enum writer_wait_state {
  * preferably when the wait_lock is released
  * - woken process blocks are discarded from the list after having task zeroed
  * - writers are only marked woken if downgrading is false
+ *
+ * Implies rwsem_del_waiter() for all woken readers.
  */
 static void rwsem_mark_wake(struct rw_semaphore *sem,
 			    enum rwsem_wake_type wake_type,
@@ -490,17 +514,24 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
 
 	adjustment = woken * RWSEM_READER_BIAS - adjustment;
 	lockevent_cond_inc(rwsem_wake_reader, woken);
-	if (list_empty(&sem->wait_list)) {
-		/* hit end of list above */
-		adjustment -= RWSEM_FLAG_WAITERS;
-	}
-
-	/*
-	 * When we've woken a reader, we no longer need to force writers
-	 * to give up the lock and we can clear HANDOFF.
-	 */
-	if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
-		adjustment -= RWSEM_FLAG_HANDOFF;
+
+	oldcount = atomic_long_read(&sem->count);
+	if (list_empty(&sem->wait_list)) {
+		/*
+		 * Combined with list_move_tail() above, this implies
+		 * rwsem_del_waiter().
+		 */
+		adjustment -= RWSEM_FLAG_WAITERS;
+		if (oldcount & RWSEM_FLAG_HANDOFF)
+			adjustment -= RWSEM_FLAG_HANDOFF;
+	} else if (woken) {
+		/*
+		 * When we've woken a reader, we no longer need to force
+		 * writers to give up the lock and we can clear HANDOFF.
+		 */
+		if (oldcount & RWSEM_FLAG_HANDOFF)
+			adjustment -= RWSEM_FLAG_HANDOFF;
+	}
 
 	if (adjustment)
 		atomic_long_add(adjustment, &sem->count);
@@ -532,12 +563,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
  * race conditions between checking the rwsem wait list and setting the
  * sem->count accordingly.
  *
- * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
- * bit is set or the lock is acquired with handoff bit cleared.
+ * Implies rwsem_del_waiter() on success.
  */
 static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
-					enum writer_wait_state wstate)
+					struct rwsem_waiter *waiter)
 {
+	bool first = rwsem_first_waiter(sem) == waiter;
 	long count, new;
 
 	lockdep_assert_held(&sem->wait_lock);
@@ -546,13 +577,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 	do {
 		bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
 
-		if (has_handoff && wstate == WRITER_NOT_FIRST)
-			return false;
+		if (has_handoff) {
+			if (!first)
+				return false;
+
+			/* First waiter inherits a previously set handoff bit */
+			waiter->handoff_set = true;
+		}
 
 		new = count;
 
 		if (count & RWSEM_LOCK_MASK) {
-			if (has_handoff || (wstate != WRITER_HANDOFF))
+			if (has_handoff || (!rt_task(waiter->task) &&
+					    !time_after(jiffies, waiter->timeout)))
 				return false;
 
 			new |= RWSEM_FLAG_HANDOFF;
@@ -569,9 +606,17 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 	 * We have either acquired the lock with handoff bit cleared or
 	 * set the handoff bit.
 	 */
-	if (new & RWSEM_FLAG_HANDOFF)
+	if (new & RWSEM_FLAG_HANDOFF) {
+		waiter->handoff_set = true;
+		lockevent_inc(rwsem_wlock_handoff);
 		return false;
+	}
 
+	/*
+	 * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
+	 * success.
+	 */
+	list_del(&waiter->list);
 	rwsem_set_owner(sem);
 	return true;
 }
@@ -956,7 +1001,7 @@ queue:
 		}
 		adjustment += RWSEM_FLAG_WAITERS;
 	}
-	list_add_tail(&waiter.list, &sem->wait_list);
+	rwsem_add_waiter(sem, &waiter);
 
 	/* we're now waiting on the lock, but no longer actively locking */
 	count = atomic_long_add_return(adjustment, &sem->count);
@@ -1002,11 +1047,7 @@ queue:
 	return sem;
 
 out_nolock:
-	list_del(&waiter.list);
-	if (list_empty(&sem->wait_list)) {
-		atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
-				   &sem->count);
-	}
+	rwsem_del_waiter(sem, &waiter);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	__set_current_state(TASK_RUNNING);
 	lockevent_inc(rwsem_rlock_fail);
@@ -1020,9 +1061,7 @@ static struct rw_semaphore *
 rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 {
 	long count;
-	enum writer_wait_state wstate;
 	struct rwsem_waiter waiter;
-	struct rw_semaphore *ret = sem;
 	DEFINE_WAKE_Q(wake_q);
 
 	/* do optimistic spinning and steal lock if possible */
@@ -1038,16 +1077,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 	waiter.task = current;
 	waiter.type = RWSEM_WAITING_FOR_WRITE;
 	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+	waiter.handoff_set = false;
 
 	raw_spin_lock_irq(&sem->wait_lock);
-
-	/* account for this before adding a new element to the list */
-	wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
-
-	list_add_tail(&waiter.list, &sem->wait_list);
+	rwsem_add_waiter(sem, &waiter);
 
 	/* we're now waiting on the lock */
-	if (wstate == WRITER_NOT_FIRST) {
+	if (rwsem_first_waiter(sem) != &waiter) {
 		count = atomic_long_read(&sem->count);
 
 		/*
@@ -1083,13 +1119,16 @@ wait:
 	/* wait until we successfully acquire the lock */
 	set_current_state(state);
 	for (;;) {
-		if (rwsem_try_write_lock(sem, wstate)) {
+		if (rwsem_try_write_lock(sem, &waiter)) {
 			/* rwsem_try_write_lock() implies ACQUIRE on success */
 			break;
 		}
 
 		raw_spin_unlock_irq(&sem->wait_lock);
 
+		if (signal_pending_state(state, current))
+			goto out_nolock;
+
 		/*
 		 * After setting the handoff bit and failing to acquire
 		 * the lock, attempt to spin on owner to accelerate lock
@@ -1098,7 +1137,7 @@ wait:
 		 * In this case, we attempt to acquire the lock again
 		 * without sleeping.
 		 */
-		if (wstate == WRITER_HANDOFF) {
+		if (waiter.handoff_set) {
 			enum owner_state owner_state;
 
 			preempt_disable();
@@ -1109,66 +1148,26 @@ wait:
 			goto trylock_again;
 		}
 
-		/* Block until there are no active lockers. */
-		for (;;) {
-			if (signal_pending_state(state, current))
-				goto out_nolock;
-
-			schedule();
-			lockevent_inc(rwsem_sleep_writer);
-			set_current_state(state);
-			/*
-			 * If HANDOFF bit is set, unconditionally do
-			 * a trylock.
-			 */
-			if (wstate == WRITER_HANDOFF)
-				break;
-
-			if ((wstate == WRITER_NOT_FIRST) &&
-			    (rwsem_first_waiter(sem) == &waiter))
-				wstate = WRITER_FIRST;
-
-			count = atomic_long_read(&sem->count);
-			if (!(count & RWSEM_LOCK_MASK))
-				break;
-
-			/*
-			 * The setting of the handoff bit is deferred
-			 * until rwsem_try_write_lock() is called.
-			 */
-			if ((wstate == WRITER_FIRST) && (rt_task(current) ||
-			    time_after(jiffies, waiter.timeout))) {
-				wstate = WRITER_HANDOFF;
-				lockevent_inc(rwsem_wlock_handoff);
-				break;
-			}
-		}
+		schedule();
+		lockevent_inc(rwsem_sleep_writer);
+		set_current_state(state);
 trylock_again:
 		raw_spin_lock_irq(&sem->wait_lock);
 	}
 	__set_current_state(TASK_RUNNING);
-	list_del(&waiter.list);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	lockevent_inc(rwsem_wlock);
-
-	return ret;
+	return sem;
 
 out_nolock:
 	__set_current_state(TASK_RUNNING);
 	raw_spin_lock_irq(&sem->wait_lock);
-	list_del(&waiter.list);
-
-	if (unlikely(wstate == WRITER_HANDOFF))
-		atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);
-
-	if (list_empty(&sem->wait_list))
-		atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
-	else
+	rwsem_del_waiter(sem, &waiter);
+	if (!list_empty(&sem->wait_list))
 		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	wake_up_q(&wake_q);
 	lockevent_inc(rwsem_wlock_fail);
-
 	return ERR_PTR(-EINTR);
 }
 
@@ -1249,17 +1248,14 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
 
 	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
 
-	/*
-	 * Optimize for the case when the rwsem is not locked at all.
-	 */
-	tmp = RWSEM_UNLOCKED_VALUE;
-	do {
+	tmp = atomic_long_read(&sem->count);
+	while (!(tmp & RWSEM_READ_FAILED_MASK)) {
 		if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
-					tmp + RWSEM_READER_BIAS)) {
+						    tmp + RWSEM_READER_BIAS)) {
 			rwsem_set_reader_owned(sem);
 			return 1;
 		}
-	} while (!(tmp & RWSEM_READ_FAILED_MASK));
+	}
 	return 0;
 }
 