#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/osq_lock.h>

/*
 * An MCS-like lock especially tailored for optimistic spinning for sleeping
 * lock implementations (mutex, rwsem, etc).
 *
 * Using a single mcs node per CPU is safe because sleeping locks should not be
 * called from interrupt context and we have preemption disabled while
 * spinning.
 */
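
/*
 * Typical usage (a minimal sketch only; the real callers are the mutex/rwsem
 * optimistic-spin slowpaths, and "sem->osq" below is just an illustrative
 * name for an embedded struct optimistic_spin_queue):
 *
 *	preempt_disable();
 *	if (osq_lock(&sem->osq)) {
 *		... spin while the lock owner runs ...
 *		osq_unlock(&sem->osq);
 *	}
 *	preempt_enable();
 *
 * Preemption must remain disabled for the whole osq_lock()/osq_unlock()
 * critical section, per the comment above.
 */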
static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);

/*
 * We use the value 0 to represent "no CPU", thus the encoded value
 * will be the CPU number incremented by 1.
 */
static inline int encode_cpu(int cpu_nr)
{
	return cpu_nr + 1;
}

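/* Return the CPU number encoded in @node->cpu (see encode_cpu()). */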
static inline int node_cpu(struct optimistic_spin_node *node)
{
	return node->cpu - 1;
}

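/* Map an encoded CPU number back to that CPU's per-CPU queue node. */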
static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
{
	int cpu_nr = encoded_cpu_val - 1;

	return per_cpu_ptr(&osq_node, cpu_nr);
}

/*
 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
 * Can return NULL in case we were the last queued and we updated @lock instead.
 */
static inline struct optimistic_spin_node *
osq_wait_next(struct optimistic_spin_queue *lock,
	      struct optimistic_spin_node *node,
	      struct optimistic_spin_node *prev)
{
	struct optimistic_spin_node *next = NULL;
	int curr = encode_cpu(smp_processor_id());
	int old;

	/*
	 * If there is a prev node in queue, then the 'old' value will be
	 * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since, if
	 * we're currently last in the queue, the queue will then become empty.
	 */
	old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;

	for (;;) {
		if (atomic_read(&lock->tail) == curr &&
		    atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
			/*
			 * We were the last queued, we moved @lock back. @prev
			 * will now observe @lock and will complete its
			 * unlock()/unqueue().
			 */
			break;
		}

		/*
		 * We must xchg() the @node->next value, because if we were to
		 * leave it in, a concurrent unlock()/unqueue() from
		 * @node->next might complete Step-A and think its @prev is
		 * still valid.
		 *
		 * If the concurrent unlock()/unqueue() wins the race, we'll
		 * wait for either @lock to point to us, through its Step-B, or
		 * wait for a new @node->next from its Step-C.
		 */
		if (node->next) {
			next = xchg(&node->next, NULL);
			if (next)
				break;
		}

		cpu_relax();
	}

	return next;
}

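/*
 * osq_lock() -- queue up and wait until we own the OSQ.
 *
 * Returns true once we own the queue: either it was empty, or our
 * predecessor handed it over by setting @node->locked. Returns false if we
 * gave up waiting (need_resched() or a preempted lock holder) and unqueued
 * ourselves; the caller should then block instead of spinning.
 */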
bool osq_lock(struct optimistic_spin_queue *lock)
{
	struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
	struct optimistic_spin_node *prev, *next;
	int curr = encode_cpu(smp_processor_id());
	int old;

	node->locked = 0;
	node->next = NULL;
	node->cpu = curr;

	/*
	 * We need both ACQUIRE (pairs with corresponding RELEASE in
	 * unlock() uncontended, or fastpath) and RELEASE (to publish
	 * the node fields we just initialised) semantics when updating
	 * the lock tail.
	 */
	old = atomic_xchg(&lock->tail, curr);
	if (old == OSQ_UNLOCKED_VAL)
		return true;

	prev = decode_cpu(old);
	node->prev = prev;

	/*
	 * osq_lock()			unqueue
	 *
	 * node->prev = prev		osq_wait_next()
	 * WMB				MB
	 * prev->next = node		next->prev = prev // unqueue-C
	 *
	 * Here 'node->prev' and 'next->prev' are the same variable and we need
	 * to ensure these stores happen in-order to avoid corrupting the list.
	 */
	smp_wmb();

	WRITE_ONCE(prev->next, node);

	/*
	 * Normally @prev would be untouchable after the above store, because
	 * at that moment unlock() can proceed and wipe the node element off
	 * the stack.
	 *
	 * However, since our nodes are static per-cpu storage, we're
	 * guaranteed their existence -- this allows us to apply
	 * cmpxchg() in an attempt to undo our queueing.
	 */

	while (!READ_ONCE(node->locked)) {
		/*
		 * If we need to reschedule, bail... so we can block.
		 * Use vcpu_is_preempted() to avoid waiting for a preempted
		 * lock holder:
		 */
		if (need_resched() || vcpu_is_preempted(node_cpu(node->prev)))
			goto unqueue;

		cpu_relax();
	}
	return true;

unqueue:
	/*
	 * Step-A -- stabilize @prev
	 *
	 * Undo our @prev->next assignment; this will make @prev's
	 * unlock()/unqueue() wait for a next pointer since @lock points to us
	 * (or later).
	 */
	for (;;) {
		if (prev->next == node &&
		    cmpxchg(&prev->next, node, NULL) == node)
			break;

		/*
		 * We can only fail the cmpxchg() racing against an unlock(),
		 * in which case we should observe @node->locked becoming
		 * true.
		 */
		if (smp_load_acquire(&node->locked))
			return true;

		cpu_relax();

		/*
		 * Or we race against a concurrent unqueue()'s Step-B, in which
		 * case its Step-C will write us a new @node->prev pointer.
		 */
		prev = READ_ONCE(node->prev);
	}

	/*
	 * Step-B -- stabilize @next
	 *
	 * Similar to unlock(), wait for @node->next or move @lock from @node
	 * back to @prev.
	 */
	next = osq_wait_next(lock, node, prev);
	if (!next)
		return false;

	/*
	 * Step-C -- unlink
	 *
	 * @prev is stable because it's still waiting for a new @prev->next
	 * pointer, @next is stable because our @node->next pointer is NULL and
	 * it will wait in Step-A.
	 */
	WRITE_ONCE(next->prev, prev);
	WRITE_ONCE(prev->next, next);

	return false;
}

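/*
 * osq_unlock() -- release the OSQ: either clear the tail in the uncontended
 * case, or pass ownership to the next queued spinner by setting its
 * @node->locked.
 */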
void osq_unlock(struct optimistic_spin_queue *lock)
{
	struct optimistic_spin_node *node, *next;
	int curr = encode_cpu(smp_processor_id());

	/*
	 * Fast path for the uncontended case.
	 */
	if (likely(atomic_cmpxchg_release(&lock->tail, curr,
					  OSQ_UNLOCKED_VAL) == curr))
		return;

	/*
	 * Second most likely case.
	 */
	node = this_cpu_ptr(&osq_node);
	next = xchg(&node->next, NULL);
	if (next) {
		WRITE_ONCE(next->locked, 1);
		return;
	}

	next = osq_wait_next(lock, node, NULL);
	if (next)
		WRITE_ONCE(next->locked, 1);
}