#ifndef _GEN_PV_LOCK_SLOWPATH
#error "do not include this file"
#endif

#include <linux/hash.h>
#include <linux/bootmem.h>

/*
 * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
 * of spinning them.
 *
 * This relies on the architecture to provide two paravirt hypercalls:
 *
 *   pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
 *   pv_kick(cpu)             -- wakes a suspended vcpu
 *
 * Using these we implement __pv_queued_spin_lock_slowpath() and
 * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
 * native_queued_spin_unlock().
 */

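/*
 * For illustration only: a hypervisor backend is expected to wire these two
 * hooks up roughly as sketched below. The helpers hv_halt_self() and
 * hv_wake_vcpu() are hypothetical placeholders, not functions provided by
 * this file or by any particular hypervisor.
 *
 *	static void example_pv_wait(u8 *ptr, u8 val)
 *	{
 *		if (READ_ONCE(*ptr) != val)
 *			return;
 *		hv_halt_self();
 *	}
 *
 *	static void example_pv_kick(int cpu)
 *	{
 *		hv_wake_vcpu(cpu);
 *	}
 *
 * Note that pv_wait() may return spuriously, so every caller below re-checks
 * its wait condition after waking up.
 */
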
#define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)

enum vcpu_state {
	vcpu_running = 0,
	vcpu_halted,
};

struct pv_node {
	struct mcs_spinlock	mcs;
	struct mcs_spinlock	__res[3];

	int			cpu;
	u8			state;
};

/*
 * Lock and MCS node addresses hash table for fast lookup
 *
 * Hashing is done on a per-cacheline basis to minimize the need to access
 * more than one cacheline.
 *
 * Dynamically allocate a hash table big enough to hold at least 4X the
 * number of possible cpus in the system. Allocation is done on page
 * granularity. So the minimum number of hash buckets should be at least
 * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
 *
 * Since we should not be holding locks from NMI context (very rare indeed) the
 * max load factor is 0.75, which is around the point where open addressing
 * breaks down.
 */
struct pv_hash_entry {
	struct qspinlock *lock;
	struct pv_node   *node;
};

#define PV_HE_PER_LINE	(SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
#define PV_HE_MIN	(PAGE_SIZE / sizeof(struct pv_hash_entry))

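/*
 * Worked example of the sizing above, assuming 64-byte cache lines and 4k
 * pages (both are configuration dependent): on 64-bit, a pv_hash_entry is
 * two pointers = 16 bytes, so PV_HE_PER_LINE = 64 / 16 = 4 and
 * PV_HE_MIN = 4096 / 16 = 256; on 32-bit the entry is 8 bytes, giving
 * PV_HE_MIN = 512. With, say, 16 possible cpus, 4 * 16 = 64 entries would be
 * below PV_HE_MIN, so __pv_init_lock_hash() rounds the request up to a full
 * page's worth of buckets.
 */
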
static struct pv_hash_entry *pv_lock_hash;
static unsigned int pv_lock_hash_bits __read_mostly;

/*
 * Allocate memory for the PV qspinlock hash buckets
 *
 * This function should be called from the paravirt spinlock initialization
 * routine.
 */
void __init __pv_init_lock_hash(void)
{
	int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);

	if (pv_hash_size < PV_HE_MIN)
		pv_hash_size = PV_HE_MIN;

	/*
	 * Allocate space from bootmem which should be page-size aligned
	 * and hence cacheline aligned.
	 */
	pv_lock_hash = alloc_large_system_hash("PV qspinlock",
					       sizeof(struct pv_hash_entry),
					       pv_hash_size, 0, HASH_EARLY,
					       &pv_lock_hash_bits, NULL,
					       pv_hash_size, pv_hash_size);
}

#define for_each_hash_entry(he, offset, hash)						\
	for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0;	\
	     offset < (1 << pv_lock_hash_bits);						\
	     offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])

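/*
 * Probe order example, assuming PV_HE_PER_LINE == 4 (see the sizing comment
 * above): a raw hash value of 13 is first rounded down to the cacheline
 * boundary 12, then the open-addressing walk visits buckets 12, 13, 14, 15 --
 * all in one cacheline -- before wrapping linearly through the rest of the
 * table. So in the common, lightly loaded case a lookup touches a single
 * cacheline.
 */
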
static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
{
	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
	struct pv_hash_entry *he;

	for_each_hash_entry(he, offset, hash) {
		if (!cmpxchg(&he->lock, NULL, lock)) {
			WRITE_ONCE(he->node, node);
			return &he->lock;
		}
	}
	/*
	 * Hard assume there is a free entry for us.
	 *
	 * This is guaranteed by ensuring every blocked lock only ever consumes
	 * a single entry, and since we only have 4 nesting levels per CPU
	 * and allocated 4 * nr_possible_cpus(), this must be so.
	 *
	 * The single entry is guaranteed by having the lock owner unhash
	 * before it releases.
	 */
	BUG();
}

static struct pv_node *pv_unhash(struct qspinlock *lock)
{
	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
	struct pv_hash_entry *he;
	struct pv_node *node;

	for_each_hash_entry(he, offset, hash) {
		if (READ_ONCE(he->lock) == lock) {
			node = READ_ONCE(he->node);
			WRITE_ONCE(he->lock, NULL);
			return node;
		}
	}
	/*
	 * Hard assume we'll find an entry.
	 *
	 * This guarantees a limited lookup time and is itself guaranteed by
	 * having the lock owner do the unhash -- IFF the unlock sees the
	 * SLOW flag, there MUST be a hash entry.
	 */
	BUG();
}

/*
 * Initialize the PV part of the mcs_spinlock node.
 */
static void pv_init_node(struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;

	BUILD_BUG_ON(sizeof(struct pv_node) > 5 * sizeof(struct mcs_spinlock));

	pn->cpu = smp_processor_id();
	pn->state = vcpu_running;
}

/*
 * Wait for node->locked to become true, halt the vcpu after a short spin.
 * pv_kick_node() is used to wake the vcpu again.
 */
static void pv_wait_node(struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;
	int loop;

	for (;;) {
		for (loop = SPIN_THRESHOLD; loop; loop--) {
			if (READ_ONCE(node->locked))
				return;
			cpu_relax();
		}

		/*
		 * Order pn->state vs pn->locked thusly:
		 *
		 * [S] pn->state = vcpu_halted    [S] next->locked = 1
		 *     MB                             MB
		 * [L] pn->locked                 [RmW] pn->state = vcpu_running
		 *
		 * Matches the xchg() from pv_kick_node().
		 */
		set_mb(pn->state, vcpu_halted);
		if (!READ_ONCE(node->locked))
			pv_wait(&pn->state, vcpu_halted);

		/*
		 * Reset the vCPU state to avoid unnecessary CPU kicking
		 */
		WRITE_ONCE(pn->state, vcpu_running);

		/*
		 * If the locked flag is still not set after wakeup, it is a
		 * spurious wakeup and the vCPU should wait again. However,
		 * there is a pretty high overhead for CPU halting and kicking.
		 * So it is better to spin for a while in the hope that the
		 * MCS lock will be released soon.
		 */
	}

	/*
	 * By now our node->locked should be 1 and our caller will not actually
	 * spin-wait for it. We do however rely on our caller to do a
	 * load-acquire for us.
	 */
}

/*
 * Called after setting next->locked = 1, used to wake those stuck in
 * pv_wait_node().
 */
static void pv_kick_node(struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;

	/*
	 * Note that because node->locked is already set, this actual
	 * mcs_spinlock entry could be re-used already.
	 *
	 * This should be fine however, kicking people for no reason is
	 * harmless.
	 *
	 * See the comment in pv_wait_node().
	 */
	if (xchg(&pn->state, vcpu_running) == vcpu_halted)
		pv_kick(pn->cpu);
}

/*
 * Wait for l->locked to become clear; halt the vcpu after a short spin.
 * __pv_queued_spin_unlock() will wake us.
 */
static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
{
	struct pv_node *pn = (struct pv_node *)node;
	struct __qspinlock *l = (void *)lock;
	struct qspinlock **lp = NULL;
	int loop;

	for (;;) {
		for (loop = SPIN_THRESHOLD; loop; loop--) {
			if (!READ_ONCE(l->locked))
				return;
			cpu_relax();
		}

		WRITE_ONCE(pn->state, vcpu_halted);
		if (!lp) { /* ONCE */
			lp = pv_hash(lock, pn);
			/*
			 * lp must be set before setting _Q_SLOW_VAL
			 *
			 * [S] lp = lock                [RmW] l = l->locked = 0
			 *     MB                             MB
			 * [S] l->locked = _Q_SLOW_VAL  [L]   lp
			 *
			 * Matches the cmpxchg() in __pv_queued_spin_unlock().
			 */
			if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
				/*
				 * The lock is free and _Q_SLOW_VAL has never
				 * been set. Therefore we need to unhash before
				 * getting the lock.
				 */
				WRITE_ONCE(*lp, NULL);
				return;
			}
		}
		pv_wait(&l->locked, _Q_SLOW_VAL);

		/*
		 * The unlocker should have freed the lock before kicking the
		 * CPU. So if the lock is still not free, it is a spurious
		 * wakeup and so the vCPU should wait again after spinning for
		 * a while.
		 */
	}

	/*
	 * Lock is unlocked now; the caller will acquire it without waiting.
	 * As with pv_wait_node() we rely on the caller to do a load-acquire
	 * for us.
	 */
}

/*
 * PV version of the unlock function to be used instead of
 * queued_spin_unlock().
 */
__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
{
	struct __qspinlock *l = (void *)lock;
	struct pv_node *node;

	/*
	 * We must not unlock if SLOW, because in that case we must first
	 * unhash. Otherwise it would be possible to have multiple @lock
	 * entries, which would be BAD.
	 */
	if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL))
		return;

	/*
	 * Since the above failed to release, this must be the SLOW path.
	 * Therefore start by looking up the blocked node and unhashing it.
	 */
	node = pv_unhash(lock);

	/*
	 * Now that we have a reference to the (likely) blocked pv_node,
	 * release the lock.
	 */
	smp_store_release(&l->locked, 0);

	/*
	 * At this point the memory pointed at by lock can be freed/reused,
	 * however we can still use the pv_node to kick the CPU.
	 */
	if (READ_ONCE(node->state) == vcpu_halted)
		pv_kick(node->cpu);
}

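/*
 * Illustrative summary of the slow-path handshake; this is just the sequence
 * already implemented by pv_wait_head() and __pv_queued_spin_unlock() above:
 *
 *  1. The queue head hashes (lock, pv_node), flips l->locked from
 *     _Q_LOCKED_VAL to _Q_SLOW_VAL and calls pv_wait().
 *  2. The unlocker's cmpxchg(_Q_LOCKED_VAL -> 0) fails because it sees
 *     _Q_SLOW_VAL, so it takes the slow path: pv_unhash(), then
 *     smp_store_release(&l->locked, 0), then pv_kick() of the waiter.
 *  3. The kicked vcpu returns from pv_wait(), observes l->locked == 0 and
 *     lets its caller acquire the lock.
 */
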
/*
 * Include the architecture specific callee-save thunk of the
 * __pv_queued_spin_unlock(). This thunk is put together with
 * __pv_queued_spin_unlock() near the top of the file to make sure
 * that the callee-save thunk and the real unlock function are close
 * to each other sharing consecutive instruction cachelines.
 */
#include <asm/qspinlock_paravirt.h>