b629b54170
Rename yield_propagate_owner to yield_sleepy_owner, which better describes what it does (what, not how). Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Tested-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com> Reviewed-by: "Nysal Jan K.A" <nysal@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://msgid.link/20231016124305.139923-7-npiggin@gmail.com
990 lines
23 KiB
C
990 lines
23 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
#include <linux/bug.h>
|
|
#include <linux/compiler.h>
|
|
#include <linux/export.h>
|
|
#include <linux/percpu.h>
|
|
#include <linux/processor.h>
|
|
#include <linux/smp.h>
|
|
#include <linux/topology.h>
|
|
#include <linux/sched/clock.h>
|
|
#include <asm/qspinlock.h>
|
|
#include <asm/paravirt.h>
|
|
|
|
/*
 * Number of qnode slots each CPU owns. The slow path takes one slot per
 * (possibly nested) entry and falls back to plain trylock spinning once
 * all of them are in use.
 */
#define MAX_NODES	4
|
|
|
|
struct qnode {
|
|
struct qnode *next;
|
|
struct qspinlock *lock;
|
|
int cpu;
|
|
u8 sleepy; /* 1 if the previous vCPU was preempted or
|
|
* if the previous node was sleepy */
|
|
u8 locked; /* 1 if lock acquired */
|
|
};
|
|
|
|
struct qnodes {
|
|
int count;
|
|
struct qnode nodes[MAX_NODES];
|
|
};
|
|
|
|
/* Tuning parameters */
|
|
static int steal_spins __read_mostly = (1 << 5);
|
|
static int remote_steal_spins __read_mostly = (1 << 2);
|
|
#if _Q_SPIN_TRY_LOCK_STEAL == 1
|
|
static const bool maybe_stealers = true;
|
|
#else
|
|
static bool maybe_stealers __read_mostly = true;
|
|
#endif
|
|
static int head_spins __read_mostly = (1 << 8);
|
|
|
|
static bool pv_yield_owner __read_mostly = true;
|
|
static bool pv_yield_allow_steal __read_mostly = false;
|
|
static bool pv_spin_on_preempted_owner __read_mostly = false;
|
|
static bool pv_sleepy_lock __read_mostly = true;
|
|
static bool pv_sleepy_lock_sticky __read_mostly = false;
|
|
static u64 pv_sleepy_lock_interval_ns __read_mostly = 0;
|
|
static int pv_sleepy_lock_factor __read_mostly = 256;
|
|
static bool pv_yield_prev __read_mostly = true;
|
|
static bool pv_yield_sleepy_owner __read_mostly = true;
|
|
static bool pv_prod_head __read_mostly = false;
|
|
|
|
/* Queue nodes owned by each CPU. */
static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
/*
 * sched_clock() value recorded the last time this CPU observed a sleepy
 * lock, owner or queue node; 0 means nothing recorded.
 */
static DEFINE_PER_CPU_ALIGNED(u64, sleepy_lock_seen_clock);
|
|
|
|
/*
 * spec_barrier(): emits "ori 31,31,0" with a memory clobber when
 * _Q_SPIN_SPEC_BARRIER is 1, and compiles to nothing otherwise.
 * NOTE(review): presumably a speculation barrier, going by the name.
 */
#if _Q_SPIN_SPEC_BARRIER == 1
#define spec_barrier() do { asm volatile("ori 31,31,0" ::: "memory"); } while (0)
#else
#define spec_barrier() do { } while (0)
#endif
|
|
|
|
/*
 * Report whether this CPU recorded a sleepy observation less than
 * pv_sleepy_lock_interval_ns ago. An expired record is reset to 0.
 *
 * Callers only invoke this when pv_sleepy_lock is true.
 */
static __always_inline bool recently_sleepy(void)
{
	u64 last_seen;

	/* A zero interval disables the time-based heuristic entirely. */
	if (!pv_sleepy_lock_interval_ns)
		return false;

	last_seen = this_cpu_read(sleepy_lock_seen_clock);
	if (!last_seen)
		return false;

	if (sched_clock() - last_seen < pv_sleepy_lock_interval_ns)
		return true;

	/* The record is too old; forget it. */
	this_cpu_write(sleepy_lock_seen_clock, 0);

	return false;
}
|
|
|
|
/*
 * Spin limit for lock stealing: steal_spins, scaled by
 * pv_sleepy_lock_factor when running paravirt on a sleepy lock.
 */
static __always_inline int get_steal_spins(bool paravirt, bool sleepy)
{
	int limit = steal_spins;

	if (paravirt && sleepy)
		limit *= pv_sleepy_lock_factor;

	return limit;
}
|
|
|
|
/*
 * Spin limit for stealing from an owner on a remote NUMA node:
 * remote_steal_spins, scaled by pv_sleepy_lock_factor when running
 * paravirt on a sleepy lock.
 */
static __always_inline int get_remote_steal_spins(bool paravirt, bool sleepy)
{
	int limit = remote_steal_spins;

	if (paravirt && sleepy)
		limit *= pv_sleepy_lock_factor;

	return limit;
}
|
|
|
|
/*
 * Spin limit for the queue head before it forces queueing: head_spins,
 * scaled by pv_sleepy_lock_factor when running paravirt on a sleepy lock.
 */
static __always_inline int get_head_spins(bool paravirt, bool sleepy)
{
	int limit = head_spins;

	if (paravirt && sleepy)
		limit *= pv_sleepy_lock_factor;

	return limit;
}
|
|
|
|
static inline u32 encode_tail_cpu(int cpu)
|
|
{
|
|
return (cpu + 1) << _Q_TAIL_CPU_OFFSET;
|
|
}
|
|
|
|
/*
 * Inverse of encode_tail_cpu(): extract the CPU number from the tail field
 * of lock word @val, undoing the +1 bias.
 */
static inline int decode_tail_cpu(u32 val)
{
	u32 biased = val >> _Q_TAIL_CPU_OFFSET;

	return biased - 1;
}
|
|
|
|
/* Extract the owner CPU field from lock word @val. */
static inline int get_owner_cpu(u32 val)
{
	u32 owner_bits = val & _Q_OWNER_CPU_MASK;

	return owner_bits >> _Q_OWNER_CPU_OFFSET;
}
|
|
|
|
/*
|
|
* Try to acquire the lock if it was not already locked. If the tail matches
|
|
* mytail then clear it, otherwise leave it unchnaged. Return previous value.
|
|
*
|
|
* This is used by the head of the queue to acquire the lock and clean up
|
|
* its tail if it was the last one queued.
|
|
*/
|
|
static __always_inline u32 trylock_clean_tail(struct qspinlock *lock, u32 tail)
|
|
{
|
|
u32 newval = queued_spin_encode_locked_val();
|
|
u32 prev, tmp;
|
|
|
|
asm volatile(
|
|
"1: lwarx %0,0,%2,%7 # trylock_clean_tail \n"
|
|
/* This test is necessary if there could be stealers */
|
|
" andi. %1,%0,%5 \n"
|
|
" bne 3f \n"
|
|
/* Test whether the lock tail == mytail */
|
|
" and %1,%0,%6 \n"
|
|
" cmpw 0,%1,%3 \n"
|
|
/* Merge the new locked value */
|
|
" or %1,%1,%4 \n"
|
|
" bne 2f \n"
|
|
/* If the lock tail matched, then clear it, otherwise leave it. */
|
|
" andc %1,%1,%6 \n"
|
|
"2: stwcx. %1,0,%2 \n"
|
|
" bne- 1b \n"
|
|
"\t" PPC_ACQUIRE_BARRIER " \n"
|
|
"3: \n"
|
|
: "=&r" (prev), "=&r" (tmp)
|
|
: "r" (&lock->val), "r"(tail), "r" (newval),
|
|
"i" (_Q_LOCKED_VAL),
|
|
"r" (_Q_TAIL_CPU_MASK),
|
|
"i" (_Q_SPIN_EH_HINT)
|
|
: "cr0", "memory");
|
|
|
|
return prev;
|
|
}
|
|
|
|
/*
|
|
* Publish our tail, replacing previous tail. Return previous value.
|
|
*
|
|
* This provides a release barrier for publishing node, this pairs with the
|
|
* acquire barrier in get_tail_qnode() when the next CPU finds this tail
|
|
* value.
|
|
*/
|
|
static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail)
|
|
{
|
|
u32 prev, tmp;
|
|
|
|
kcsan_release();
|
|
|
|
asm volatile(
|
|
"\t" PPC_RELEASE_BARRIER " \n"
|
|
"1: lwarx %0,0,%2 # publish_tail_cpu \n"
|
|
" andc %1,%0,%4 \n"
|
|
" or %1,%1,%3 \n"
|
|
" stwcx. %1,0,%2 \n"
|
|
" bne- 1b \n"
|
|
: "=&r" (prev), "=&r"(tmp)
|
|
: "r" (&lock->val), "r" (tail), "r"(_Q_TAIL_CPU_MASK)
|
|
: "cr0", "memory");
|
|
|
|
return prev;
|
|
}
|
|
|
|
/*
 * Atomically set _Q_MUST_Q_VAL in the lock word.
 *
 * The register that receives the loaded word is OR-ed in place before the
 * store, so the value returned is the word that was stored (with the bit
 * set), not the word as it was before the update.
 */
static __always_inline u32 set_mustq(struct qspinlock *lock)
{
	u32 word;

	asm volatile(
"1: lwarx %0,0,%1 # set_mustq \n"
" or %0,%0,%2 \n"
" stwcx. %0,0,%1 \n"
" bne- 1b \n"
	: "=&r" (word)
	: "r" (&lock->val), "r" (_Q_MUST_Q_VAL)
	: "cr0", "memory");

	return word;
}
|
|
|
|
/*
 * Atomically clear _Q_MUST_Q_VAL in the lock word.
 *
 * The register that receives the loaded word is masked in place before the
 * store, so the value returned is the word that was stored (with the bit
 * cleared), not the word as it was before the update.
 */
static __always_inline u32 clear_mustq(struct qspinlock *lock)
{
	u32 word;

	asm volatile(
"1: lwarx %0,0,%1 # clear_mustq \n"
" andc %0,%0,%2 \n"
" stwcx. %0,0,%1 \n"
" bne- 1b \n"
	: "=&r" (word)
	: "r" (&lock->val), "r" (_Q_MUST_Q_VAL)
	: "cr0", "memory");

	return word;
}
|
|
|
|
/*
 * Compare-and-swap the lock word from @old to @old | _Q_SLEEPY_VAL.
 *
 * @old must have _Q_LOCKED_VAL set and _Q_SLEEPY_VAL clear (enforced with
 * BUG_ON). Returns true if the lock word still equalled @old and the
 * sleepy bit was stored, false if the word had changed.
 */
static __always_inline bool try_set_sleepy(struct qspinlock *lock, u32 old)
{
	u32 seen;
	u32 sleepy_val = old | _Q_SLEEPY_VAL;

	BUG_ON(!(old & _Q_LOCKED_VAL));
	BUG_ON(old & _Q_SLEEPY_VAL);

	asm volatile(
"1: lwarx %0,0,%1 # try_set_sleepy \n"
" cmpw 0,%0,%2 \n"
" bne- 2f \n"
" stwcx. %3,0,%1 \n"
" bne- 1b \n"
"2: \n"
	: "=&r" (seen)
	: "r" (&lock->val), "r"(old), "r" (sleepy_val)
	: "cr0", "memory");

	return likely(seen == old);
}
|
|
|
|
/*
 * Record that the lock owner was seen preempted: refresh this CPU's
 * sleepy timestamp (if the interval heuristic is enabled) and try to mark
 * the lock word @val sleepy if it is not already. No-op unless
 * pv_sleepy_lock is set.
 */
static __always_inline void seen_sleepy_owner(struct qspinlock *lock, u32 val)
{
	if (!pv_sleepy_lock)
		return;

	if (pv_sleepy_lock_interval_ns)
		this_cpu_write(sleepy_lock_seen_clock, sched_clock());

	if (val & _Q_SLEEPY_VAL)
		return;

	/* Best effort; the result is ignored if the lock word has moved on. */
	try_set_sleepy(lock, val);
}
|
|
|
|
static __always_inline void seen_sleepy_lock(void)
|
|
{
|
|
if (pv_sleepy_lock && pv_sleepy_lock_interval_ns)
|
|
this_cpu_write(sleepy_lock_seen_clock, sched_clock());
|
|
}
|
|
|
|
static __always_inline void seen_sleepy_node(void)
|
|
{
|
|
if (pv_sleepy_lock) {
|
|
if (pv_sleepy_lock_interval_ns)
|
|
this_cpu_write(sleepy_lock_seen_clock, sched_clock());
|
|
/* Don't set sleepy because we likely have a stale val */
|
|
}
|
|
}
|
|
|
|
static struct qnode *get_tail_qnode(struct qspinlock *lock, int prev_cpu)
|
|
{
|
|
struct qnodes *qnodesp = per_cpu_ptr(&qnodes, prev_cpu);
|
|
int idx;
|
|
|
|
/*
|
|
* After publishing the new tail and finding a previous tail in the
|
|
* previous val (which is the control dependency), this barrier
|
|
* orders the release barrier in publish_tail_cpu performed by the
|
|
* last CPU, with subsequently looking at its qnode structures
|
|
* after the barrier.
|
|
*/
|
|
smp_acquire__after_ctrl_dep();
|
|
|
|
for (idx = 0; idx < MAX_NODES; idx++) {
|
|
struct qnode *qnode = &qnodesp->nodes[idx];
|
|
if (qnode->lock == lock)
|
|
return qnode;
|
|
}
|
|
|
|
BUG();
|
|
}
|
|
|
|
/* Called inside spin_begin(). Returns whether or not the vCPU was preempted. */
|
|
static __always_inline bool __yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt, bool mustq)
|
|
{
|
|
int owner;
|
|
u32 yield_count;
|
|
bool preempted = false;
|
|
|
|
BUG_ON(!(val & _Q_LOCKED_VAL));
|
|
|
|
if (!paravirt)
|
|
goto relax;
|
|
|
|
if (!pv_yield_owner)
|
|
goto relax;
|
|
|
|
owner = get_owner_cpu(val);
|
|
yield_count = yield_count_of(owner);
|
|
|
|
if ((yield_count & 1) == 0)
|
|
goto relax; /* owner vcpu is running */
|
|
|
|
spin_end();
|
|
|
|
seen_sleepy_owner(lock, val);
|
|
preempted = true;
|
|
|
|
/*
|
|
* Read the lock word after sampling the yield count. On the other side
|
|
* there may a wmb because the yield count update is done by the
|
|
* hypervisor preemption and the value update by the OS, however this
|
|
* ordering might reduce the chance of out of order accesses and
|
|
* improve the heuristic.
|
|
*/
|
|
smp_rmb();
|
|
|
|
if (READ_ONCE(lock->val) == val) {
|
|
if (mustq)
|
|
clear_mustq(lock);
|
|
yield_to_preempted(owner, yield_count);
|
|
if (mustq)
|
|
set_mustq(lock);
|
|
spin_begin();
|
|
|
|
/* Don't relax if we yielded. Maybe we should? */
|
|
return preempted;
|
|
}
|
|
spin_begin();
|
|
relax:
|
|
spin_cpu_relax();
|
|
|
|
return preempted;
|
|
}
|
|
|
|
/* Called inside spin_begin(). Returns whether or not the vCPU was preempted. */
|
|
static __always_inline bool yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt)
|
|
{
|
|
return __yield_to_locked_owner(lock, val, paravirt, false);
|
|
}
|
|
|
|
/* Called inside spin_begin(). Returns whether or not the vCPU was preempted. */
|
|
static __always_inline bool yield_head_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt)
|
|
{
|
|
bool mustq = false;
|
|
|
|
if ((val & _Q_MUST_Q_VAL) && pv_yield_allow_steal)
|
|
mustq = true;
|
|
|
|
return __yield_to_locked_owner(lock, val, paravirt, mustq);
|
|
}
|
|
|
|
/*
 * Called by the queue head: if the lock owner encoded in @val is currently
 * preempted, mark the next queued waiter (if any) as sleepy.
 *
 * Does nothing unless @paravirt and pv_yield_sleepy_owner are both set,
 * and never clears an already-set sleepy flag.
 */
static __always_inline void propagate_sleepy(struct qnode *node, u32 val, bool paravirt)
{
	struct qnode *next;

	if (!paravirt || !pv_yield_sleepy_owner)
		return;

	next = READ_ONCE(node->next);
	if (!next || next->sleepy)
		return;

	if (vcpu_is_preempted(get_owner_cpu(val)))
		next->sleepy = 1;
}
|
|
|
|
/* Called inside spin_begin() */
|
|
static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode *node, int prev_cpu, bool paravirt)
|
|
{
|
|
u32 yield_count;
|
|
bool preempted = false;
|
|
|
|
if (!paravirt)
|
|
goto relax;
|
|
|
|
if (!pv_yield_sleepy_owner)
|
|
goto yield_prev;
|
|
|
|
/*
|
|
* If the previous waiter was preempted it might not be able to
|
|
* propagate sleepy to us, so check the lock in that case too.
|
|
*/
|
|
if (node->sleepy || vcpu_is_preempted(prev_cpu)) {
|
|
u32 val = READ_ONCE(lock->val);
|
|
|
|
if (val & _Q_LOCKED_VAL) {
|
|
if (node->next && !node->next->sleepy) {
|
|
/*
|
|
* Propagate sleepy to next waiter. Only if
|
|
* owner is preempted, which allows the queue
|
|
* to become "non-sleepy" if vCPU preemption
|
|
* ceases to occur, even if the lock remains
|
|
* highly contended.
|
|
*/
|
|
if (vcpu_is_preempted(get_owner_cpu(val)))
|
|
node->next->sleepy = 1;
|
|
}
|
|
|
|
preempted = yield_to_locked_owner(lock, val, paravirt);
|
|
if (preempted)
|
|
return preempted;
|
|
}
|
|
node->sleepy = false;
|
|
}
|
|
|
|
yield_prev:
|
|
if (!pv_yield_prev)
|
|
goto relax;
|
|
|
|
yield_count = yield_count_of(prev_cpu);
|
|
if ((yield_count & 1) == 0)
|
|
goto relax; /* owner vcpu is running */
|
|
|
|
spin_end();
|
|
|
|
preempted = true;
|
|
seen_sleepy_node();
|
|
|
|
smp_rmb(); /* See __yield_to_locked_owner comment */
|
|
|
|
if (!READ_ONCE(node->locked)) {
|
|
yield_to_preempted(prev_cpu, yield_count);
|
|
spin_begin();
|
|
return preempted;
|
|
}
|
|
spin_begin();
|
|
|
|
relax:
|
|
spin_cpu_relax();
|
|
|
|
return preempted;
|
|
}
|
|
|
|
/*
 * Decide whether a would-be stealer should stop spinning and queue.
 *
 * Returns true once @iters reaches the steal limit, or - with CONFIG_NUMA -
 * once it reaches the (smaller) remote limit while the owner encoded in
 * @val sits on a different NUMA node than the current CPU.
 */
static __always_inline bool steal_break(u32 val, int iters, bool paravirt, bool sleepy)
{
	if (iters >= get_steal_spins(paravirt, sleepy))
		return true;

	if (!IS_ENABLED(CONFIG_NUMA))
		return false;

	if (iters < get_remote_steal_spins(paravirt, sleepy))
		return false;

	return numa_node_id() != cpu_to_node(get_owner_cpu(val));
}
|
|
|
|
/*
 * Spin for a bounded number of iterations trying to take @lock without
 * joining the queue.
 *
 * Returns true if the lock was acquired. Returns false if stealing is
 * disabled (steal_spins == 0), if the must-queue bit is seen, or once
 * steal_break() says the spin budget is exhausted; the caller then queues.
 */
static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool paravirt)
{
	bool seen_preempted = false;
	bool sleepy = false;
	int iters = 0;
	u32 val;

	if (!steal_spins) {
		/* XXX: should spin_on_preempted_owner do anything here? */
		return false;
	}

	/* Attempt to steal the lock */
	spin_begin();
	for (;;) {
		bool preempted = false;

		val = READ_ONCE(lock->val);
		if (val & _Q_MUST_Q_VAL)
			break;
		spec_barrier();

		if (likely(val & _Q_LOCKED_VAL)) {
			preempted = yield_to_locked_owner(lock, val, paravirt);
		} else {
			spin_end();
			if (__queued_spin_trylock_steal(lock))
				return true;
			spin_begin();
		}

		if (paravirt && pv_sleepy_lock) {
			if (!sleepy && (val & _Q_SLEEPY_VAL)) {
				seen_sleepy_lock();
				sleepy = true;
			} else if (!sleepy && recently_sleepy()) {
				sleepy = true;
			}

			/* Keep the lock marked sleepy once we saw preemption. */
			if (pv_sleepy_lock_sticky && seen_preempted &&
			    !(val & _Q_SLEEPY_VAL) &&
			    try_set_sleepy(lock, val))
				val |= _Q_SLEEPY_VAL;
		}

		if (preempted) {
			seen_preempted = true;
			sleepy = true;
		}

		/*
		 * pv_spin_on_preempted_owner don't increase iters while the
		 * owner is preempted -- we won't interfere with it by
		 * definition. This could introduce some latency issue if we
		 * continually observe preempted owners, but hopefully that's
		 * a rare corner case of a badly oversubscribed system.
		 */
		if (!preempted || !pv_spin_on_preempted_owner)
			iters++;

		if (steal_break(val, iters, paravirt, sleepy))
			break;
	}
	spin_end();

	return false;
}
|
|
|
|
/*
 * Acquire @lock by joining its MCS wait queue.
 *
 * @lock:     lock to acquire
 * @paravirt: whether the paravirt (vCPU yield / sleepy lock) heuristics
 *            are in effect
 *
 * Takes one of this CPU's qnodes, publishes it as the new tail, waits to
 * become head of the queue, then spins on the lock word until the lock can
 * be taken. Finally hands the head position to the next waiter, if any,
 * and releases the qnode. If all MAX_NODES qnodes of this CPU are already
 * in use, falls back to spinning on queued_spin_trylock().
 */
static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, bool paravirt)
{
	struct qnodes *qnodesp;
	struct qnode *next, *node;
	u32 val, old, tail;
	bool seen_preempted = false;
	bool sleepy = false;
	bool mustq = false;
	int idx;
	int iters = 0;

	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

	qnodesp = this_cpu_ptr(&qnodes);
	if (unlikely(qnodesp->count >= MAX_NODES)) {
		/* No free node on this CPU: spin without queueing. */
		spec_barrier();
		while (!queued_spin_trylock(lock))
			cpu_relax();
		return;
	}

	idx = qnodesp->count++;
	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();
	node = &qnodesp->nodes[idx];
	node->next = NULL;
	node->lock = lock;
	node->cpu = smp_processor_id();
	node->sleepy = 0;
	node->locked = 0;

	tail = encode_tail_cpu(node->cpu);

	/*
	 * Assign all attributes of a node before it can be published.
	 * Issues an lwsync, serving as a release barrier, as well as a
	 * compiler barrier.
	 */
	old = publish_tail_cpu(lock, tail);

	/*
	 * If there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_CPU_MASK) {
		int prev_cpu = decode_tail_cpu(old);
		struct qnode *prev = get_tail_qnode(lock, prev_cpu);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		/* Wait for mcs node lock to be released */
		spin_begin();
		while (!READ_ONCE(node->locked)) {
			spec_barrier();

			if (yield_to_prev(lock, node, prev_cpu, paravirt))
				seen_preempted = true;
		}
		spec_barrier();
		spin_end();

		smp_rmb(); /* acquire barrier for the mcs lock */

		/*
		 * Generic qspinlocks have this prefetch here, but it seems
		 * like it could cause additional line transitions because
		 * the waiter will keep loading from it.
		 */
		if (_Q_SPIN_PREFETCH_NEXT) {
			next = READ_ONCE(node->next);
			if (next)
				prefetchw(next);
		}
	}

	/* We're at the head of the waitqueue, wait for the lock. */
again:
	spin_begin();
	for (;;) {
		bool preempted;

		val = READ_ONCE(lock->val);
		if (!(val & _Q_LOCKED_VAL))
			break;
		spec_barrier();

		if (paravirt && pv_sleepy_lock && maybe_stealers) {
			if (!sleepy) {
				if (val & _Q_SLEEPY_VAL) {
					seen_sleepy_lock();
					sleepy = true;
				} else if (recently_sleepy()) {
					sleepy = true;
				}
			}
			if (pv_sleepy_lock_sticky && seen_preempted &&
			    !(val & _Q_SLEEPY_VAL)) {
				if (try_set_sleepy(lock, val))
					val |= _Q_SLEEPY_VAL;
			}
		}

		propagate_sleepy(node, val, paravirt);
		preempted = yield_head_to_locked_owner(lock, val, paravirt);
		/* Without stealers there is no need to ever force queueing. */
		if (!maybe_stealers)
			continue;

		if (preempted)
			seen_preempted = true;

		if (paravirt && preempted) {
			sleepy = true;

			if (!pv_spin_on_preempted_owner)
				iters++;
		} else {
			iters++;
		}

		/* Spun long enough: stop stealers from jumping the queue. */
		if (!mustq && iters >= get_head_spins(paravirt, sleepy)) {
			mustq = true;
			set_mustq(lock);
			val |= _Q_MUST_Q_VAL;
		}
	}
	spec_barrier();
	spin_end();

	/* If we're the last queued, must clean up the tail. */
	old = trylock_clean_tail(lock, tail);
	if (unlikely(old & _Q_LOCKED_VAL)) {
		BUG_ON(!maybe_stealers);
		goto again; /* Can only be true if maybe_stealers. */
	}

	if ((old & _Q_TAIL_CPU_MASK) == tail)
		goto release; /* We were the tail, no next. */

	/* There is a next, must wait for node->next != NULL (MCS protocol) */
	next = READ_ONCE(node->next);
	if (!next) {
		spin_begin();
		while (!(next = READ_ONCE(node->next)))
			cpu_relax();
		spin_end();
	}
	spec_barrier();

	/*
	 * Unlock the next mcs waiter node. Release barrier is not required
	 * here because the acquirer is only accessing the lock word, and
	 * the acquire barrier we took the lock with orders that update vs
	 * this store to locked. The corresponding barrier is the smp_rmb()
	 * acquire barrier for mcs lock, above.
	 */
	if (paravirt && pv_prod_head) {
		/* Sample the CPU before the store that lets next proceed. */
		int next_cpu = next->cpu;
		WRITE_ONCE(next->locked, 1);
		if (_Q_SPIN_MISO)
			asm volatile("miso" ::: "memory");
		if (vcpu_is_preempted(next_cpu))
			prod_cpu(next_cpu);
	} else {
		WRITE_ONCE(next->locked, 1);
		if (_Q_SPIN_MISO)
			asm volatile("miso" ::: "memory");
	}

release:
	/*
	 * Clear the lock pointer before releasing the node. get_tail_qnode()
	 * on another CPU scans every slot of our pool for a matching ->lock,
	 * including slots that are free or not yet initialised. If a stale
	 * pointer were left behind, a later nested acquisition of the same
	 * lock on this CPU (from an interrupt taken after count++ but before
	 * the outer node->lock is written) could lead that CPU to pick the
	 * stale slot and link itself to the wrong node. The real queue head
	 * would then wait forever for its ->next. The barrier ensures that
	 * there are no further stores to the node after it has been released.
	 */
	node->lock = NULL;
	barrier();
	qnodesp->count--; /* release the node */
}
|
|
|
|
void queued_spin_lock_slowpath(struct qspinlock *lock)
|
|
{
|
|
/*
|
|
* This looks funny, but it induces the compiler to inline both
|
|
* sides of the branch rather than share code as when the condition
|
|
* is passed as the paravirt argument to the functions.
|
|
*/
|
|
if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && is_shared_processor()) {
|
|
if (try_to_steal_lock(lock, true)) {
|
|
spec_barrier();
|
|
return;
|
|
}
|
|
queued_spin_lock_mcs_queue(lock, true);
|
|
} else {
|
|
if (try_to_steal_lock(lock, false)) {
|
|
spec_barrier();
|
|
return;
|
|
}
|
|
queued_spin_lock_mcs_queue(lock, false);
|
|
}
|
|
}
|
|
EXPORT_SYMBOL(queued_spin_lock_slowpath);
|
|
|
|
#ifdef CONFIG_PARAVIRT_SPINLOCKS
/* Paravirt spinlock initialisation hook; there is nothing to set up here. */
void pv_spinlocks_init(void)
{
}
#endif
|
|
|
|
#include <linux/debugfs.h>
|
|
/*
 * debugfs setter for steal_spins. Always returns 0.
 *
 * When maybe_stealers is not a compile-time constant, turning stealing on
 * or off also updates maybe_stealers, with an RCU grace period between the
 * two stores so that stealers and the no-stealers queue-head path never
 * run concurrently.
 */
static int steal_spins_set(void *data, u64 val)
{
#if _Q_SPIN_TRY_LOCK_STEAL == 1
	/* MAYBE_STEAL remains true */
	steal_spins = val;
#else
	static DEFINE_MUTEX(lock);
	bool enabling, disabling;

	/*
	 * The lock slow path has a !maybe_stealers case that can assume
	 * the head of queue will not see concurrent waiters. That waiter
	 * is unsafe in the presence of stealers, so must keep them away
	 * from one another.
	 */

	mutex_lock(&lock);
	enabling = val && !steal_spins;
	disabling = !val && steal_spins;

	if (enabling) {
		maybe_stealers = true;
		/* wait for queue head waiter to go away */
		synchronize_rcu();
		steal_spins = val;
	} else if (disabling) {
		steal_spins = val;
		/* wait for all possible stealers to go away */
		synchronize_rcu();
		maybe_stealers = false;
	} else {
		/* No on/off transition: just update the limit. */
		steal_spins = val;
	}
	mutex_unlock(&lock);
#endif

	return 0;
}
|
|
|
|
static int steal_spins_get(void *data, u64 *val)
|
|
{
|
|
*val = steal_spins;
|
|
|
|
return 0;
|
|
}
|
|
|
|
DEFINE_SIMPLE_ATTRIBUTE(fops_steal_spins, steal_spins_get, steal_spins_set, "%llu\n");
|
|
|
|
static int remote_steal_spins_set(void *data, u64 val)
|
|
{
|
|
remote_steal_spins = val;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int remote_steal_spins_get(void *data, u64 *val)
|
|
{
|
|
*val = remote_steal_spins;
|
|
|
|
return 0;
|
|
}
|
|
|
|
DEFINE_SIMPLE_ATTRIBUTE(fops_remote_steal_spins, remote_steal_spins_get, remote_steal_spins_set, "%llu\n");
|
|
|
|
static int head_spins_set(void *data, u64 val)
|
|
{
|
|
head_spins = val;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int head_spins_get(void *data, u64 *val)
|
|
{
|
|
*val = head_spins;
|
|
|
|
return 0;
|
|
}
|
|
|
|
DEFINE_SIMPLE_ATTRIBUTE(fops_head_spins, head_spins_get, head_spins_set, "%llu\n");
|
|
|
|
static int pv_yield_owner_set(void *data, u64 val)
|
|
{
|
|
pv_yield_owner = !!val;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int pv_yield_owner_get(void *data, u64 *val)
|
|
{
|
|
*val = pv_yield_owner;
|
|
|
|
return 0;
|
|
}
|
|
|
|
DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_owner, pv_yield_owner_get, pv_yield_owner_set, "%llu\n");
|
|
|
|
static int pv_yield_allow_steal_set(void *data, u64 val)
|
|
{
|
|
pv_yield_allow_steal = !!val;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int pv_yield_allow_steal_get(void *data, u64 *val)
|
|
{
|
|
*val = pv_yield_allow_steal;
|
|
|
|
return 0;
|
|
}
|
|
|
|
DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_allow_steal, pv_yield_allow_steal_get, pv_yield_allow_steal_set, "%llu\n");
|
|
|
|
static int pv_spin_on_preempted_owner_set(void *data, u64 val)
|
|
{
|
|
pv_spin_on_preempted_owner = !!val;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int pv_spin_on_preempted_owner_get(void *data, u64 *val)
|
|
{
|
|
*val = pv_spin_on_preempted_owner;
|
|
|
|
return 0;
|
|
}
|
|
|
|
DEFINE_SIMPLE_ATTRIBUTE(fops_pv_spin_on_preempted_owner, pv_spin_on_preempted_owner_get, pv_spin_on_preempted_owner_set, "%llu\n");
|
|
|
|
static int pv_sleepy_lock_set(void *data, u64 val)
|
|
{
|
|
pv_sleepy_lock = !!val;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int pv_sleepy_lock_get(void *data, u64 *val)
|
|
{
|
|
*val = pv_sleepy_lock;
|
|
|
|
return 0;
|
|
}
|
|
|
|
DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock, pv_sleepy_lock_get, pv_sleepy_lock_set, "%llu\n");
|
|
|
|
static int pv_sleepy_lock_sticky_set(void *data, u64 val)
|
|
{
|
|
pv_sleepy_lock_sticky = !!val;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int pv_sleepy_lock_sticky_get(void *data, u64 *val)
|
|
{
|
|
*val = pv_sleepy_lock_sticky;
|
|
|
|
return 0;
|
|
}
|
|
|
|
DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_sticky, pv_sleepy_lock_sticky_get, pv_sleepy_lock_sticky_set, "%llu\n");
|
|
|
|
static int pv_sleepy_lock_interval_ns_set(void *data, u64 val)
|
|
{
|
|
pv_sleepy_lock_interval_ns = val;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int pv_sleepy_lock_interval_ns_get(void *data, u64 *val)
|
|
{
|
|
*val = pv_sleepy_lock_interval_ns;
|
|
|
|
return 0;
|
|
}
|
|
|
|
DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_interval_ns, pv_sleepy_lock_interval_ns_get, pv_sleepy_lock_interval_ns_set, "%llu\n");
|
|
|
|
static int pv_sleepy_lock_factor_set(void *data, u64 val)
|
|
{
|
|
pv_sleepy_lock_factor = val;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int pv_sleepy_lock_factor_get(void *data, u64 *val)
|
|
{
|
|
*val = pv_sleepy_lock_factor;
|
|
|
|
return 0;
|
|
}
|
|
|
|
DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_factor, pv_sleepy_lock_factor_get, pv_sleepy_lock_factor_set, "%llu\n");
|
|
|
|
static int pv_yield_prev_set(void *data, u64 val)
|
|
{
|
|
pv_yield_prev = !!val;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int pv_yield_prev_get(void *data, u64 *val)
|
|
{
|
|
*val = pv_yield_prev;
|
|
|
|
return 0;
|
|
}
|
|
|
|
DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, pv_yield_prev_set, "%llu\n");
|
|
|
|
static int pv_yield_sleepy_owner_set(void *data, u64 val)
|
|
{
|
|
pv_yield_sleepy_owner = !!val;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int pv_yield_sleepy_owner_get(void *data, u64 *val)
|
|
{
|
|
*val = pv_yield_sleepy_owner;
|
|
|
|
return 0;
|
|
}
|
|
|
|
DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_sleepy_owner, pv_yield_sleepy_owner_get, pv_yield_sleepy_owner_set, "%llu\n");
|
|
|
|
static int pv_prod_head_set(void *data, u64 val)
|
|
{
|
|
pv_prod_head = !!val;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int pv_prod_head_get(void *data, u64 *val)
|
|
{
|
|
*val = pv_prod_head;
|
|
|
|
return 0;
|
|
}
|
|
|
|
DEFINE_SIMPLE_ATTRIBUTE(fops_pv_prod_head, pv_prod_head_get, pv_prod_head_set, "%llu\n");
|
|
|
|
static __init int spinlock_debugfs_init(void)
|
|
{
|
|
debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins);
|
|
debugfs_create_file("qspl_remote_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_remote_steal_spins);
|
|
debugfs_create_file("qspl_head_spins", 0600, arch_debugfs_dir, NULL, &fops_head_spins);
|
|
if (is_shared_processor()) {
|
|
debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner);
|
|
debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal);
|
|
debugfs_create_file("qspl_pv_spin_on_preempted_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_spin_on_preempted_owner);
|
|
debugfs_create_file("qspl_pv_sleepy_lock", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock);
|
|
debugfs_create_file("qspl_pv_sleepy_lock_sticky", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_sticky);
|
|
debugfs_create_file("qspl_pv_sleepy_lock_interval_ns", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_interval_ns);
|
|
debugfs_create_file("qspl_pv_sleepy_lock_factor", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_factor);
|
|
debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev);
|
|
debugfs_create_file("qspl_pv_yield_sleepy_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_sleepy_owner);
|
|
debugfs_create_file("qspl_pv_prod_head", 0600, arch_debugfs_dir, NULL, &fops_pv_prod_head);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
device_initcall(spinlock_debugfs_init);
|