Merge branch 'lockref' (locked reference counts)
Merge lockref infrastructure code by me and Waiman Long.

I already merged some of the preparatory patches that didn't actually do any semantic changes earlier, but this merges the actual _reason_ for those preparatory patches.

The "lockref" structure is a combination "spinlock and reference count" that allows optimized reference count accesses. In particular, it guarantees that the reference count will be updated AS IF the spinlock was held, but using atomic accesses that cover both the reference count and the spinlock words, we can often do the update without actually having to take the lock.

This allows us to avoid the nastiest cases of spinlock contention on large machines under heavy pathname lookup loads. When updating the dentry reference counts on a large system, we'll still end up with the cache line bouncing around, but that's much less noticeable than actually having to spin waiting for the lock.

* lockref:
  lockref: implement lockless reference count updates using cmpxchg()
  lockref: uninline lockref helper functions
  vfs: reimplement d_rcu_to_refcount() using lockref_get_or_lock()
  vfs: use lockref_get_not_zero() for optimistic lockless dget_parent()
  lockref: add 'lockref_get_or_lock()' helper
This commit is contained in:
commit
fc6d0b0376
@ -16,6 +16,7 @@ config X86_64
|
||||
def_bool y
|
||||
depends on 64BIT
|
||||
select X86_DEV_DMA_OPS
|
||||
select ARCH_USE_CMPXCHG_LOCKREF
|
||||
|
||||
### Arch settings
|
||||
config X86
|
||||
|
@ -34,6 +34,11 @@
|
||||
# define UNLOCK_LOCK_PREFIX
|
||||
#endif
|
||||
|
||||
/* Reports whether a by-value copy of a ticket lock has head == tail, i.e. reads as unlocked. */
static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
|
||||
{
|
||||
return lock.tickets.head == lock.tickets.tail;
|
||||
}
|
||||
|
||||
/*
|
||||
* Ticket locks are conceptually two parts, one indicating the current head of
|
||||
* the queue, and the other indicating the current tail. The lock is acquired
|
||||
|
17
fs/dcache.c
17
fs/dcache.c
@ -611,8 +611,23 @@ static inline void __dget(struct dentry *dentry)
|
||||
|
||||
struct dentry *dget_parent(struct dentry *dentry)
|
||||
{
|
||||
int gotref;
|
||||
struct dentry *ret;
|
||||
|
||||
/*
|
||||
* Do optimistic parent lookup without any
|
||||
* locking.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
ret = ACCESS_ONCE(dentry->d_parent);
|
||||
gotref = lockref_get_not_zero(&ret->d_lockref);
|
||||
rcu_read_unlock();
|
||||
if (likely(gotref)) {
|
||||
if (likely(ret == ACCESS_ONCE(dentry->d_parent)))
|
||||
return ret;
|
||||
dput(ret);
|
||||
}
|
||||
|
||||
repeat:
|
||||
/*
|
||||
* Don't need rcu_dereference because we re-check it was correct under
|
||||
@ -1771,7 +1786,7 @@ static noinline enum slow_d_compare slow_dentry_cmp(
|
||||
* without taking d_lock and checking d_seq sequence count against @seq
|
||||
* returned here.
|
||||
*
|
||||
* A refcount may be taken on the found dentry with the __d_rcu_to_refcount
|
||||
* A refcount may be taken on the found dentry with the d_rcu_to_refcount
|
||||
* function.
|
||||
*
|
||||
* Alternatively, __d_lookup_rcu may be called again to look up the child of
|
||||
|
90
fs/namei.c
90
fs/namei.c
@ -494,6 +494,50 @@ static inline void unlock_rcu_walk(void)
|
||||
br_read_unlock(&vfsmount_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* When we move over from the RCU domain to properly refcounted
|
||||
* long-lived dentries, we need to check the sequence numbers
|
||||
* we got before lookup very carefully.
|
||||
*
|
||||
* We cannot blindly increment a dentry refcount - even if it
|
||||
* is not locked - if it is zero, because it may have gone
|
||||
* through the final d_kill() logic already.
|
||||
*
|
||||
* So for a zero refcount, we need to get the spinlock (which is
|
||||
* safe even for a dead dentry because the de-allocation is
|
||||
* RCU-delayed), and check the sequence count under the lock.
|
||||
*
|
||||
* Once we have checked the sequence count, we know it is live,
|
||||
* and since we hold the spinlock it cannot die from under us.
|
||||
*
|
||||
* In contrast, if the reference count wasn't zero, we can just
|
||||
* increment the lockref without having to take the spinlock.
|
||||
* Even if the sequence number ends up being stale, we haven't
|
||||
* gone through the final dput() and killed the dentry yet.
|
||||
*/
|
||||
/*
 * @dentry: dentry to take a reference on
 * @validate: seqcount that is re-checked with read_seqcount_retry()
 * @seq: sequence value sampled earlier by the caller
 * Returns 0 with a reference held, or -ECHILD (nothing held) if the sequence changed.
 */
static inline int d_rcu_to_refcount(struct dentry *dentry, seqcount_t *validate, unsigned seq)
|
||||
{
|
||||
int gotref;
|
||||
|
||||
gotref = lockref_get_or_lock(&dentry->d_lockref); /* nonzero: reference taken; zero: count was zero and the lock is held instead */
|
||||
|
||||
/* Does the sequence number still match? */
|
||||
if (read_seqcount_retry(validate, seq)) {
|
||||
if (gotref)
|
||||
dput(dentry); /* stale: give back the reference taken above */
|
||||
else
|
||||
spin_unlock(&dentry->d_lock); /* stale: drop the lock (d_lock presumably aliases d_lockref.lock -- confirm) */
|
||||
return -ECHILD;
|
||||
}
|
||||
|
||||
/* Get the ref now, if we couldn't get it originally */
|
||||
if (!gotref) {
|
||||
dentry->d_lockref.count++; /* plain increment: the lock from lockref_get_or_lock() is still held */
|
||||
spin_unlock(&dentry->d_lock);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* unlazy_walk - try to switch to ref-walk mode.
|
||||
* @nd: nameidata pathwalk data
|
||||
@ -518,29 +562,28 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
|
||||
nd->root.dentry != fs->root.dentry)
|
||||
goto err_root;
|
||||
}
|
||||
spin_lock(&parent->d_lock);
|
||||
|
||||
/*
|
||||
* For a negative lookup, the lookup sequence point is the parents
|
||||
* sequence point, and it only needs to revalidate the parent dentry.
|
||||
*
|
||||
* For a positive lookup, we need to move both the parent and the
|
||||
* dentry from the RCU domain to be properly refcounted. And the
|
||||
* sequence number in the dentry validates *both* dentry counters,
|
||||
* since we checked the sequence number of the parent after we got
|
||||
* the child sequence number. So we know the parent must still
|
||||
* be valid if the child sequence number is still valid.
|
||||
*/
|
||||
if (!dentry) {
|
||||
if (!__d_rcu_to_refcount(parent, nd->seq))
|
||||
goto err_parent;
|
||||
if (d_rcu_to_refcount(parent, &parent->d_seq, nd->seq) < 0)
|
||||
goto err_root;
|
||||
BUG_ON(nd->inode != parent->d_inode);
|
||||
} else {
|
||||
if (dentry->d_parent != parent)
|
||||
if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0)
|
||||
goto err_root;
|
||||
if (d_rcu_to_refcount(parent, &dentry->d_seq, nd->seq) < 0)
|
||||
goto err_parent;
|
||||
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
|
||||
if (!__d_rcu_to_refcount(dentry, nd->seq))
|
||||
goto err_child;
|
||||
/*
|
||||
* If the sequence check on the child dentry passed, then
|
||||
* the child has not been removed from its parent. This
|
||||
* means the parent dentry must be valid and able to take
|
||||
* a reference at this point.
|
||||
*/
|
||||
BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
|
||||
BUG_ON(!parent->d_lockref.count);
|
||||
parent->d_lockref.count++;
|
||||
spin_unlock(&dentry->d_lock);
|
||||
}
|
||||
spin_unlock(&parent->d_lock);
|
||||
if (want_root) {
|
||||
path_get(&nd->root);
|
||||
spin_unlock(&fs->lock);
|
||||
@ -551,10 +594,8 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
|
||||
nd->flags &= ~LOOKUP_RCU;
|
||||
return 0;
|
||||
|
||||
err_child:
|
||||
spin_unlock(&dentry->d_lock);
|
||||
err_parent:
|
||||
spin_unlock(&parent->d_lock);
|
||||
dput(dentry);
|
||||
err_root:
|
||||
if (want_root)
|
||||
spin_unlock(&fs->lock);
|
||||
@ -585,14 +626,11 @@ static int complete_walk(struct nameidata *nd)
|
||||
nd->flags &= ~LOOKUP_RCU;
|
||||
if (!(nd->flags & LOOKUP_ROOT))
|
||||
nd->root.mnt = NULL;
|
||||
spin_lock(&dentry->d_lock);
|
||||
if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
|
||||
spin_unlock(&dentry->d_lock);
|
||||
|
||||
if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0) {
|
||||
unlock_rcu_walk();
|
||||
return -ECHILD;
|
||||
}
|
||||
BUG_ON(nd->inode != dentry->d_inode);
|
||||
spin_unlock(&dentry->d_lock);
|
||||
mntget(nd->path.mnt);
|
||||
unlock_rcu_walk();
|
||||
}
|
||||
|
@ -304,28 +304,6 @@ extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
|
||||
extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
|
||||
const struct qstr *name, unsigned *seq);
|
||||
|
||||
/**
|
||||
* __d_rcu_to_refcount - take a refcount on dentry if sequence check is ok
|
||||
* @dentry: dentry to take a ref on
|
||||
* @seq: seqcount to verify against
|
||||
* Returns: 0 on failure, else 1.
|
||||
*
|
||||
* __d_rcu_to_refcount operates on a dentry,seq pair that was returned
|
||||
* by __d_lookup_rcu, to get a reference on an rcu-walk dentry.
|
||||
*/
|
||||
static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
assert_spin_locked(&dentry->d_lock);
|
||||
if (!read_seqcount_retry(&dentry->d_seq, seq)) {
|
||||
ret = 1;
|
||||
dentry->d_lockref.count++;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline unsigned d_count(const struct dentry *dentry)
|
||||
{
|
||||
return dentry->d_lockref.count;
|
||||
|
@ -17,55 +17,20 @@
|
||||
#include <linux/spinlock.h>
|
||||
|
||||
struct lockref {
|
||||
spinlock_t lock;
|
||||
unsigned int count;
|
||||
union {
|
||||
#ifdef CONFIG_CMPXCHG_LOCKREF
|
||||
aligned_u64 lock_count;
|
||||
#endif
|
||||
struct {
|
||||
spinlock_t lock;
|
||||
unsigned int count;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
* lockref_get - Increments reference count unconditionally
|
||||
* @lockcnt: pointer to lockref structure
|
||||
*
|
||||
* This operation is only valid if you already hold a reference
|
||||
* to the object, so you know the count cannot be zero.
|
||||
*/
|
||||
static inline void lockref_get(struct lockref *lockref)
|
||||
{
|
||||
spin_lock(&lockref->lock);
|
||||
lockref->count++;
|
||||
spin_unlock(&lockref->lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* lockref_get_not_zero - Increments count unless the count is 0
|
||||
* @lockcnt: pointer to lockref structure
|
||||
* Return: 1 if count updated successfully or 0 if count is 0
|
||||
*/
|
||||
static inline int lockref_get_not_zero(struct lockref *lockref)
|
||||
{
|
||||
int retval = 0;
|
||||
|
||||
spin_lock(&lockref->lock);
|
||||
if (lockref->count) {
|
||||
lockref->count++;
|
||||
retval = 1;
|
||||
}
|
||||
spin_unlock(&lockref->lock);
|
||||
return retval;
|
||||
}
|
||||
|
||||
/**
|
||||
* lockref_put_or_lock - decrements count unless count <= 1 before decrement
|
||||
* @lockcnt: pointer to lockref structure
|
||||
* Return: 1 if count updated successfully or 0 if count <= 1 and lock taken
|
||||
*/
|
||||
static inline int lockref_put_or_lock(struct lockref *lockref)
|
||||
{
|
||||
spin_lock(&lockref->lock);
|
||||
if (lockref->count <= 1)
|
||||
return 0;
|
||||
lockref->count--;
|
||||
spin_unlock(&lockref->lock);
|
||||
return 1;
|
||||
}
|
||||
extern void lockref_get(struct lockref *);
|
||||
extern int lockref_get_not_zero(struct lockref *);
|
||||
extern int lockref_get_or_lock(struct lockref *);
|
||||
extern int lockref_put_or_lock(struct lockref *);
|
||||
|
||||
#endif /* __LINUX_LOCKREF_H */
|
||||
|
10
lib/Kconfig
10
lib/Kconfig
@ -48,6 +48,16 @@ config STMP_DEVICE
|
||||
config PERCPU_RWSEM
|
||||
boolean
|
||||
|
||||
config ARCH_USE_CMPXCHG_LOCKREF
|
||||
bool
|
||||
|
||||
config CMPXCHG_LOCKREF
|
||||
def_bool y if ARCH_USE_CMPXCHG_LOCKREF
|
||||
depends on SMP
|
||||
depends on !GENERIC_LOCKBREAK
|
||||
depends on !DEBUG_SPINLOCK
|
||||
depends on !DEBUG_LOCK_ALLOC
|
||||
|
||||
config CRC_CCITT
|
||||
tristate "CRC-CCITT functions"
|
||||
help
|
||||
|
@ -20,6 +20,7 @@ lib-$(CONFIG_MMU) += ioremap.o
|
||||
lib-$(CONFIG_SMP) += cpumask.o
|
||||
|
||||
lib-y += kobject.o klist.o
|
||||
obj-y += lockref.o
|
||||
|
||||
obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
|
||||
bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
|
||||
|
127
lib/lockref.c
Normal file
127
lib/lockref.c
Normal file
@ -0,0 +1,127 @@
|
||||
#include <linux/export.h>
|
||||
#include <linux/lockref.h>
|
||||
|
||||
#ifdef CONFIG_CMPXCHG_LOCKREF
|
||||
|
||||
/*
|
||||
* Note that the "cmpxchg()" reloads the "old" value for the
|
||||
* failure case.
*
* CODE runs on every iteration with the local copies "old", "new" and
* "prev" in scope and is expected to modify "new"; a "break" or "return"
* inside CODE leaves the loop before the cmpxchg() is attempted.
* SUCCESS runs when the cmpxchg() returned the value saved in "prev",
* i.e. "new" was installed.
* The macro reads a variable named "lockref" from the invoking function
* and loops only while the snapshotted lock value reads as unlocked.
|
||||
*/
|
||||
#define CMPXCHG_LOOP(CODE, SUCCESS) do { \
|
||||
struct lockref old; \
|
||||
BUILD_BUG_ON(sizeof(old) != 8); \
|
||||
old.lock_count = ACCESS_ONCE(lockref->lock_count); \
|
||||
while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) { \
|
||||
struct lockref new = old, prev = old; \
|
||||
CODE \
|
||||
old.lock_count = cmpxchg(&lockref->lock_count, \
|
||||
old.lock_count, new.lock_count); \
|
||||
if (likely(old.lock_count == prev.lock_count)) { \
|
||||
SUCCESS; \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
|
||||
#define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0) /* no lockless path: callers fall straight through to their spinlock code */
|
||||
|
||||
#endif /* CONFIG_CMPXCHG_LOCKREF */
|
||||
|
||||
/**
|
||||
* lockref_get - Increments reference count unconditionally
|
||||
* @lockref: pointer to lockref structure
|
||||
*
|
||||
* This operation is only valid if you already hold a reference
|
||||
* to the object, so you know the count cannot be zero.
|
||||
*/
|
||||
void lockref_get(struct lockref *lockref)
|
||||
{
|
||||
CMPXCHG_LOOP( /* lockless attempt; expands to nothing without CONFIG_CMPXCHG_LOCKREF */
|
||||
new.count++; /* CODE: bump the count in the local copy "new" */
|
||||
,
|
||||
return; /* SUCCESS: "new" was installed by cmpxchg(), nothing left to do */
|
||||
);
|
||||
|
||||
spin_lock(&lockref->lock); /* locked path: the lock value read as held, or the lockless path is compiled out */
|
||||
lockref->count++;
|
||||
spin_unlock(&lockref->lock);
|
||||
}
|
||||
EXPORT_SYMBOL(lockref_get);
|
||||
|
||||
/**
|
||||
* lockref_get_not_zero - Increments count unless the count is 0
|
||||
* @lockref: pointer to lockref structure
|
||||
* Return: 1 if count updated successfully or 0 if count was zero
|
||||
*/
|
||||
int lockref_get_not_zero(struct lockref *lockref)
|
||||
{
|
||||
int retval;
|
||||
|
||||
CMPXCHG_LOOP( /* lockless attempt; expands to nothing without CONFIG_CMPXCHG_LOCKREF */
|
||||
new.count++; /* only the local copy changes until cmpxchg() succeeds */
|
||||
if (!old.count)
|
||||
return 0; /* snapshot shows a zero count: fail without modifying the lockref */
|
||||
,
|
||||
return 1; /* SUCCESS: incremented count was installed */
|
||||
);
|
||||
|
||||
spin_lock(&lockref->lock); /* locked path: same test-and-increment under the spinlock */
|
||||
retval = 0;
|
||||
if (lockref->count) {
|
||||
lockref->count++;
|
||||
retval = 1;
|
||||
}
|
||||
spin_unlock(&lockref->lock);
|
||||
return retval;
|
||||
}
|
||||
EXPORT_SYMBOL(lockref_get_not_zero);
|
||||
|
||||
/**
|
||||
* lockref_get_or_lock - Increments count unless the count is 0
|
||||
* @lockref: pointer to lockref structure
|
||||
* Return: 1 if count updated successfully or 0 if count was zero
|
||||
* and we got the lock instead.
|
||||
*/
|
||||
int lockref_get_or_lock(struct lockref *lockref)
|
||||
{
|
||||
CMPXCHG_LOOP( /* lockless attempt; expands to nothing without CONFIG_CMPXCHG_LOCKREF */
|
||||
new.count++; /* only the local copy changes until cmpxchg() succeeds */
|
||||
if (!old.count)
|
||||
break; /* snapshot shows a zero count: leave the loop and take the lock below */
|
||||
,
|
||||
return 1; /* SUCCESS: incremented count was installed, lock never taken */
|
||||
);
|
||||
|
||||
spin_lock(&lockref->lock);
|
||||
if (!lockref->count)
|
||||
return 0; /* returns with lockref->lock still held; the caller must unlock */
|
||||
lockref->count++;
|
||||
spin_unlock(&lockref->lock);
|
||||
return 1;
|
||||
}
|
||||
EXPORT_SYMBOL(lockref_get_or_lock);
|
||||
|
||||
/**
|
||||
* lockref_put_or_lock - decrements count unless count <= 1 before decrement
|
||||
* @lockref: pointer to lockref structure
|
||||
* Return: 1 if count updated successfully or 0 if count <= 1 and lock taken
|
||||
*/
|
||||
int lockref_put_or_lock(struct lockref *lockref)
|
||||
{
|
||||
CMPXCHG_LOOP( /* lockless attempt; expands to nothing without CONFIG_CMPXCHG_LOCKREF */
|
||||
new.count--; /* only the local copy changes until cmpxchg() succeeds */
|
||||
if (old.count <= 1)
|
||||
break; /* snapshot shows count <= 1: leave the loop and take the lock below */
|
||||
,
|
||||
return 1; /* SUCCESS: decremented count was installed, lock never taken */
|
||||
);
|
||||
|
||||
spin_lock(&lockref->lock);
|
||||
if (lockref->count <= 1)
|
||||
return 0; /* returns with lockref->lock still held and the count unchanged */
|
||||
lockref->count--;
|
||||
spin_unlock(&lockref->lock);
|
||||
return 1;
|
||||
}
|
||||
EXPORT_SYMBOL(lockref_put_or_lock);
|
Loading…
Reference in New Issue
Block a user