cd0272fab7
In an overcommitted guest where some vCPUs have to be halted to make forward progress in other areas, it is highly likely that a vCPU later in the spinlock queue will be spinning while the ones earlier in the queue would have been halted. The spinning in the later vCPUs is then just a waste of precious CPU cycles because they are not going to get the lock soon as the earlier ones have to be woken up and take their turn to get the lock.

This patch implements an adaptive spinning mechanism where the vCPU will call pv_wait() if the previous vCPU is not running.

Linux kernel builds were run in KVM guest on an 8-socket, 4 cores/socket Westmere-EX system and a 4-socket, 8 cores/socket Haswell-EX system. Both systems are configured to have 32 physical CPUs. The kernel build times before and after the patch were:

                      Westmere                Haswell
  Patch          32 vCPUs    48 vCPUs    32 vCPUs    48 vCPUs
  -----          --------    --------    --------    --------
  Before patch    3m02.3s     5m00.2s     1m43.7s     3m03.5s
  After patch     3m03.0s     4m37.5s     1m43.0s     2m47.2s

For 32 vCPUs, this patch doesn't cause any noticeable change in performance. For 48 vCPUs (over-committed), there is about 8% performance improvement.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1447114167-47185-8-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
301 lines
7.8 KiB
C
301 lines
7.8 KiB
C
/*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* Authors: Waiman Long <waiman.long@hpe.com>
|
|
*/
|
|
|
|
/*
|
|
* When queued spinlock statistical counters are enabled, the following
|
|
* debugfs files will be created for reporting the counter values:
|
|
*
|
|
* <debugfs>/qlockstat/
|
|
* pv_hash_hops - average # of hops per hashing operation
|
|
* pv_kick_unlock - # of vCPU kicks issued at unlock time
|
|
* pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
|
|
* pv_latency_kick - average latency (ns) of vCPU kick operation
|
|
* pv_latency_wake - average latency (ns) from vCPU kick to wakeup
|
|
* pv_lock_stealing - # of lock stealing operations
|
|
* pv_spurious_wakeup - # of spurious wakeups
|
|
* pv_wait_again - # of vCPU wait's that happened after a vCPU kick
|
|
* pv_wait_early - # of early vCPU wait's
|
|
* pv_wait_head - # of vCPU wait's at the queue head
|
|
* pv_wait_node - # of vCPU wait's at a non-head queue node
|
|
*
|
|
* Writing to the "reset_counters" file will reset all the above counter
|
|
* values.
|
|
*
|
|
* These statistical counters are implemented as per-cpu variables which are
|
|
* summed and computed whenever the corresponding debugfs files are read. This
|
|
* minimizes added overhead making the counters usable even in a production
|
|
* environment.
|
|
*
|
|
* There may be a slight difference between pv_kick_wake and pv_kick_unlock.
|
|
*/
|
|
/*
 * IDs of the queued spinlock statistical counters. Each ID indexes the
 * per-cpu counter array and the table of debugfs file names.
 */
enum qlock_stats {
	/* Hashing and kick accounting */
	qstat_pv_hash_hops = 0,
	qstat_pv_kick_unlock,
	qstat_pv_kick_wake,

	/* Accumulated latencies (ns) */
	qstat_pv_latency_kick,
	qstat_pv_latency_wake,

	/* Event counts */
	qstat_pv_lock_stealing,
	qstat_pv_spurious_wakeup,
	qstat_pv_wait_again,
	qstat_pv_wait_early,
	qstat_pv_wait_head,
	qstat_pv_wait_node,

	/* Total number of statistical counters */
	qstat_num,

	/* ID of the "reset_counters" file; it has no counter of its own */
	qstat_reset_cnts = qstat_num,
};
|
|
|
|
#ifdef CONFIG_QUEUED_LOCK_STAT
|
|
/*
|
|
* Collect pvqspinlock statistics
|
|
*/
|
|
#include <linux/debugfs.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/fs.h>
|
|
|
|
static const char * const qstat_names[qstat_num + 1] = {
|
|
[qstat_pv_hash_hops] = "pv_hash_hops",
|
|
[qstat_pv_kick_unlock] = "pv_kick_unlock",
|
|
[qstat_pv_kick_wake] = "pv_kick_wake",
|
|
[qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
|
|
[qstat_pv_latency_kick] = "pv_latency_kick",
|
|
[qstat_pv_latency_wake] = "pv_latency_wake",
|
|
[qstat_pv_lock_stealing] = "pv_lock_stealing",
|
|
[qstat_pv_wait_again] = "pv_wait_again",
|
|
[qstat_pv_wait_early] = "pv_wait_early",
|
|
[qstat_pv_wait_head] = "pv_wait_head",
|
|
[qstat_pv_wait_node] = "pv_wait_node",
|
|
[qstat_reset_cnts] = "reset_counters",
|
|
};
|
|
|
|
/*
|
|
* Per-cpu counters
|
|
*/
|
|
static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
|
|
static DEFINE_PER_CPU(u64, pv_kick_time);
|
|
|
|
/*
|
|
* Function to read and return the qlock statistical counter values
|
|
*
|
|
* The following counters are handled specially:
|
|
* 1. qstat_pv_latency_kick
|
|
* Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
|
|
* 2. qstat_pv_latency_wake
|
|
* Average wake latency (ns) = pv_latency_wake/pv_kick_wake
|
|
* 3. qstat_pv_hash_hops
|
|
* Average hops/hash = pv_hash_hops/pv_kick_unlock
|
|
*/
|
|
static ssize_t qstat_read(struct file *file, char __user *user_buf,
|
|
size_t count, loff_t *ppos)
|
|
{
|
|
char buf[64];
|
|
int cpu, counter, len;
|
|
u64 stat = 0, kicks = 0;
|
|
|
|
/*
|
|
* Get the counter ID stored in file->f_inode->i_private
|
|
*/
|
|
if (!file->f_inode) {
|
|
WARN_ON_ONCE(1);
|
|
return -EBADF;
|
|
}
|
|
counter = (long)(file->f_inode->i_private);
|
|
|
|
if (counter >= qstat_num)
|
|
return -EBADF;
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
stat += per_cpu(qstats[counter], cpu);
|
|
/*
|
|
* Need to sum additional counter for some of them
|
|
*/
|
|
switch (counter) {
|
|
|
|
case qstat_pv_latency_kick:
|
|
case qstat_pv_hash_hops:
|
|
kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
|
|
break;
|
|
|
|
case qstat_pv_latency_wake:
|
|
kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu);
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (counter == qstat_pv_hash_hops) {
|
|
u64 frac;
|
|
|
|
frac = 100ULL * do_div(stat, kicks);
|
|
frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
|
|
|
|
/*
|
|
* Return a X.XX decimal number
|
|
*/
|
|
len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac);
|
|
} else {
|
|
/*
|
|
* Round to the nearest ns
|
|
*/
|
|
if ((counter == qstat_pv_latency_kick) ||
|
|
(counter == qstat_pv_latency_wake)) {
|
|
stat = 0;
|
|
if (kicks)
|
|
stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
|
|
}
|
|
len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat);
|
|
}
|
|
|
|
return simple_read_from_buffer(user_buf, count, ppos, buf, len);
|
|
}
|
|
|
|
/*
|
|
* Function to handle write request
|
|
*
|
|
* When counter = reset_cnts, reset all the counter values.
|
|
* Since the counter updates aren't atomic, the resetting is done twice
|
|
* to make sure that the counters are very likely to be all cleared.
|
|
*/
|
|
static ssize_t qstat_write(struct file *file, const char __user *user_buf,
|
|
size_t count, loff_t *ppos)
|
|
{
|
|
int cpu;
|
|
|
|
/*
|
|
* Get the counter ID stored in file->f_inode->i_private
|
|
*/
|
|
if (!file->f_inode) {
|
|
WARN_ON_ONCE(1);
|
|
return -EBADF;
|
|
}
|
|
if ((long)(file->f_inode->i_private) != qstat_reset_cnts)
|
|
return count;
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
int i;
|
|
unsigned long *ptr = per_cpu_ptr(qstats, cpu);
|
|
|
|
for (i = 0 ; i < qstat_num; i++)
|
|
WRITE_ONCE(ptr[i], 0);
|
|
for (i = 0 ; i < qstat_num; i++)
|
|
WRITE_ONCE(ptr[i], 0);
|
|
}
|
|
return count;
|
|
}
|
|
|
|
/*
|
|
* Debugfs data structures
|
|
*/
|
|
static const struct file_operations fops_qstat = {
|
|
.read = qstat_read,
|
|
.write = qstat_write,
|
|
.llseek = default_llseek,
|
|
};
|
|
|
|
/*
|
|
* Initialize debugfs for the qspinlock statistical counters
|
|
*/
|
|
static int __init init_qspinlock_stat(void)
|
|
{
|
|
struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
|
|
int i;
|
|
|
|
if (!d_qstat) {
|
|
pr_warn("Could not create 'qlockstat' debugfs directory\n");
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Create the debugfs files
|
|
*
|
|
* As reading from and writing to the stat files can be slow, only
|
|
* root is allowed to do the read/write to limit impact to system
|
|
* performance.
|
|
*/
|
|
for (i = 0; i < qstat_num; i++)
|
|
debugfs_create_file(qstat_names[i], 0400, d_qstat,
|
|
(void *)(long)i, &fops_qstat);
|
|
|
|
debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
|
|
(void *)(long)qstat_reset_cnts, &fops_qstat);
|
|
return 0;
|
|
}
|
|
fs_initcall(init_qspinlock_stat);
|
|
|
|
/*
|
|
* Increment the PV qspinlock statistical counters
|
|
*/
|
|
static inline void qstat_inc(enum qlock_stats stat, bool cond)
|
|
{
|
|
if (cond)
|
|
this_cpu_inc(qstats[stat]);
|
|
}
|
|
|
|
/*
|
|
* PV hash hop count
|
|
*/
|
|
static inline void qstat_hop(int hopcnt)
|
|
{
|
|
this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
|
|
}
|
|
|
|
/*
|
|
* Replacement function for pv_kick()
|
|
*/
|
|
static inline void __pv_kick(int cpu)
|
|
{
|
|
u64 start = sched_clock();
|
|
|
|
per_cpu(pv_kick_time, cpu) = start;
|
|
pv_kick(cpu);
|
|
this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start);
|
|
}
|
|
|
|
/*
|
|
* Replacement function for pv_wait()
|
|
*/
|
|
static inline void __pv_wait(u8 *ptr, u8 val)
|
|
{
|
|
u64 *pkick_time = this_cpu_ptr(&pv_kick_time);
|
|
|
|
*pkick_time = 0;
|
|
pv_wait(ptr, val);
|
|
if (*pkick_time) {
|
|
this_cpu_add(qstats[qstat_pv_latency_wake],
|
|
sched_clock() - *pkick_time);
|
|
qstat_inc(qstat_pv_kick_wake, true);
|
|
}
|
|
}
|
|
|
|
/*
 * From this point on, pv_kick() and pv_wait() expand to the
 * statistics-collecting replacement functions.
 */
#define pv_kick(c)	__pv_kick(c)
#define pv_wait(p, v)	__pv_wait(p, v)
|
|
|
|
/*
|
|
* PV unfair trylock count tracking function
|
|
*/
|
|
static inline int qstat_spin_steal_lock(struct qspinlock *lock)
|
|
{
|
|
int ret = pv_queued_spin_steal_lock(lock);
|
|
|
|
qstat_inc(qstat_pv_lock_stealing, ret);
|
|
return ret;
|
|
}
|
|
#undef queued_spin_trylock
|
|
#define queued_spin_trylock(l) qstat_spin_steal_lock(l)
|
|
|
|
#else /* CONFIG_QUEUED_LOCK_STAT */
|
|
|
|
/*
 * Statistics collection is compiled out: the counting helpers do nothing
 */
static inline void qstat_inc(enum qlock_stats stat, bool cond)
{
}

static inline void qstat_hop(int hopcnt)
{
}
|
|
|
|
#endif /* CONFIG_QUEUED_LOCK_STAT */
|