2019-06-01 10:08:51 +02:00
// SPDX-License-Identifier: GPL-2.0-only
2017-06-23 22:11:52 -07:00
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <len.brown@intel.com>
 */
2022-04-15 21:19:51 +02:00
# include <linux/cpufreq.h>
2017-07-28 14:45:03 +02:00
# include <linux/delay.h>
# include <linux/ktime.h>
2017-06-23 22:11:52 -07:00
# include <linux/math64.h>
# include <linux/percpu.h>
2020-09-03 15:23:29 -07:00
# include <linux/rcupdate.h>
2022-04-15 21:19:51 +02:00
# include <linux/sched/isolation.h>
# include <linux/sched/topology.h>
# include <linux/smp.h>
# include <linux/syscore_ops.h>
2022-04-15 21:19:59 +02:00
# include <asm/cpu.h>
2022-04-15 21:19:51 +02:00
# include <asm/cpu_device_id.h>
# include <asm/intel-family.h>
2017-06-23 22:11:52 -07:00
x86 / CPU: Always show current CPU frequency in /proc/cpuinfo
After commit 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get()
for /proc/cpuinfo "cpu MHz"") the "cpu MHz" number in /proc/cpuinfo
on x86 can be either the nominal CPU frequency (which is constant)
or the frequency most recently requested by a scaling governor in
cpufreq, depending on the cpufreq configuration. That is somewhat
inconsistent and is different from what it was before 4.13, so in
order to restore the previous behavior, make it report the current
CPU frequency like the scaling_cur_freq sysfs file in cpufreq.
To that end, modify the /proc/cpuinfo implementation on x86 to use
aperfmperf_snapshot_khz() to snapshot the APERF and MPERF feedback
registers, if available, and use their values to compute the CPU
frequency to be reported as "cpu MHz".
However, do that carefully enough to avoid accumulating delays that
lead to unacceptable access times for /proc/cpuinfo on systems with
many CPUs. Run aperfmperf_snapshot_khz() once on all CPUs
asynchronously at the /proc/cpuinfo open time, add a single delay
upfront (if necessary) at that point and simply compute the current
frequency while running show_cpuinfo() for each individual CPU.
Also, to avoid slowing down /proc/cpuinfo accesses too much, reduce
the default delay between consecutive APERF and MPERF reads to 10 ms,
which should be sufficient to get large enough numbers for the
frequency computation in all cases.
Fixes: 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
2017-11-15 02:13:40 +01:00
# include "cpu.h"
2022-04-15 21:19:56 +02:00
/*
 * One pair of counter values, 'aperf' and 'mperf'.
 * NOTE(review): presumably raw IA32_APERF / IA32_MPERF readings; nothing in
 * the visible part of this file reads or writes them yet -- confirm against
 * the users of cpu_samples.
 */
struct aperfmperf {
u64 aperf ;
u64 mperf ;
} ;
/* Per-CPU instance of the pair above. */
static DEFINE_PER_CPU_SHARED_ALIGNED ( struct aperfmperf , cpu_samples ) ;
2017-06-23 22:11:52 -07:00
struct aperfmperf_sample {
unsigned int khz ;
x86/cpu: Avoid cpuinfo-induced IPI pileups
The aperfmperf_snapshot_cpu() function is invoked upon access to
/proc/cpuinfo, and it does do an early exit if the specified CPU has
recently done a snapshot. Unfortunately, the indication that a snapshot
has been completed is set in an IPI handler, and the execution of this
handler can be delayed by any number of unfortunate events. This means
that a system that starts a number of applications, each of which
parses /proc/cpuinfo, can suffer from an smp_call_function_single()
storm, especially given that each access to /proc/cpuinfo invokes
smp_call_function_single() for all CPUs. Please note that this is not
theoretical speculation. Note also that one CPU's pending IPI serves
all requests, so there is no point in ever having more than one IPI
pending to a given CPU.
This commit therefore suppresses duplicate IPIs to a given CPU via a
new ->scfpending field in the aperfmperf_sample structure. This field
is set to the value one if an IPI is pending to the corresponding CPU
and to zero otherwise.
The aperfmperf_snapshot_cpu() function uses atomic_xchg() to set this
field to the value one and sample the old value. If this function's
"wait" parameter is zero, smp_call_function_single() is called only if
the old value of the ->scfpending field was zero. The IPI handler uses
atomic_set_release() to set this new field to zero just before returning,
so that the prior stores into the aperfmperf_sample structure are seen
by future requests that get to the atomic_xchg(). Future requests that
pass the elapsed-time check are ordered by the fact that on x86 loads act
as acquire loads, just as was the case prior to this change. The return
value is based off of the age of the prior snapshot, just as before.
Reported-by: Dave Jones <davej@codemonkey.org.uk>
[ paulmck: Allow /proc/cpuinfo to take advantage of arch_freq_get_on_cpu(). ]
[ paulmck: Add comment on memory barrier. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <x86@kernel.org>
2020-09-02 13:19:12 -07:00
atomic_t scfpending ;
2017-07-28 14:45:03 +02:00
ktime_t time ;
2017-06-23 22:11:52 -07:00
u64 aperf ;
u64 mperf ;
} ;
static DEFINE_PER_CPU ( struct aperfmperf_sample , samples ) ;
2017-07-28 14:45:03 +02:00
/* A snapshot younger than this (in ms) is reused instead of being refreshed. */
#define APERFMPERF_CACHE_THRESHOLD_MS	10

/*
 * Delay (in ms) between consecutive APERF and MPERF reads.  10 ms keeps
 * /proc/cpuinfo accesses fast while still giving large enough counter
 * deltas for the frequency computation.
 * NOTE(review): the user of this constant is outside the visible part of
 * the file -- confirm.
 */
#define APERFMPERF_REFRESH_DELAY_MS	10

/* A snapshot older than this (in ms) is reported as too old by the caller. */
#define APERFMPERF_STALE_THRESHOLD_MS	1000
2017-06-23 22:11:52 -07:00
/*
* aperfmperf_snapshot_khz ( )
* On the current CPU , snapshot APERF , MPERF , and jiffies
* unless we already did it within 10 ms
* calculate kHz , save snapshot
*/
static void aperfmperf_snapshot_khz ( void * dummy )
{
u64 aperf , aperf_delta ;
u64 mperf , mperf_delta ;
struct aperfmperf_sample * s = this_cpu_ptr ( & samples ) ;
2017-08-08 14:12:49 -07:00
unsigned long flags ;
2017-06-23 22:11:52 -07:00
2017-08-08 14:12:49 -07:00
local_irq_save ( flags ) ;
2017-06-23 22:11:52 -07:00
rdmsrl ( MSR_IA32_APERF , aperf ) ;
rdmsrl ( MSR_IA32_MPERF , mperf ) ;
2017-08-08 14:12:49 -07:00
local_irq_restore ( flags ) ;
2017-06-23 22:11:52 -07:00
aperf_delta = aperf - s - > aperf ;
mperf_delta = mperf - s - > mperf ;
/*
* There is no architectural guarantee that MPERF
* increments faster than we can read it .
*/
if ( mperf_delta = = 0 )
return ;
x86 / CPU: Always show current CPU frequency in /proc/cpuinfo
After commit 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get()
for /proc/cpuinfo "cpu MHz"") the "cpu MHz" number in /proc/cpuinfo
on x86 can be either the nominal CPU frequency (which is constant)
or the frequency most recently requested by a scaling governor in
cpufreq, depending on the cpufreq configuration. That is somewhat
inconsistent and is different from what it was before 4.13, so in
order to restore the previous behavior, make it report the current
CPU frequency like the scaling_cur_freq sysfs file in cpufreq.
To that end, modify the /proc/cpuinfo implementation on x86 to use
aperfmperf_snapshot_khz() to snapshot the APERF and MPERF feedback
registers, if available, and use their values to compute the CPU
frequency to be reported as "cpu MHz".
However, do that carefully enough to avoid accumulating delays that
lead to unacceptable access times for /proc/cpuinfo on systems with
many CPUs. Run aperfmperf_snapshot_khz() once on all CPUs
asynchronously at the /proc/cpuinfo open time, add a single delay
upfront (if necessary) at that point and simply compute the current
frequency while running show_cpuinfo() for each individual CPU.
Also, to avoid slowing down /proc/cpuinfo accesses too much, reduce
the default delay between consecutive APERF and MPERF reads to 10 ms,
which should be sufficient to get large enough numbers for the
frequency computation in all cases.
Fixes: 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
2017-11-15 02:13:40 +01:00
s - > time = ktime_get ( ) ;
2017-06-23 22:11:52 -07:00
s - > aperf = aperf ;
s - > mperf = mperf ;
x86 / CPU: Always show current CPU frequency in /proc/cpuinfo
After commit 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get()
for /proc/cpuinfo "cpu MHz"") the "cpu MHz" number in /proc/cpuinfo
on x86 can be either the nominal CPU frequency (which is constant)
or the frequency most recently requested by a scaling governor in
cpufreq, depending on the cpufreq configuration. That is somewhat
inconsistent and is different from what it was before 4.13, so in
order to restore the previous behavior, make it report the current
CPU frequency like the scaling_cur_freq sysfs file in cpufreq.
To that end, modify the /proc/cpuinfo implementation on x86 to use
aperfmperf_snapshot_khz() to snapshot the APERF and MPERF feedback
registers, if available, and use their values to compute the CPU
frequency to be reported as "cpu MHz".
However, do that carefully enough to avoid accumulating delays that
lead to unacceptable access times for /proc/cpuinfo on systems with
many CPUs. Run aperfmperf_snapshot_khz() once on all CPUs
asynchronously at the /proc/cpuinfo open time, add a single delay
upfront (if necessary) at that point and simply compute the current
frequency while running show_cpuinfo() for each individual CPU.
Also, to avoid slowing down /proc/cpuinfo accesses too much, reduce
the default delay between consecutive APERF and MPERF reads to 10 ms,
which should be sufficient to get large enough numbers for the
frequency computation in all cases.
Fixes: 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
2017-11-15 02:13:40 +01:00
s - > khz = div64_u64 ( ( cpu_khz * aperf_delta ) , mperf_delta ) ;
x86/cpu: Avoid cpuinfo-induced IPI pileups
The aperfmperf_snapshot_cpu() function is invoked upon access to
/proc/cpuinfo, and it does do an early exit if the specified CPU has
recently done a snapshot. Unfortunately, the indication that a snapshot
has been completed is set in an IPI handler, and the execution of this
handler can be delayed by any number of unfortunate events. This means
that a system that starts a number of applications, each of which
parses /proc/cpuinfo, can suffer from an smp_call_function_single()
storm, especially given that each access to /proc/cpuinfo invokes
smp_call_function_single() for all CPUs. Please note that this is not
theoretical speculation. Note also that one CPU's pending IPI serves
all requests, so there is no point in ever having more than one IPI
pending to a given CPU.
This commit therefore suppresses duplicate IPIs to a given CPU via a
new ->scfpending field in the aperfmperf_sample structure. This field
is set to the value one if an IPI is pending to the corresponding CPU
and to zero otherwise.
The aperfmperf_snapshot_cpu() function uses atomic_xchg() to set this
field to the value one and sample the old value. If this function's
"wait" parameter is zero, smp_call_function_single() is called only if
the old value of the ->scfpending field was zero. The IPI handler uses
atomic_set_release() to set this new field to zero just before returning,
so that the prior stores into the aperfmperf_sample structure are seen
by future requests that get to the atomic_xchg(). Future requests that
pass the elapsed-time check are ordered by the fact that on x86 loads act
as acquire loads, just as was the case prior to this change. The return
value is based off of the age of the prior snapshot, just as before.
Reported-by: Dave Jones <davej@codemonkey.org.uk>
[ paulmck: Allow /proc/cpuinfo to take advantage of arch_freq_get_on_cpu(). ]
[ paulmck: Add comment on memory barrier. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <x86@kernel.org>
2020-09-02 13:19:12 -07:00
atomic_set_release ( & s - > scfpending , 0 ) ;
2017-06-23 22:11:52 -07:00
}
x86 / CPU: Always show current CPU frequency in /proc/cpuinfo
After commit 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get()
for /proc/cpuinfo "cpu MHz"") the "cpu MHz" number in /proc/cpuinfo
on x86 can be either the nominal CPU frequency (which is constant)
or the frequency most recently requested by a scaling governor in
cpufreq, depending on the cpufreq configuration. That is somewhat
inconsistent and is different from what it was before 4.13, so in
order to restore the previous behavior, make it report the current
CPU frequency like the scaling_cur_freq sysfs file in cpufreq.
To that end, modify the /proc/cpuinfo implementation on x86 to use
aperfmperf_snapshot_khz() to snapshot the APERF and MPERF feedback
registers, if available, and use their values to compute the CPU
frequency to be reported as "cpu MHz".
However, do that carefully enough to avoid accumulating delays that
lead to unacceptable access times for /proc/cpuinfo on systems with
many CPUs. Run aperfmperf_snapshot_khz() once on all CPUs
asynchronously at the /proc/cpuinfo open time, add a single delay
upfront (if necessary) at that point and simply compute the current
frequency while running show_cpuinfo() for each individual CPU.
Also, to avoid slowing down /proc/cpuinfo accesses too much, reduce
the default delay between consecutive APERF and MPERF reads to 10 ms,
which should be sufficient to get large enough numbers for the
frequency computation in all cases.
Fixes: 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
2017-11-15 02:13:40 +01:00
/*
 * Request a fresh APERF/MPERF snapshot on @cpu unless the cached one is
 * recent enough.
 *
 * @cpu:  CPU whose sample should be refreshed
 * @now:  current time, compared against the time of the cached sample
 * @wait: passed through to smp_call_function_single(); when true the
 *        cross-call is issued even if another one is already pending
 *
 * Returns true if the cached sample is younger than
 * APERFMPERF_CACHE_THRESHOLD_MS (no cross-call is made), otherwise returns
 * whether the sample that existed before this call is at most
 * APERFMPERF_STALE_THRESHOLD_MS old.
 */
static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
{
	s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu));
	struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);

	/* Don't bother re-computing within the cache threshold time. */
	if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
		return true;

	/*
	 * One pending IPI serves all requesters: only send another one if
	 * none was pending (old flag value 0) or the caller needs to wait
	 * for the result.  The handler clears the flag when it is done.
	 */
	if (!atomic_xchg(&s->scfpending, 1) || wait)
		smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);

	/* Return false if the previous iteration was too long ago. */
	return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
}
2017-07-28 14:45:03 +02:00
x86 / CPU: Always show current CPU frequency in /proc/cpuinfo
After commit 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get()
for /proc/cpuinfo "cpu MHz"") the "cpu MHz" number in /proc/cpuinfo
on x86 can be either the nominal CPU frequency (which is constant)
or the frequency most recently requested by a scaling governor in
cpufreq, depending on the cpufreq configuration. That is somewhat
inconsistent and is different from what it was before 4.13, so in
order to restore the previous behavior, make it report the current
CPU frequency like the scaling_cur_freq sysfs file in cpufreq.
To that end, modify the /proc/cpuinfo implementation on x86 to use
aperfmperf_snapshot_khz() to snapshot the APERF and MPERF feedback
registers, if available, and use their values to compute the CPU
frequency to be reported as "cpu MHz".
However, do that carefully enough to avoid accumulating delays that
lead to unacceptable access times for /proc/cpuinfo on systems with
many CPUs. Run aperfmperf_snapshot_khz() once on all CPUs
asynchronously at the /proc/cpuinfo open time, add a single delay
upfront (if necessary) at that point and simply compute the current
frequency while running show_cpuinfo() for each individual CPU.
Also, to avoid slowing down /proc/cpuinfo accesses too much, reduce
the default delay between consecutive APERF and MPERF reads to 10 ms,
which should be sufficient to get large enough numbers for the
frequency computation in all cases.
Fixes: 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
2017-11-15 02:13:40 +01:00
unsigned int aperfmperf_get_khz ( int cpu )
{
2017-06-23 22:11:52 -07:00
if ( ! cpu_khz )
return 0 ;
2019-03-29 19:52:59 +01:00
if ( ! boot_cpu_has ( X86_FEATURE_APERFMPERF ) )
2017-06-23 22:11:52 -07:00
return 0 ;
2022-02-07 16:59:06 +01:00
if ( ! housekeeping_cpu ( cpu , HK_TYPE_MISC ) )
2019-05-15 09:59:00 +03:00
return 0 ;
2020-09-03 15:23:29 -07:00
if ( rcu_is_idle_cpu ( cpu ) )
return 0 ; /* Idle CPUs are completely uninteresting. */
x86 / CPU: Always show current CPU frequency in /proc/cpuinfo
After commit 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get()
for /proc/cpuinfo "cpu MHz"") the "cpu MHz" number in /proc/cpuinfo
on x86 can be either the nominal CPU frequency (which is constant)
or the frequency most recently requested by a scaling governor in
cpufreq, depending on the cpufreq configuration. That is somewhat
inconsistent and is different from what it was before 4.13, so in
order to restore the previous behavior, make it report the current
CPU frequency like the scaling_cur_freq sysfs file in cpufreq.
To that end, modify the /proc/cpuinfo implementation on x86 to use
aperfmperf_snapshot_khz() to snapshot the APERF and MPERF feedback
registers, if available, and use their values to compute the CPU
frequency to be reported as "cpu MHz".
However, do that carefully enough to avoid accumulating delays that
lead to unacceptable access times for /proc/cpuinfo on systems with
many CPUs. Run aperfmperf_snapshot_khz() once on all CPUs
asynchronously at the /proc/cpuinfo open time, add a single delay
upfront (if necessary) at that point and simply compute the current
frequency while running show_cpuinfo() for each individual CPU.
Also, to avoid slowing down /proc/cpuinfo accesses too much, reduce
the default delay between consecutive APERF and MPERF reads to 10 ms,
which should be sufficient to get large enough numbers for the
frequency computation in all cases.
Fixes: 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
2017-11-15 02:13:40 +01:00
aperfmperf_snapshot_cpu ( cpu , ktime_get ( ) , true ) ;
return per_cpu ( samples . khz , cpu ) ;
}
2017-11-13 02:15:39 +01:00
x86 / CPU: Always show current CPU frequency in /proc/cpuinfo
After commit 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get()
for /proc/cpuinfo "cpu MHz"") the "cpu MHz" number in /proc/cpuinfo
on x86 can be either the nominal CPU frequency (which is constant)
or the frequency most recently requested by a scaling governor in
cpufreq, depending on the cpufreq configuration. That is somewhat
inconsistent and is different from what it was before 4.13, so in
order to restore the previous behavior, make it report the current
CPU frequency like the scaling_cur_freq sysfs file in cpufreq.
To that end, modify the /proc/cpuinfo implementation on x86 to use
aperfmperf_snapshot_khz() to snapshot the APERF and MPERF feedback
registers, if available, and use their values to compute the CPU
frequency to be reported as "cpu MHz".
However, do that carefully enough to avoid accumulating delays that
lead to unacceptable access times for /proc/cpuinfo on systems with
many CPUs. Run aperfmperf_snapshot_khz() once on all CPUs
asynchronously at the /proc/cpuinfo open time, add a single delay
upfront (if necessary) at that point and simply compute the current
frequency while running show_cpuinfo() for each individual CPU.
Also, to avoid slowing down /proc/cpuinfo accesses too much, reduce
the default delay between consecutive APERF and MPERF reads to 10 ms,
which should be sufficient to get large enough numbers for the
frequency computation in all cases.
Fixes: 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
2017-11-15 02:13:40 +01:00
void arch_freq_prepare_all ( void )
{
ktime_t now = ktime_get ( ) ;
bool wait = false ;
int cpu ;
if ( ! cpu_khz )
return ;
2019-03-29 19:52:59 +01:00
if ( ! boot_cpu_has ( X86_FEATURE_APERFMPERF ) )
x86 / CPU: Always show current CPU frequency in /proc/cpuinfo
After commit 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get()
for /proc/cpuinfo "cpu MHz"") the "cpu MHz" number in /proc/cpuinfo
on x86 can be either the nominal CPU frequency (which is constant)
or the frequency most recently requested by a scaling governor in
cpufreq, depending on the cpufreq configuration. That is somewhat
inconsistent and is different from what it was before 4.13, so in
order to restore the previous behavior, make it report the current
CPU frequency like the scaling_cur_freq sysfs file in cpufreq.
To that end, modify the /proc/cpuinfo implementation on x86 to use
aperfmperf_snapshot_khz() to snapshot the APERF and MPERF feedback
registers, if available, and use their values to compute the CPU
frequency to be reported as "cpu MHz".
However, do that carefully enough to avoid accumulating delays that
lead to unacceptable access times for /proc/cpuinfo on systems with
many CPUs. Run aperfmperf_snapshot_khz() once on all CPUs
asynchronously at the /proc/cpuinfo open time, add a single delay
upfront (if necessary) at that point and simply compute the current
frequency while running show_cpuinfo() for each individual CPU.
Also, to avoid slowing down /proc/cpuinfo accesses too much, reduce
the default delay between consecutive APERF and MPERF reads to 10 ms,
which should be sufficient to get large enough numbers for the
frequency computation in all cases.
Fixes: 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
2017-11-15 02:13:40 +01:00
return ;
2019-05-15 09:59:00 +03:00
for_each_online_cpu ( cpu ) {
2022-02-07 16:59:06 +01:00
if ( ! housekeeping_cpu ( cpu , HK_TYPE_MISC ) )
2019-05-15 09:59:00 +03:00
continue ;
2020-09-03 15:23:29 -07:00
if ( rcu_is_idle_cpu ( cpu ) )
continue ; /* Idle CPUs are completely uninteresting. */
x86 / CPU: Always show current CPU frequency in /proc/cpuinfo
After commit 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get()
for /proc/cpuinfo "cpu MHz"") the "cpu MHz" number in /proc/cpuinfo
on x86 can be either the nominal CPU frequency (which is constant)
or the frequency most recently requested by a scaling governor in
cpufreq, depending on the cpufreq configuration. That is somewhat
inconsistent and is different from what it was before 4.13, so in
order to restore the previous behavior, make it report the current
CPU frequency like the scaling_cur_freq sysfs file in cpufreq.
To that end, modify the /proc/cpuinfo implementation on x86 to use
aperfmperf_snapshot_khz() to snapshot the APERF and MPERF feedback
registers, if available, and use their values to compute the CPU
frequency to be reported as "cpu MHz".
However, do that carefully enough to avoid accumulating delays that
lead to unacceptable access times for /proc/cpuinfo on systems with
many CPUs. Run aperfmperf_snapshot_khz() once on all CPUs
asynchronously at the /proc/cpuinfo open time, add a single delay
upfront (if necessary) at that point and simply compute the current
frequency while running show_cpuinfo() for each individual CPU.
Also, to avoid slowing down /proc/cpuinfo accesses too much, reduce
the default delay between consecutive APERF and MPERF reads to 10 ms,
which should be sufficient to get large enough numbers for the
frequency computation in all cases.
Fixes: 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
2017-11-15 02:13:40 +01:00
if ( ! aperfmperf_snapshot_cpu ( cpu , now , false ) )
wait = true ;
2019-05-15 09:59:00 +03:00
}
x86 / CPU: Always show current CPU frequency in /proc/cpuinfo
After commit 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get()
for /proc/cpuinfo "cpu MHz"") the "cpu MHz" number in /proc/cpuinfo
on x86 can be either the nominal CPU frequency (which is constant)
or the frequency most recently requested by a scaling governor in
cpufreq, depending on the cpufreq configuration. That is somewhat
inconsistent and is different from what it was before 4.13, so in
order to restore the previous behavior, make it report the current
CPU frequency like the scaling_cur_freq sysfs file in cpufreq.
To that end, modify the /proc/cpuinfo implementation on x86 to use
aperfmperf_snapshot_khz() to snapshot the APERF and MPERF feedback
registers, if available, and use their values to compute the CPU
frequency to be reported as "cpu MHz".
However, do that carefully enough to avoid accumulating delays that
lead to unacceptable access times for /proc/cpuinfo on systems with
many CPUs. Run aperfmperf_snapshot_khz() once on all CPUs
asynchronously at the /proc/cpuinfo open time, add a single delay
upfront (if necessary) at that point and simply compute the current
frequency while running show_cpuinfo() for each individual CPU.
Also, to avoid slowing down /proc/cpuinfo accesses too much, reduce
the default delay between consecutive APERF and MPERF reads to 10 ms,
which should be sufficient to get large enough numbers for the
frequency computation in all cases.
Fixes: 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
2017-11-15 02:13:40 +01:00
if ( wait )
msleep ( APERFMPERF_REFRESH_DELAY_MS ) ;
}
unsigned int arch_freq_get_on_cpu ( int cpu )
{
x86/cpu: Avoid cpuinfo-induced IPI pileups
The aperfmperf_snapshot_cpu() function is invoked upon access to
/proc/cpuinfo, and it does do an early exit if the specified CPU has
recently done a snapshot. Unfortunately, the indication that a snapshot
has been completed is set in an IPI handler, and the execution of this
handler can be delayed by any number of unfortunate events. This means
that a system that starts a number of applications, each of which
parses /proc/cpuinfo, can suffer from an smp_call_function_single()
storm, especially given that each access to /proc/cpuinfo invokes
smp_call_function_single() for all CPUs. Please note that this is not
theoretical speculation. Note also that one CPU's pending IPI serves
all requests, so there is no point in ever having more than one IPI
pending to a given CPU.
This commit therefore suppresses duplicate IPIs to a given CPU via a
new ->scfpending field in the aperfmperf_sample structure. This field
is set to the value one if an IPI is pending to the corresponding CPU
and to zero otherwise.
The aperfmperf_snapshot_cpu() function uses atomic_xchg() to set this
field to the value one and sample the old value. If this function's
"wait" parameter is zero, smp_call_function_single() is called only if
the old value of the ->scfpending field was zero. The IPI handler uses
atomic_set_release() to set this new field to zero just before returning,
so that the prior stores into the aperfmperf_sample structure are seen
by future requests that get to the atomic_xchg(). Future requests that
pass the elapsed-time check are ordered by the fact that on x86 loads act
as acquire loads, just as was the case prior to this change. The return
value is based off of the age of the prior snapshot, just as before.
Reported-by: Dave Jones <davej@codemonkey.org.uk>
[ paulmck: Allow /proc/cpuinfo to take advantage of arch_freq_get_on_cpu(). ]
[ paulmck: Add comment on memory barrier. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <x86@kernel.org>
2020-09-02 13:19:12 -07:00
struct aperfmperf_sample * s = per_cpu_ptr ( & samples , cpu ) ;
x86 / CPU: Always show current CPU frequency in /proc/cpuinfo
After commit 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get()
for /proc/cpuinfo "cpu MHz"") the "cpu MHz" number in /proc/cpuinfo
on x86 can be either the nominal CPU frequency (which is constant)
or the frequency most recently requested by a scaling governor in
cpufreq, depending on the cpufreq configuration. That is somewhat
inconsistent and is different from what it was before 4.13, so in
order to restore the previous behavior, make it report the current
CPU frequency like the scaling_cur_freq sysfs file in cpufreq.
To that end, modify the /proc/cpuinfo implementation on x86 to use
aperfmperf_snapshot_khz() to snapshot the APERF and MPERF feedback
registers, if available, and use their values to compute the CPU
frequency to be reported as "cpu MHz".
However, do that carefully enough to avoid accumulating delays that
lead to unacceptable access times for /proc/cpuinfo on systems with
many CPUs. Run aperfmperf_snapshot_khz() once on all CPUs
asynchronously at the /proc/cpuinfo open time, add a single delay
upfront (if necessary) at that point and simply compute the current
frequency while running show_cpuinfo() for each individual CPU.
Also, to avoid slowing down /proc/cpuinfo accesses too much, reduce
the default delay between consecutive APERF and MPERF reads to 10 ms,
which should be sufficient to get large enough numbers for the
frequency computation in all cases.
Fixes: 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
2017-11-15 02:13:40 +01:00
if ( ! cpu_khz )
return 0 ;
2019-03-29 19:52:59 +01:00
if ( ! boot_cpu_has ( X86_FEATURE_APERFMPERF ) )
x86 / CPU: Always show current CPU frequency in /proc/cpuinfo
After commit 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get()
for /proc/cpuinfo "cpu MHz"") the "cpu MHz" number in /proc/cpuinfo
on x86 can be either the nominal CPU frequency (which is constant)
or the frequency most recently requested by a scaling governor in
cpufreq, depending on the cpufreq configuration. That is somewhat
inconsistent and is different from what it was before 4.13, so in
order to restore the previous behavior, make it report the current
CPU frequency like the scaling_cur_freq sysfs file in cpufreq.
To that end, modify the /proc/cpuinfo implementation on x86 to use
aperfmperf_snapshot_khz() to snapshot the APERF and MPERF feedback
registers, if available, and use their values to compute the CPU
frequency to be reported as "cpu MHz".
However, do that carefully enough to avoid accumulating delays that
lead to unacceptable access times for /proc/cpuinfo on systems with
many CPUs. Run aperfmperf_snapshot_khz() once on all CPUs
asynchronously at the /proc/cpuinfo open time, add a single delay
upfront (if necessary) at that point and simply compute the current
frequency while running show_cpuinfo() for each individual CPU.
Also, to avoid slowing down /proc/cpuinfo accesses too much, reduce
the default delay between consecutive APERF and MPERF reads to 10 ms,
which should be sufficient to get large enough numbers for the
frequency computation in all cases.
Fixes: 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
2017-11-15 02:13:40 +01:00
return 0 ;
2022-02-07 16:59:06 +01:00
if ( ! housekeeping_cpu ( cpu , HK_TYPE_MISC ) )
2019-05-15 09:59:00 +03:00
return 0 ;
2022-04-15 21:19:50 +02:00
if ( rcu_is_idle_cpu ( cpu ) )
return 0 ;
x86 / CPU: Always show current CPU frequency in /proc/cpuinfo
After commit 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get()
for /proc/cpuinfo "cpu MHz"") the "cpu MHz" number in /proc/cpuinfo
on x86 can be either the nominal CPU frequency (which is constant)
or the frequency most recently requested by a scaling governor in
cpufreq, depending on the cpufreq configuration. That is somewhat
inconsistent and is different from what it was before 4.13, so in
order to restore the previous behavior, make it report the current
CPU frequency like the scaling_cur_freq sysfs file in cpufreq.
To that end, modify the /proc/cpuinfo implementation on x86 to use
aperfmperf_snapshot_khz() to snapshot the APERF and MPERF feedback
registers, if available, and use their values to compute the CPU
frequency to be reported as "cpu MHz".
However, do that carefully enough to avoid accumulating delays that
lead to unacceptable access times for /proc/cpuinfo on systems with
many CPUs. Run aperfmperf_snapshot_khz() once on all CPUs
asynchronously at the /proc/cpuinfo open time, add a single delay
upfront (if necessary) at that point and simply compute the current
frequency while running show_cpuinfo() for each individual CPU.
Also, to avoid slowing down /proc/cpuinfo accesses too much, reduce
the default delay between consecutive APERF and MPERF reads to 10 ms,
which should be sufficient to get large enough numbers for the
frequency computation in all cases.
Fixes: 890da9cf0983 (Revert "x86: do not use cpufreq_quick_get() for /proc/cpuinfo "cpu MHz"")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
2017-11-15 02:13:40 +01:00
if ( aperfmperf_snapshot_cpu ( cpu , ktime_get ( ) , true ) )
return per_cpu ( samples . khz , cpu ) ;
2017-07-28 14:45:03 +02:00
msleep ( APERFMPERF_REFRESH_DELAY_MS ) ;
x86/cpu: Avoid cpuinfo-induced IPI pileups
The aperfmperf_snapshot_cpu() function is invoked upon access to
/proc/cpuinfo, and it does do an early exit if the specified CPU has
recently done a snapshot. Unfortunately, the indication that a snapshot
has been completed is set in an IPI handler, and the execution of this
handler can be delayed by any number of unfortunate events. This means
that a system that starts a number of applications, each of which
parses /proc/cpuinfo, can suffer from an smp_call_function_single()
storm, especially given that each access to /proc/cpuinfo invokes
smp_call_function_single() for all CPUs. Please note that this is not
theoretical speculation. Note also that one CPU's pending IPI serves
all requests, so there is no point in ever having more than one IPI
pending to a given CPU.
This commit therefore suppresses duplicate IPIs to a given CPU via a
new ->scfpending field in the aperfmperf_sample structure. This field
is set to the value one if an IPI is pending to the corresponding CPU
and to zero otherwise.
The aperfmperf_snapshot_cpu() function uses atomic_xchg() to set this
field to the value one and sample the old value. If this function's
"wait" parameter is zero, smp_call_function_single() is called only if
the old value of the ->scfpending field was zero. The IPI handler uses
atomic_set_release() to set this new field to zero just before returning,
so that the prior stores into the aperfmperf_sample structure are seen
by future requests that get to the atomic_xchg(). Future requests that
pass the elapsed-time check are ordered by the fact that on x86 loads act
as acquire loads, just as was the case prior to this change. The return
value is based off of the age of the prior snapshot, just as before.
Reported-by: Dave Jones <davej@codemonkey.org.uk>
[ paulmck: Allow /proc/cpuinfo to take advantage of arch_freq_get_on_cpu(). ]
[ paulmck: Add comment on memory barrier. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <x86@kernel.org>
2020-09-02 13:19:12 -07:00
atomic_set ( & s - > scfpending , 1 ) ;
smp_mb ( ) ; /* ->scfpending before smp_call_function_single(). */
2017-07-28 14:45:03 +02:00
smp_call_function_single ( cpu , aperfmperf_snapshot_khz , NULL , 1 ) ;
2017-06-23 22:11:52 -07:00
return per_cpu ( samples . khz , cpu ) ;
}
2022-04-15 21:19:51 +02:00
2022-04-15 21:19:59 +02:00
/*
 * Record the current APERF and MPERF counter values of the executing CPU in
 * its cpu_samples slot, so that later deltas are taken against a fresh
 * reference.
 */
static void init_counter_refs(void)
{
	u64 aperf_now;
	u64 mperf_now;

	rdmsrl(MSR_IA32_APERF, aperf_now);
	rdmsrl(MSR_IA32_MPERF, mperf_now);

	this_cpu_write(cpu_samples.aperf, aperf_now);
	this_cpu_write(cpu_samples.mperf, mperf_now);
}
2022-04-15 21:19:51 +02:00
# if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a micro-controller and
 * our P-state setting is little more than a request/hint, we need to observe
 * the effective frequency 'BusyMHz', i.e. the average frequency over a time
 * interval after discarding idle time. This is given by:
 *
 *   BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
 * freq_curr/freq_max eventually grow >1, in which case we clip it to 1.
 */
/* Static key, off by default; freq_invariance_enable() switches it on. */
DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

/* freq_max / freq_base with turbo available, scaled by SCHED_CAPACITY_SCALE. */
static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;

/* Ratio currently in effect; see arch_set_max_freq_ratio(). */
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
/*
 * Select the max frequency ratio used for frequency invariance.
 *
 * @turbo_disabled: when true the ratio falls back to SCHED_CAPACITY_SCALE,
 *                  otherwise the stored turbo ratio is used.
 */
void arch_set_max_freq_ratio(bool turbo_disabled)
{
	if (turbo_disabled)
		arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
	else
		arch_max_freq_ratio = arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
2022-04-15 21:19:54 +02:00
/*
 * Report whether the turbo-disable bit is set in MSR_IA32_MISC_ENABLE.
 * An unreadable MSR is treated as "turbo not disabled".
 */
static bool __init turbo_disabled(void)
{
	u64 misc_en;

	if (rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en))
		return false;

	return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE) != 0;
}
2022-04-15 21:19:54 +02:00
/*
 * Read base and turbo ratios from the Atom core ratio MSRs.
 *
 * @base_freq:  receives the max P state (bits 16-21 of MSR_ATOM_CORE_RATIOS)
 * @turbo_freq: receives the 1C turbo ratio (low 6 bits of
 *              MSR_ATOM_CORE_TURBO_RATIOS)
 *
 * Returns false if either MSR cannot be read; the outputs must not be relied
 * upon in that case.
 */
static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	if (rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq) ||
	    rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq))
		return false;

	*base_freq = (*base_freq >> 16) & 0x3F;	/* max P state */
	*turbo_freq &= 0x3F;			/* 1C turbo */

	return true;
}
/*
 * Table entry matching an Intel family 6 @model that has
 * X86_FEATURE_APERFMPERF; no driver data is attached.
 */
#define X86_MATCH(model)					\
	X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6,		\
		INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
2022-04-15 21:19:54 +02:00
/* Xeon Phi models whose turbo ratios are decoded by knl_set_max_freq_ratio(). */
static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
	X86_MATCH(XEON_PHI_KNL),
	X86_MATCH(XEON_PHI_KNM),
	{}
};
2022-04-15 21:19:54 +02:00
/* Models decoded by skx_set_max_freq_ratio() with a 4-core group size. */
static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
	X86_MATCH(SKYLAKE_X),
	{}
};
2022-04-15 21:19:54 +02:00
/* Goldmont Atoms, decoded by skx_set_max_freq_ratio() with a group size of 1. */
static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
	X86_MATCH(ATOM_GOLDMONT),
	X86_MATCH(ATOM_GOLDMONT_D),
	X86_MATCH(ATOM_GOLDMONT_PLUS),
	{}
};
2022-04-15 21:19:54 +02:00
/*
 * Derive base and turbo ratios from the delta-encoded turbo ratio limits.
 *
 * @base_freq:        receives the max P state (bits 8-15 of MSR_PLATFORM_INFO)
 * @turbo_freq:       receives the turbo ratio reached after @num_delta_fratio
 *                    non-zero delta steps
 * @num_delta_fratio: number of non-zero deltas to subtract from the ratio
 *                    held in bits 8-15 of MSR_TURBO_RATIO_LIMIT
 *
 * The deltas are 3-bit fields at bits 21-23, 29-31, ... 61-63 of
 * MSR_TURBO_RATIO_LIMIT.
 *
 * Returns false if an MSR cannot be read, true otherwise. On success
 * *turbo_freq is always written: if the MSR holds fewer non-zero deltas than
 * requested, or the last one sits in the final field, the lowest ratio
 * reached is reported rather than leaving the output untouched.
 */
static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
					  int num_delta_fratio)
{
	int fratio, delta_fratio, found = 0;
	int err, i;
	u64 msr;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	fratio = (msr >> 8) & 0xFF;

	/* Walk the delta fields until enough non-zero steps were applied. */
	for (i = 16; i < 64 && found < num_delta_fratio; i += 8) {
		delta_fratio = (msr >> (i + 5)) & 0x7;
		if (delta_fratio) {
			found++;
			fratio -= delta_fratio;
		}
	}

	/* Never report success with the output left unset. */
	*turbo_freq = fratio;
	return true;
}
2022-04-15 21:19:54 +02:00
/*
 * Derive base and turbo ratios from the grouped turbo ratio limits.
 *
 * @base_freq:  receives the max P state (bits 8-15 of MSR_PLATFORM_INFO)
 * @turbo_freq: receives the ratio byte of the first group whose count byte
 *              in MSR_TURBO_RATIO_LIMIT1 is at least @size
 * @size:       minimum group size to look for
 *
 * Returns false if an MSR cannot be read or no group is large enough.
 */
static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
	u64 ratios, counts;
	int shift;

	if (rdmsrl_safe(MSR_PLATFORM_INFO, base_freq))
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	if (rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios) ||
	    rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts))
		return false;

	/* Both MSRs hold eight byte-wide fields; scan them in parallel. */
	for (shift = 0; shift < 64; shift += 8) {
		u32 group_size = (counts >> shift) & 0xFF;

		if (group_size < size)
			continue;

		*turbo_freq = (ratios >> shift) & 0xFF;
		return true;
	}

	return false;
}
2022-04-15 21:19:54 +02:00
/*
 * Derive base and turbo ratios from MSR_PLATFORM_INFO and
 * MSR_TURBO_RATIO_LIMIT.
 *
 * @base_freq:  receives the max P state (bits 8-15 of MSR_PLATFORM_INFO)
 * @turbo_freq: receives the 4C turbo ratio (bits 24-31), or the 1C turbo
 *              ratio (bits 0-7) when the 4C field is zero
 *
 * Returns false if either MSR cannot be read.
 */
static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	u64 limits, turbo_4c;

	if (rdmsrl_safe(MSR_PLATFORM_INFO, base_freq) ||
	    rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &limits))
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	turbo_4c = (limits >> 24) & 0xFF;	/* 4C turbo */

	/* The CPU may have less than 4 cores: fall back to 1C turbo. */
	*turbo_freq = turbo_4c ? turbo_4c : (limits & 0xFF);

	return true;
}
2022-04-15 21:19:54 +02:00
/*
 * Determine arch_turbo_freq_ratio on Intel CPUs.
 *
 * The model-specific decoders are tried in order: the Atom core ratio MSRs,
 * then the Goldmont, Xeon Phi and Skylake-X layouts for matching models, and
 * finally the generic Core layout. The first decoder that succeeds supplies
 * the base and turbo ratios.
 *
 * Returns true when a non-zero turbo/base ratio (scaled by
 * SCHED_CAPACITY_SCALE) was stored and arch_max_freq_ratio was updated,
 * false otherwise.
 */
static bool __init intel_set_max_freq_ratio(void)
{
	/*
	 * Start from zero: a decoder that reports success without storing a
	 * ratio is then caught by the zero check below instead of feeding an
	 * indeterminate value into the division.
	 */
	u64 base_freq = 0, turbo_freq = 0;
	u64 turbo_ratio;

	if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
	    knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
		goto out;

	if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	return false;

out:
	/*
	 * Some hypervisors advertise X86_FEATURE_APERFMPERF
	 * but then fill all MSR's with zeroes.
	 * Some CPUs have turbo boost but don't declare any turbo ratio
	 * in MSR_TURBO_RATIO_LIMIT.
	 */
	if (!base_freq || !turbo_freq) {
		pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
		return false;
	}

	turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
	if (!turbo_ratio) {
		pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
		return false;
	}

	arch_turbo_freq_ratio = turbo_ratio;
	arch_set_max_freq_ratio(turbo_disabled());

	return true;
}
#ifdef CONFIG_PM_SLEEP
/* On resume, re-read the APERF/MPERF reference values. */
static struct syscore_ops freq_invariance_syscore_ops = {
	.resume		= init_counter_refs,
};

/* Hook the resume callback above into the syscore ops list. */
static void register_freq_invariance_syscore_ops(void)
{
	register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
/* Without CONFIG_PM_SLEEP there is no resume path to hook into. */
static inline void register_freq_invariance_syscore_ops(void)
{
}
#endif
2022-04-15 21:19:54 +02:00
/*
 * Switch frequency invariance on: enable arch_scale_freq_key, register the
 * resume hook and log the max frequency ratio in effect.
 *
 * Enabling twice is a caller bug; it triggers a one-time warning and is
 * otherwise ignored.
 */
static void freq_invariance_enable(void)
{
	if (static_branch_unlikely(&arch_scale_freq_key)) {
		WARN_ON_ONCE(1);
		return;
	}

	static_branch_enable(&arch_scale_freq_key);
	register_freq_invariance_syscore_ops();

	/* No padding around the text and nothing after the newline. */
	pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}
/*
 * Install an externally determined turbo ratio and enable frequency
 * invariance.
 *
 * @ratio:          turbo/base ratio to store in arch_turbo_freq_ratio
 * @turbo_disabled: passed on to arch_set_max_freq_ratio() to pick the ratio
 *                  that is actually applied
 */
void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
	arch_turbo_freq_ratio = ratio;

	arch_set_max_freq_ratio(turbo_disabled);

	freq_invariance_enable();
}
2022-04-15 21:19:51 +02:00
2022-04-15 21:19:59 +02:00
/*
 * Boot CPU setup of frequency invariance: only attempted on Intel CPUs, and
 * only enabled when a usable max frequency ratio could be determined.
 */
static void __init bp_init_freq_invariance(void)
{
	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
	    intel_set_max_freq_ratio())
		freq_invariance_enable();
}
/* Work callback: turn the frequency invariance static key back off. */
static void disable_freq_invariance_workfn(struct work_struct *work)
{
	static_branch_disable(&arch_scale_freq_key);
}

/* Work item that scale_freq_tick() schedules on its error path. */
static DECLARE_WORK(disable_freq_invariance_work,
		    disable_freq_invariance_workfn);
/* Per-CPU frequency scale factor; starts at SCHED_CAPACITY_SCALE. */
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
2022-04-15 21:19:57 +02:00
/*
 * Update this CPU's arch_freq_scale from one pair of counter deltas.
 *
 * @acnt: APERF delta since the previous sample
 * @mcnt: MPERF delta since the previous sample
 *
 * The scale is (acnt << 2 * SCHED_CAPACITY_SHIFT) / (mcnt *
 * arch_max_freq_ratio), clipped to SCHED_CAPACITY_SCALE. If either
 * intermediate overflows, the divisor is zero or the result is zero, the
 * sample is unusable: a warning is logged and the work item that disables
 * frequency invariance is scheduled.
 *
 * Does nothing unless arch_scale_freq_invariant() reports true.
 */
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
	u64 freq_scale;

	if (!arch_scale_freq_invariant())
		return;

	if (check_shl_overflow(acnt, 2 * SCHED_CAPACITY_SHIFT, &acnt))
		goto error;

	if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
		goto error;

	freq_scale = div64_u64(acnt, mcnt);
	if (!freq_scale)
		goto error;

	/* freq_curr / freq_max may exceed 1; clip it. */
	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	this_cpu_write(arch_freq_scale, freq_scale);
	return;

error:
	/* No padding around the text and nothing after the newline. */
	pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
	schedule_work(&disable_freq_invariance_work);
}
2022-04-15 21:19:59 +02:00
# else
/* Frequency invariance is compiled out: both entry points are no-ops. */
static inline void bp_init_freq_invariance(void)
{
}

static inline void scale_freq_tick(u64 acnt, u64 mcnt)
{
}
# endif /* CONFIG_X86_64 && CONFIG_SMP */
2022-04-15 21:19:57 +02:00
void arch_scale_freq_tick ( void )
{
struct aperfmperf * s = this_cpu_ptr ( & cpu_samples ) ;
u64 acnt , mcnt , aperf , mperf ;
2022-04-15 21:19:59 +02:00
if ( ! cpu_feature_enabled ( X86_FEATURE_APERFMPERF ) )
2022-04-15 21:19:57 +02:00
return ;
rdmsrl ( MSR_IA32_APERF , aperf ) ;
rdmsrl ( MSR_IA32_MPERF , mperf ) ;
acnt = aperf - s - > aperf ;
mcnt = mperf - s - > mperf ;
s - > aperf = aperf ;
s - > mperf = mperf ;
scale_freq_tick ( acnt , mcnt ) ;
}
2022-04-15 21:19:59 +02:00
/*
 * Early initcall for the boot CPU: when X86_FEATURE_APERFMPERF is enabled,
 * take the initial counter references and set up frequency invariance.
 * Always returns 0.
 */
static int __init bp_init_aperfmperf(void)
{
	if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) {
		init_counter_refs();
		bp_init_freq_invariance();
	}

	return 0;
}
early_initcall(bp_init_aperfmperf);
/*
 * Take the initial APERF/MPERF references on the calling CPU, provided
 * X86_FEATURE_APERFMPERF is enabled.
 */
void ap_init_aperfmperf(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return;

	init_counter_refs();
}