2013-02-06 21:02:13 +04:00
/*
2013-04-10 02:38:18 +04:00
* intel_pstate . c : Native P state management for Intel processors
2013-02-06 21:02:13 +04:00
*
* ( C ) Copyright 2012 Intel Corporation
* Author : Dirk Brandewie < dirk . j . brandewie @ intel . com >
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation ; version 2
* of the License .
*/
2016-04-05 23:28:23 +03:00
# define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2013-02-06 21:02:13 +04:00
# include <linux/kernel.h>
# include <linux/kernel_stat.h>
# include <linux/module.h>
# include <linux/ktime.h>
# include <linux/hrtimer.h>
# include <linux/tick.h>
# include <linux/slab.h>
2017-02-08 20:51:31 +03:00
# include <linux/sched/cpufreq.h>
2013-02-06 21:02:13 +04:00
# include <linux/list.h>
# include <linux/cpu.h>
# include <linux/cpufreq.h>
# include <linux/sysfs.h>
# include <linux/types.h>
# include <linux/fs.h>
# include <linux/debugfs.h>
2013-10-31 19:24:05 +04:00
# include <linux/acpi.h>
2015-06-02 12:01:38 +03:00
# include <linux/vmalloc.h>
2013-02-06 21:02:13 +04:00
# include <trace/events/power.h>
# include <asm/div64.h>
# include <asm/msr.h>
# include <asm/cpu_device_id.h>
2015-04-03 16:19:53 +03:00
# include <asm/cpufeature.h>
2016-06-03 03:19:45 +03:00
# include <asm/intel-family.h>
2013-02-06 21:02:13 +04:00
2017-08-10 02:09:16 +03:00
/* Sampling interval: 10 * NSEC_PER_MSEC (presumably 10 ms expressed in ns). */
# define INTEL_PSTATE_SAMPLING_INTERVAL (10 * NSEC_PER_MSEC)
2017-03-28 01:15:37 +03:00
2016-11-18 01:34:17 +03:00
/* NOTE(review): units are not visible in this chunk - confirm at the use site. */
# define INTEL_CPUFREQ_TRANSITION_LATENCY 20000
2017-04-11 01:20:41 +03:00
/* NOTE(review): units are not visible in this chunk - confirm at the use site. */
# define INTEL_CPUFREQ_TRANSITION_DELAY 500
2016-11-18 01:34:17 +03:00
2016-04-28 01:48:06 +03:00
# ifdef CONFIG_ACPI
# include <acpi/processor.h>
2016-11-22 23:24:00 +03:00
# include <acpi/cppc_acpi.h>
2016-04-28 01:48:06 +03:00
# endif
2014-05-29 20:32:23 +04:00
/* Number of fractional bits in the basic fixed-point format. */
# define FRAC_BITS 8
2013-02-06 21:02:13 +04:00
/* Integer -> fixed point: widen to int64_t, then shift left by FRAC_BITS. */
# define int_tofp(X) ((int64_t)(X) << FRAC_BITS)
/* Fixed point -> integer: shift right by FRAC_BITS, dropping the fraction bits. */
# define fp_toint(X) ((X) >> FRAC_BITS)
2014-05-29 20:32:23 +04:00
2016-05-11 20:09:12 +03:00
/* Extra fractional bits used by the extended fixed-point format. */
# define EXT_BITS 6
/* Total fractional bits of the extended format (EXT_BITS + FRAC_BITS). */
# define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS)
2016-11-22 03:33:20 +03:00
/* Extended fixed point -> integer: shift right by EXT_FRAC_BITS. */
# define fp_ext_toint(X) ((X) >> EXT_FRAC_BITS)
/* Integer -> extended fixed point: widen to int64_t, shift left by EXT_FRAC_BITS. */
# define int_ext_tofp(X) ((int64_t)(X) << EXT_FRAC_BITS)
2016-05-11 20:09:12 +03:00
2013-02-06 21:02:13 +04:00
static inline int32_t mul_fp ( int32_t x , int32_t y )
{
return ( ( int64_t ) x * ( int64_t ) y ) > > FRAC_BITS ;
}
intel_pstate: Fix overflow in busy_scaled due to long delay
The kernel may delay interrupts for a long time which can result in timers
being delayed. If this occurs the intel_pstate driver will crash with a
divide by zero error:
divide error: 0000 [#1] SMP
Modules linked in: btrfs zlib_deflate raid6_pq xor msdos ext4 mbcache jbd2 binfmt_misc arc4 md4 nls_utf8 cifs dns_resolver tcp_lp bnep bluetooth rfkill fuse dm_service_time iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi nf_conntrack_netbios_ns nf_conntrack_broadcast nf_conntrack_ftp ip6t_rpfilter ip6t_REJECT ipt_REJECT xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter ip_tables intel_powerclamp coretemp vfat fat kvm_intel iTCO_wdt iTCO_vendor_support ipmi_devintf sr_mod kvm crct10dif_pclmul
crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel cdc_ether lrw usbnet cdrom mii gf128mul glue_helper ablk_helper cryptd lpc_ich mfd_core pcspkr sb_edac edac_core ipmi_si ipmi_msghandler ioatdma wmi shpchp acpi_pad nfsd auth_rpcgss nfs_acl lockd uinput dm_multipath sunrpc xfs libcrc32c usb_storage sd_mod crc_t10dif crct10dif_common ixgbe mgag200 syscopyarea sysfillrect sysimgblt mdio drm_kms_helper ttm igb drm ptp pps_core dca i2c_algo_bit megaraid_sas i2c_core dm_mirror dm_region_hash dm_log dm_mod
CPU: 113 PID: 0 Comm: swapper/113 Tainted: G W -------------- 3.10.0-229.1.2.el7.x86_64 #1
Hardware name: IBM x3950 X6 -[3837AC2]-/00FN827, BIOS -[A8E112BUS-1.00]- 08/27/2014
task: ffff880fe8abe660 ti: ffff880fe8ae4000 task.ti: ffff880fe8ae4000
RIP: 0010:[<ffffffff814a9279>] [<ffffffff814a9279>] intel_pstate_timer_func+0x179/0x3d0
RSP: 0018:ffff883fff4e3db8 EFLAGS: 00010206
RAX: 0000000027100000 RBX: ffff883fe6965100 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000010 RDI: 000000002e53632d
RBP: ffff883fff4e3e20 R08: 000e6f69a5a125c0 R09: ffff883fe84ec001
R10: 0000000000000002 R11: 0000000000000005 R12: 00000000000049f5
R13: 0000000000271000 R14: 00000000000049f5 R15: 0000000000000246
FS: 0000000000000000(0000) GS:ffff883fff4e0000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f7668601000 CR3: 000000000190a000 CR4: 00000000001407e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Stack:
ffff883fff4e3e58 ffffffff81099dc1 0000000000000086 0000000000000071
ffff883fff4f3680 0000000000000071 fbdc8a965e33afee ffffffff810b69dd
ffff883fe84ec000 ffff883fe6965108 0000000000000100 ffffffff814a9100
Call Trace:
<IRQ>
[<ffffffff81099dc1>] ? run_posix_cpu_timers+0x51/0x840
[<ffffffff810b69dd>] ? trigger_load_balance+0x5d/0x200
[<ffffffff814a9100>] ? pid_param_set+0x130/0x130
[<ffffffff8107df56>] call_timer_fn+0x36/0x110
[<ffffffff814a9100>] ? pid_param_set+0x130/0x130
[<ffffffff8107fdcf>] run_timer_softirq+0x21f/0x320
[<ffffffff81077b2f>] __do_softirq+0xef/0x280
[<ffffffff816156dc>] call_softirq+0x1c/0x30
[<ffffffff81015d95>] do_softirq+0x65/0xa0
[<ffffffff81077ec5>] irq_exit+0x115/0x120
[<ffffffff81616355>] smp_apic_timer_interrupt+0x45/0x60
[<ffffffff81614a1d>] apic_timer_interrupt+0x6d/0x80
<EOI>
[<ffffffff814a9c32>] ? cpuidle_enter_state+0x52/0xc0
[<ffffffff814a9c28>] ? cpuidle_enter_state+0x48/0xc0
[<ffffffff814a9d65>] cpuidle_idle_call+0xc5/0x200
[<ffffffff8101d14e>] arch_cpu_idle+0xe/0x30
[<ffffffff810c67c1>] cpu_startup_entry+0xf1/0x290
[<ffffffff8104228a>] start_secondary+0x1ba/0x230
Code: 42 0f 00 45 89 e6 48 01 c2 43 8d 44 6d 00 39 d0 73 26 49 c1 e5 08 89 d2 4d 63 f4 49 63 c5 48 c1 e2 08 48 c1 e0 08 48 63 ca 48 99 <48> f7 f9 48 98 4c 0f af f0 49 c1 ee 08 8b 43 78 c1 e0 08 44 29
RIP [<ffffffff814a9279>] intel_pstate_timer_func+0x179/0x3d0
RSP <ffff883fff4e3db8>
The kernel values for cpudata for CPU 113 were:
struct cpudata {
cpu = 113,
timer = {
entry = {
next = 0x0,
prev = 0xdead000000200200
},
expires = 8357799745,
base = 0xffff883fe84ec001,
function = 0xffffffff814a9100 <intel_pstate_timer_func>,
data = 18446612406765768960,
<snip>
i_gain = 0,
d_gain = 0,
deadband = 0,
last_err = 22489
},
last_sample_time = {
tv64 = 4063132438017305
},
prev_aperf = 287326796397463,
prev_mperf = 251427432090198,
sample = {
core_pct_busy = 23081,
aperf = 2937407,
mperf = 3257884,
freq = 2524484,
time = {
tv64 = 4063149215234118
}
}
}
which results in the time between samples = sample.time - last_sample_time
= 4063149215234118 - 4063132438017305 = 16777216813 which is 16.777 seconds.
The duration between reads of the APERF and MPERF registers overflowed a s32
sized integer in intel_pstate_get_scaled_busy()'s call to div_fp(). The result
is that int_tofp(duration_us) == 0, and the kernel attempts to divide by 0.
While the kernel shouldn't be delaying for a long time, it can and does
happen and the intel_pstate driver should not panic in this situation. This
patch changes the div_fp() function to use div64_s64() to allow for "long"
division. This will avoid the overflow condition on long delays.
[v2]: use div64_s64() in div_fp()
Signed-off-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-15 20:43:29 +03:00
/*
 * div_fp - divide @x by @y and return the quotient in fixed-point form.
 *
 * @x is shifted left by FRAC_BITS before dividing, so the quotient carries
 * FRAC_BITS fraction bits. Both operands are 64-bit and the division is done
 * with div64_s64(): a large operand (for example a very long interval between
 * samples) is therefore not truncated to 32 bits first, which is what used to
 * turn a non-zero divisor into zero and trigger a divide error.
 *
 * No check for @y == 0 is made here. The quotient is converted to int32_t by
 * the return.
 */
static inline int32_t div_fp(s64 x, s64 y)
{
	return div64_s64((int64_t)x << FRAC_BITS, y);
}
2014-10-13 19:37:44 +04:00
static inline int ceiling_fp ( int32_t x )
{
int mask , ret ;
ret = fp_toint ( x ) ;
mask = ( 1 < < FRAC_BITS ) - 1 ;
if ( x & mask )
ret + = 1 ;
return ret ;
}
2017-03-28 01:09:18 +03:00
/*
 * percent_fp - express an integer percentage as a fixed-point fraction.
 *
 * Returns div_fp(@percent, 100), i.e. @percent / 100 with FRAC_BITS
 * fraction bits.
 */
static inline int32_t percent_fp(int percent)
{
	const int32_t fraction = div_fp(percent, 100);

	return fraction;
}
2016-05-11 20:09:12 +03:00
/*
 * mul_ext_fp - multiply two values in the extended fixed-point format.
 *
 * The 64-bit unsigned product is shifted right by EXT_FRAC_BITS to bring it
 * back to the extended fixed-point scale. The multiplication itself is done
 * in u64, so product bits above bit 63 are discarded.
 */
static inline u64 mul_ext_fp(u64 x, u64 y)
{
	return (x * y) >> EXT_FRAC_BITS;
}
/*
 * div_ext_fp - divide @x by @y, returning an extended fixed-point quotient.
 *
 * @x is shifted left by EXT_FRAC_BITS before the unsigned 64-bit division
 * performed by div64_u64(), so the quotient carries EXT_FRAC_BITS fraction
 * bits. No check for @y == 0 is made here.
 */
static inline u64 div_ext_fp(u64 x, u64 y)
{
	return div64_u64(x << EXT_FRAC_BITS, y);
}
2017-03-14 18:18:34 +03:00
/*
 * percent_ext_fp - express an integer percentage as an extended fixed-point
 * fraction.
 *
 * Returns div_ext_fp(@percent, 100) converted to int32_t.
 */
static inline int32_t percent_ext_fp(int percent)
{
	const int32_t fraction = div_ext_fp(percent, 100);

	return fraction;
}
2016-04-03 23:06:46 +03:00
/**
* struct sample - Store performance sample
2016-05-11 20:09:12 +03:00
* @ core_avg_perf : Ratio of APERF / MPERF which is the actual average
2016-04-03 23:06:46 +03:00
* performance during last sample period
* @ busy_scaled : Scaled busy value which is used to calculate next
2016-05-11 20:09:12 +03:00
* P state . This can be different than core_avg_perf
2016-04-03 23:06:46 +03:00
* to account for cpu idle period
* @ aperf : Difference of actual performance frequency clock count
* read from APERF MSR between last and current sample
* @ mperf : Difference of maximum performance frequency clock count
* read from MPERF MSR between last and current sample
* @ tsc : Difference of time stamp counter between last and
* current sample
* @ time : Current time from scheduler
*
* This structure is used in the cpudata structure to store performance sample
* data for choosing next P State .
*/
2013-02-06 21:02:13 +04:00
/* One performance sample; members are described in the kernel-doc above. */
struct sample {
2016-05-11 20:09:12 +03:00
/* APERF/MPERF ratio: actual average performance over the last sample period. */
int32_t core_avg_perf ;
2015-12-04 19:40:30 +03:00
/* Scaled busy value used to calculate the next P state. */
int32_t busy_scaled ;
2013-02-06 21:02:13 +04:00
/* APERF and MPERF MSR deltas between the last and the current sample. */
u64 aperf ;
u64 mperf ;
2015-04-12 07:10:26 +03:00
/* Time stamp counter delta between the last and the current sample. */
u64 tsc ;
2016-02-05 03:45:30 +03:00
/* Current time from the scheduler. */
u64 time ;
2013-02-06 21:02:13 +04:00
} ;
2016-04-03 23:06:46 +03:00
/**
* struct pstate_data - Store P state data
* @ current_pstate : Current requested P state
* @ min_pstate : Min P state possible for this platform
* @ max_pstate : Max P state possible for this platform
* @ max_pstate_physical : This is physical Max P state for a processor
* This can be higher than the max_pstate which can
* be limited by platform thermal design power limits
* @ scaling : Scaling factor to convert frequency to cpufreq
* frequency units
* @ turbo_pstate : Max Turbo P state possible for this platform
2016-11-18 01:34:17 +03:00
* @ max_freq : @ max_pstate frequency in cpufreq units
* @ turbo_freq : @ turbo_pstate frequency in cpufreq units
2016-04-03 23:06:46 +03:00
*
* Stores the per cpu model P state limits and current P state .
*/
2013-02-06 21:02:13 +04:00
/* Per-CPU-model P state limits and current P state (see kernel-doc above). */
struct pstate_data {
/* Currently requested P state. */
int current_pstate ;
/* Lowest and highest P state possible for this platform. */
int min_pstate ;
int max_pstate ;
2015-10-15 02:12:00 +03:00
/* Physical max P state; may be higher than max_pstate. */
int max_pstate_physical ;
2014-10-13 19:37:43 +04:00
/* Scaling factor used to convert to cpufreq frequency units. */
int scaling ;
2013-02-06 21:02:13 +04:00
/* Max turbo P state possible for this platform. */
int turbo_pstate ;
2016-11-18 01:34:17 +03:00
/* max_pstate and turbo_pstate frequencies in cpufreq units. */
unsigned int max_freq ;
unsigned int turbo_freq ;
2013-02-06 21:02:13 +04:00
} ;
2016-04-03 23:06:46 +03:00
/**
* struct vid_data - Stores voltage information data
* @ min : VID data for this platform corresponding to
* the lowest P state
* @ max : VID data corresponding to the highest P State .
* @ turbo : VID data for turbo P state
* @ ratio : Ratio of ( vid max - vid min ) /
* ( max P state - Min P State )
*
* Stores the voltage data for DVFS ( Dynamic Voltage and Frequency Scaling )
* This data is used in Atom platforms , where in addition to target P state ,
* the voltage data needs to be specified to select next P State .
*/
2013-12-18 22:32:39 +04:00
/* Voltage (VID) data used on Atom platforms (see kernel-doc above). */
struct vid_data {
2014-05-08 23:57:23 +04:00
/* VID values for the lowest, the highest and the turbo P state. */
int min ;
int max ;
int turbo ;
2013-12-18 22:32:39 +04:00
/* (vid max - vid min) / (max P state - min P state). */
int32_t ratio ;
} ;
cpufreq: intel_pstate: Active mode P-state limits rework
The coordination of P-state limits used by intel_pstate in the active
mode (ie. by default) is problematic, because it synchronizes all of
the limits (ie. the global ones and the per-policy ones) so as to use
one common pair of P-state limits (min and max) across all CPUs in
the system. The drawbacks of that are as follows:
- If P-states are coordinated in hardware, it is not necessary
to coordinate them in software on top of that, so in that case
all of the above activity is in vain.
- If P-states are not coordinated in hardware, then the processor
is actually capable of setting different P-states for different
CPUs and coordinating them at the software level simply doesn't
allow that capability to be utilized.
- The coordination works in such a way that setting a per-policy
limit (eg. scaling_max_freq) for one CPU causes the common
effective limit to change (and it will affect all of the other
CPUs too), but subsequent reads from the corresponding sysfs
attributes for the other CPUs will return stale values (which
is confusing).
- Reads from the global P-state limit attributes, min_perf_pct and
max_perf_pct, return the effective common values and not the last
values set through these attributes. However, the last values
set through these attributes become hard limits that cannot be
exceeded by writes to scaling_min_freq and scaling_max_freq,
respectively, and they are not exposed, so essentially users
have to remember what they are.
All of that is painful enough to warrant a change of the management
of P-state limits in the active mode.
To that end, redesign the active mode P-state limits management in
intel_pstate in accordance with the following rules:
(1) All CPUs are affected by the global limits (that is, none of
them can be requested to run faster than the global max and
none of them can be requested to run slower than the global
min).
(2) Each individual CPU is affected by its own per-policy limits
(that is, it cannot be requested to run faster than its own
per-policy max and it cannot be requested to run slower than
its own per-policy min).
(3) The global and per-policy limits can be set independently.
Also, the global maximum and minimum P-state limits will be always
expressed as percentages of the maximum supported turbo P-state.
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-23 01:58:57 +03:00
/**
* struct global_params - Global parameters , mostly tunable via sysfs .
* @ no_turbo : Whether or not to use turbo P - states .
* @ turbo_disabled : Whether or not turbo P - states are available at all ,
* based on the MSR_IA32_MISC_ENABLE value and whether or
* not the maximum reported turbo P - state is different from
* the maximum reported non - turbo one .
* @ min_perf_pct : Minimum capacity limit in percent of the maximum turbo
* P - state capacity .
* @ max_perf_pct : Maximum capacity limit in percent of the maximum turbo
* P - state capacity .
*/
/* Global parameters, mostly tunable via sysfs (see kernel-doc above). */
struct global_params {
/* Whether or not to use turbo P-states. */
bool no_turbo ;
/* Whether or not turbo P-states are available at all. */
bool turbo_disabled ;
/* Capacity limits in percent of the maximum turbo P-state capacity. */
int max_perf_pct ;
int min_perf_pct ;
2016-10-25 23:20:40 +03:00
} ;
2016-04-03 23:06:46 +03:00
/**
* struct cpudata - Per CPU instance data storage
* @ cpu : CPU number for this instance data
2016-10-25 00:20:25 +03:00
* @ policy : CPUFreq policy value
2016-04-03 23:06:46 +03:00
* @ update_util : CPUFreq utility callback information
2016-05-11 09:33:08 +03:00
* @ update_util_set : CPUFreq utility callback is set
2016-09-14 03:28:13 +03:00
* @ iowait_boost : iowait - related boost fraction
* @ last_update : Time of the last update .
2016-04-03 23:06:46 +03:00
* @ pstate : Stores P state limits for this CPU
* @ vid : Stores VID limits for this CPU
* @ last_sample_time : Last Sample time
2017-07-14 01:03:51 +03:00
* @ aperf_mperf_shift : Number of clock cycles after aperf , mperf is incremented
* This shift is a multiplier to mperf delta to
* calculate CPU busy .
2016-04-03 23:06:46 +03:00
* @ prev_aperf : Last APERF value read from APERF MSR
* @ prev_mperf : Last MPERF value read from MPERF MSR
* @ prev_tsc : Last timestamp counter ( TSC ) value
* @ prev_cummulative_iowait : IO Wait time difference from last and
* current sample
* @ sample : Storage for storing last Sample data
2017-06-13 02:30:27 +03:00
* @ min_perf_ratio : Minimum capacity in terms of PERF or HWP ratios
* @ max_perf_ratio : Maximum capacity in terms of PERF or HWP ratios
2016-04-28 01:48:06 +03:00
* @ acpi_perf_data : Stores ACPI perf information read from _PSS
* @ valid_pss_table : Set to true for valid ACPI _PSS entries found
2016-12-07 00:32:16 +03:00
* @ epp_powersave : Last saved HWP energy performance preference
* ( EPP ) or energy performance bias ( EPB ) ,
* when policy switched to performance
2016-11-25 03:07:10 +03:00
* @ epp_policy : Last saved policy used to set EPP / EPB
2016-12-07 00:32:16 +03:00
* @ epp_default : Power on default HWP energy performance
* preference / bias
* @ epp_saved : Saved EPP / EPB during system suspend or CPU offline
* operation
2016-04-03 23:06:46 +03:00
*
* This structure stores per CPU instance data for all CPUs .
*/
2013-02-06 21:02:13 +04:00
/* Per-CPU instance data; members are described in the kernel-doc above. */
struct cpudata {
/* CPU number for this instance. */
int cpu ;
2016-10-25 00:20:25 +03:00
/* CPUFreq policy value. */
unsigned int policy ;
2016-02-05 03:45:30 +03:00
/* CPUFreq utility callback information and whether the callback is set. */
struct update_util_data update_util ;
2016-05-11 09:33:08 +03:00
bool update_util_set ;
2013-02-06 21:02:13 +04:00
/* P state limits and VID limits for this CPU. */
struct pstate_data pstate ;
2013-12-18 22:32:39 +04:00
struct vid_data vid ;
2013-02-06 21:02:13 +04:00
2016-09-14 03:28:13 +03:00
/* Time of the last update and of the last sample. */
u64 last_update ;
2016-02-05 03:45:30 +03:00
u64 last_sample_time ;
2017-07-14 01:03:51 +03:00
/* Shift applied to the MPERF delta when calculating CPU busy. */
u64 aperf_mperf_shift ;
2013-02-06 21:02:13 +04:00
/* Previous raw APERF / MPERF / TSC readings. */
u64 prev_aperf ;
u64 prev_mperf ;
2015-04-12 07:10:26 +03:00
u64 prev_tsc ;
2015-12-04 19:40:35 +03:00
/* IO wait time difference between the last and the current sample. */
u64 prev_cummulative_iowait ;
2014-02-12 22:01:04 +04:00
/* Storage for the last sample data. */
struct sample sample ;
2017-06-13 02:30:27 +03:00
/* Min / max capacity in terms of PERF or HWP ratios. */
int32_t min_perf_ratio ;
int32_t max_perf_ratio ;
2016-04-28 01:48:06 +03:00
/* ACPI _PSS data; only present when CONFIG_ACPI is set. */
# ifdef CONFIG_ACPI
struct acpi_processor_performance acpi_perf_data ;
bool valid_pss_table ;
# endif
2016-09-14 03:28:13 +03:00
/* iowait-related boost fraction. */
unsigned int iowait_boost ;
2016-12-07 00:32:16 +03:00
/* Saved HWP energy performance preference (EPP) / bias (EPB) values. */
s16 epp_powersave ;
2016-11-25 03:07:10 +03:00
s16 epp_policy ;
2016-12-07 00:32:16 +03:00
s16 epp_default ;
s16 epp_saved ;
2013-02-06 21:02:13 +04:00
} ;
/* Table of per-CPU data pointers, indexed by CPU number (e.g. policy->cpu). */
static struct cpudata * * all_cpu_data ;
2016-04-03 23:06:46 +03:00
/**
 * struct pstate_funcs - Per CPU model specific callbacks
 * @get_max:		Callback to get maximum non turbo effective P state
 * @get_max_physical:	Callback to get maximum non turbo physical P state
 * @get_min:		Callback to get minimum P state
 * @get_turbo:		Callback to get turbo P state
 * @get_scaling:	Callback to get frequency scaling factor
 * @get_aperf_mperf_shift: Callback returning an int. NOTE(review): presumably
 *			the shift stored in cpudata.aperf_mperf_shift - confirm
 *			against the callers
 * @get_val:		Callback to convert P state to actual MSR write value
 * @get_vid:		Callback to get VID data for Atom platforms
 *
 * Core and Atom CPU models have different ways to get P State limits. This
 * structure is used to store those callbacks.
 */
2013-10-21 20:20:34 +04:00
struct pstate_funcs {
int ( * get_max ) ( void ) ;
2015-10-15 02:12:00 +03:00
int ( * get_max_physical ) ( void ) ;
2013-10-21 20:20:34 +04:00
int ( * get_min ) ( void ) ;
int ( * get_turbo ) ( void ) ;
2014-10-13 19:37:43 +04:00
int ( * get_scaling ) ( void ) ;
2017-07-14 01:03:51 +03:00
int ( * get_aperf_mperf_shift ) ( void ) ;
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts
After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with
utilization update callbacks) wrmsrl_on_cpu() cannot be called in the
intel_pstate_adjust_busy_pstate() path as that is executed with
disabled interrupts. However, atom_set_pstate() called from there
via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the
IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in
smp_call_function_single().
The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is
because intel_pstate_set_pstate() calling it is also invoked during
the initialization and cleanup of the driver and in those cases it is
not guaranteed to be run on the CPU that is being updated. However,
in the case when intel_pstate_set_pstate() is called by
intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update
the register safely. Moreover, intel_pstate_set_pstate() already
contains code that only is executed if the function is called by
intel_pstate_adjust_busy_pstate() and there is a special argument
passed to it because of that.
To fix the problem at hand, rearrange the code taking the above
observations into account.
First, replace the ->set() callback in struct pstate_funcs with a
->get_val() one that will return the value to be written to the
IA32_PERF_CTL MSR without updating the register.
Second, split intel_pstate_set_pstate() into two functions,
intel_pstate_update_pstate() to be called by
intel_pstate_adjust_busy_pstate() that will contain all of the
intel_pstate_set_pstate() code which only needs to be executed in
that case and will use wrmsrl() to update the MSR (after obtaining
the value to write to it from the ->get_val() callback), and
intel_pstate_set_min_pstate() to be invoked during the
initialization and cleanup that will set the P-state to the
minimum one and will update the MSR using wrmsrl_on_cpu().
Finally, move the code shared between intel_pstate_update_pstate()
and intel_pstate_set_min_pstate() to a new static inline function
intel_pstate_record_pstate() and make them both call it.
Of course, that unifies the handling of the IA32_PERF_CTL MSR writes
between Atom and Core.
Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks)
Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 01:20:02 +03:00
/* Returns the value to be written to the MSR for the given P state. */
u64 ( * get_val ) ( struct cpudata * , int pstate ) ;
2013-12-18 22:32:39 +04:00
void ( * get_vid ) ( struct cpudata * ) ;
2013-02-06 21:02:13 +04:00
} ;
2016-06-27 13:07:18 +03:00
/* File-scope instance of the CPU model specific callback table. */
static struct pstate_funcs pstate_funcs __read_mostly ;
2017-03-28 01:05:44 +03:00
2016-06-27 13:07:18 +03:00
/*
 * When non-zero, intel_pstate_init_acpi_perf_limits() only sets the ITMT
 * priority for the CPU and returns.
 */
static int hwp_active __read_mostly ;
2016-10-25 23:20:40 +03:00
/* NOTE(review): usage is not visible in this chunk - confirm at the use site. */
static bool per_cpu_limits __read_mostly ;
2013-10-21 20:20:34 +04:00
2017-03-28 01:13:00 +03:00
/* NOTE(review): presumably the cpufreq driver currently in use - confirm. */
static struct cpufreq_driver * intel_pstate_driver __read_mostly ;
2017-01-11 06:12:16 +03:00
2016-04-28 01:48:06 +03:00
# ifdef CONFIG_ACPI
/*
 * Returned by intel_pstate_get_ppc_enable_status() when the FADT preferred
 * profile is not one of the server profiles checked there.
 */
static bool acpi_ppc ;
# endif
2016-04-03 23:06:46 +03:00
cpufreq: intel_pstate: Active mode P-state limits rework
The coordination of P-state limits used by intel_pstate in the active
mode (ie. by default) is problematic, because it synchronizes all of
the limits (ie. the global ones and the per-policy ones) so as to use
one common pair of P-state limits (min and max) across all CPUs in
the system. The drawbacks of that are as follows:
- If P-states are coordinated in hardware, it is not necessary
to coordinate them in software on top of that, so in that case
all of the above activity is in vain.
- If P-states are not coordinated in hardware, then the processor
is actually capable of setting different P-states for different
CPUs and coordinating them at the software level simply doesn't
allow that capability to be utilized.
- The coordination works in such a way that setting a per-policy
limit (eg. scaling_max_freq) for one CPU causes the common
effective limit to change (and it will affect all of the other
CPUs too), but subsequent reads from the corresponding sysfs
attributes for the other CPUs will return stale values (which
is confusing).
- Reads from the global P-state limit attributes, min_perf_pct and
max_perf_pct, return the effective common values and not the last
values set through these attributes. However, the last values
set through these attributes become hard limits that cannot be
exceeded by writes to scaling_min_freq and scaling_max_freq,
respectively, and they are not exposed, so essentially users
have to remember what they are.
All of that is painful enough to warrant a change of the management
of P-state limits in the active mode.
To that end, redesign the active mode P-state limits management in
intel_pstate in accordance with the following rules:
(1) All CPUs are affected by the global limits (that is, none of
them can be requested to run faster than the global max and
none of them can be requested to run slower than the global
min).
(2) Each individual CPU is affected by its own per-policy limits
(that is, it cannot be requested to run faster than its own
per-policy max and it cannot be requested to run slower than
its own per-policy min).
(3) The global and per-policy limits can be set independently.
Also, the global maximum and minimum P-state limits will be always
expressed as percentages of the maximum supported turbo P-state.
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-23 01:58:57 +03:00
/* The single instance of the global parameters (see struct global_params). */
static struct global_params global ;
2013-02-06 21:02:13 +04:00
2017-01-11 06:12:16 +03:00
/* NOTE(review): what each mutex protects is not visible here - confirm at the lock sites. */
static DEFINE_MUTEX ( intel_pstate_driver_lock ) ;
2016-10-28 20:44:52 +03:00
static DEFINE_MUTEX ( intel_pstate_limits_lock ) ;
2016-04-28 01:48:06 +03:00
# ifdef CONFIG_ACPI
2016-04-28 01:48:08 +03:00
static bool intel_pstate_get_ppc_enable_status ( void )
{
if ( acpi_gbl_FADT . preferred_profile = = PM_ENTERPRISE_SERVER | |
acpi_gbl_FADT . preferred_profile = = PM_PERFORMANCE_SERVER )
return true ;
return acpi_ppc ;
}
2016-11-22 23:24:00 +03:00
# ifdef CONFIG_ACPI_CPPC_LIB
/*
 * Work handler that calls sched_set_itmt_support(). The call is made from a
 * work item in order to avoid CPU hotplug locking issues.
 */
static void intel_pstste_sched_itmt_work_fn(struct work_struct *work)
{
	sched_set_itmt_support();
}

static DECLARE_WORK(sched_itmt_work, intel_pstste_sched_itmt_work_fn);
/*
 * intel_pstate_set_itmt_prio - set the ITMT core priority of @cpu from CPPC.
 *
 * Reads the CPPC performance capabilities of @cpu and passes its highest_perf
 * value to sched_set_itmt_core_prio(). Nothing is done if the capabilities
 * cannot be read.
 *
 * The function-local statics track the largest and smallest highest_perf
 * seen across calls. They start as max = 0 and min = U32_MAX, so the tracking
 * block runs until two CPUs with different highest_perf values have been
 * seen; at that point sched_itmt_work is queued and, because max now exceeds
 * min, later calls skip the block.
 */
static void intel_pstate_set_itmt_prio(int cpu)
{
	struct cppc_perf_caps cppc_perf;
	static u32 max_highest_perf = 0, min_highest_perf = U32_MAX;
	int ret;

	ret = cppc_get_perf_caps(cpu, &cppc_perf);
	if (ret)
		return;

	/*
	 * The priorities can be set regardless of whether or not
	 * sched_set_itmt_support() has been called and it is valid to
	 * update them at any time after it has been called.
	 */
	sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu);

	if (max_highest_perf <= min_highest_perf) {
		if (cppc_perf.highest_perf > max_highest_perf)
			max_highest_perf = cppc_perf.highest_perf;

		if (cppc_perf.highest_perf < min_highest_perf)
			min_highest_perf = cppc_perf.highest_perf;

		if (max_highest_perf > min_highest_perf) {
			/*
			 * This code can be run during CPU online under the
			 * CPU hotplug locks, so sched_set_itmt_support()
			 * cannot be called from here.  Queue up a work item
			 * to invoke it.
			 */
			schedule_work(&sched_itmt_work);
		}
	}
}
# else
/* No-op variant used when CONFIG_ACPI_CPPC_LIB is not set. */
static void intel_pstate_set_itmt_prio(int cpu)
{
}
# endif
2016-04-28 01:48:06 +03:00
/*
 * Register ACPI performance data (_PSS) for policy->cpu and validate it.
 *
 * With HWP active only the ITMT priority of the CPU is set and the function
 * returns.  Otherwise, if the PPC enable status is true, the ACPI performance
 * data are registered, checked and dumped with pr_debug().  On success
 * cpu->valid_pss_table is set to true; on a failed check it is set to false
 * and the registration is undone.
 */
static void intel_pstate_init_acpi_perf_limits ( struct cpufreq_policy * policy )
{
struct cpudata * cpu ;
int ret ;
int i ;
2016-11-22 23:24:00 +03:00
/* HWP case: set the ITMT priority and skip the _PSS handling below. */
if ( hwp_active ) {
intel_pstate_set_itmt_prio ( policy - > cpu ) ;
2016-05-05 01:07:34 +03:00
return ;
2016-11-22 23:24:00 +03:00
}
2016-05-05 01:07:34 +03:00
2016-04-28 01:48:08 +03:00
if ( ! intel_pstate_get_ppc_enable_status ( ) )
2016-04-28 01:48:06 +03:00
return ;
cpu = all_cpu_data [ policy - > cpu ] ;
ret = acpi_processor_register_performance ( & cpu - > acpi_perf_data ,
policy - > cpu ) ;
/* Registration failed: nothing to undo, valid_pss_table is left untouched. */
if ( ret )
return ;
/*
* Check if the control value in _PSS is for PERF_CTL MSR , which should
* guarantee that the states returned by it map to the states in our
* list directly .
*/
if ( cpu - > acpi_perf_data . control_register . space_id ! =
ACPI_ADR_SPACE_FIXED_HARDWARE )
goto err ;
/*
* If there is only one entry _PSS , simply ignore _PSS and continue as
* usual without taking _PSS into account
*/
if ( cpu - > acpi_perf_data . state_count < 2 )
goto err ;
pr_debug ( " CPU%u - ACPI _PSS perf data \n " , policy - > cpu ) ;
for ( i = 0 ; i < cpu - > acpi_perf_data . state_count ; i + + ) {
pr_debug ( " %cP%d: %u MHz, %u mW, 0x%x \n " ,
( i = = cpu - > acpi_perf_data . state ? ' * ' : ' ' ) , i ,
( u32 ) cpu - > acpi_perf_data . states [ i ] . core_frequency ,
( u32 ) cpu - > acpi_perf_data . states [ i ] . power ,
( u32 ) cpu - > acpi_perf_data . states [ i ] . control ) ;
}
/*
* The _PSS table doesn ' t contain whole turbo frequency range .
* This just contains + 1 MHZ above the max non turbo frequency ,
* with control value corresponding to max turbo ratio . But
* when cpufreq set policy is called , it will call with this
* max frequency , which will cause a reduced performance as
* this driver uses real max turbo frequency as the max
* frequency . So correct this frequency in _PSS table to
2016-06-15 09:12:59 +03:00
* correct max turbo frequency based on the turbo state .
2016-04-28 01:48:06 +03:00
* Also need to convert to MHz as _PSS freq is in MHz .
*/
cpufreq: intel_pstate: One set of global limits in active mode
In the active mode intel_pstate currently uses two sets of global
limits, each associated with one of the possible scaling_governor
settings in that mode: "powersave" or "performance".
The driver switches over from one of those sets to the other
depending on the scaling_governor setting for the last CPU whose
per-policy cpufreq interface in sysfs was last used to change
parameters exposed in there. That obviously leads to no end of
issues when the scaling_governor settings differ between CPUs.
The most recent issue was introduced by commit a240c4aa5d0f (cpufreq:
intel_pstate: Do not reinit performance limits in ->setpolicy)
that eliminated the reinitialization of "performance" limits in
intel_pstate_set_policy() preventing the max limit from being set
to anything below 100, among other things.
Namely, an undesirable side effect of commit a240c4aa5d0f is that
now, after setting scaling_governor to "performance" in the active
mode, the per-policy limits for the CPU in question go to the highest
level and stay there even when it is switched back to "powersave"
later.
As it turns out, some distributions set scaling_governor to
"performance" temporarily for all CPUs to speed-up system
initialization, so that change causes them to misbehave later.
To fix that, get rid of the performance/powersave global limits
split and use just one set of global limits for everything.
From the user's persepctive, after this modification, when
scaling_governor is switched from "performance" to "powersave"
or the other way around on one CPU, the limits settings (ie. the
global max/min_perf_pct and per-policy scaling_max/min_freq for
any CPUs) will not change. Still, switching from "performance"
to "powersave" or the other way around changes the way in which
P-states are selected and in particular "performance" causes the
driver to always request the highest P-state it is allowed to ask
for for the given CPU.
Fixes: a240c4aa5d0f (cpufreq: intel_pstate: Do not reinit performance limits in ->setpolicy)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-18 02:57:39 +03:00
/* Only entry 0 is rewritten, and only while turbo is not disabled. */
if ( ! global . turbo_disabled )
2016-04-28 01:48:06 +03:00
cpu - > acpi_perf_data . states [ 0 ] . core_frequency =
policy - > cpuinfo . max_freq / 1000 ;
cpu - > valid_pss_table = true ;
2016-05-30 09:31:23 +03:00
pr_debug ( " _PPC limits will be enforced \n " ) ;
2016-04-28 01:48:06 +03:00
return ;
/* Failed validation: mark the table unusable and drop the registration. */
err :
cpu - > valid_pss_table = false ;
acpi_processor_unregister_performance ( policy - > cpu ) ;
}
static void intel_pstate_exit_perf_limits ( struct cpufreq_policy * policy )
{
struct cpudata * cpu ;
cpu = all_cpu_data [ policy - > cpu ] ;
if ( ! cpu - > valid_pss_table )
return ;
acpi_processor_unregister_performance ( policy - > cpu ) ;
}
# else
2016-11-25 19:50:20 +03:00
/* No-op variant used when CONFIG_ACPI is not defined. */
static inline void intel_pstate_init_acpi_perf_limits ( struct cpufreq_policy * policy )
2016-04-28 01:48:06 +03:00
{
}
2016-11-25 19:50:20 +03:00
/* No-op variant used when CONFIG_ACPI is not defined. */
static inline void intel_pstate_exit_perf_limits ( struct cpufreq_policy * policy )
2016-04-28 01:48:06 +03:00
{
}
# endif
2014-10-13 19:37:41 +04:00
static inline void update_turbo_state ( void )
{
u64 misc_en ;
struct cpudata * cpu ;
cpu = all_cpu_data [ 0 ] ;
rdmsrl ( MSR_IA32_MISC_ENABLE , misc_en ) ;
cpufreq: intel_pstate: One set of global limits in active mode
In the active mode intel_pstate currently uses two sets of global
limits, each associated with one of the possible scaling_governor
settings in that mode: "powersave" or "performance".
The driver switches over from one of those sets to the other
depending on the scaling_governor setting for the last CPU whose
per-policy cpufreq interface in sysfs was last used to change
parameters exposed in there. That obviously leads to no end of
issues when the scaling_governor settings differ between CPUs.
The most recent issue was introduced by commit a240c4aa5d0f (cpufreq:
intel_pstate: Do not reinit performance limits in ->setpolicy)
that eliminated the reinitialization of "performance" limits in
intel_pstate_set_policy() preventing the max limit from being set
to anything below 100, among other things.
Namely, an undesirable side effect of commit a240c4aa5d0f is that
now, after setting scaling_governor to "performance" in the active
mode, the per-policy limits for the CPU in question go to the highest
level and stay there even when it is switched back to "powersave"
later.
As it turns out, some distributions set scaling_governor to
"performance" temporarily for all CPUs to speed-up system
initialization, so that change causes them to misbehave later.
To fix that, get rid of the performance/powersave global limits
split and use just one set of global limits for everything.
From the user's persepctive, after this modification, when
scaling_governor is switched from "performance" to "powersave"
or the other way around on one CPU, the limits settings (ie. the
global max/min_perf_pct and per-policy scaling_max/min_freq for
any CPUs) will not change. Still, switching from "performance"
to "powersave" or the other way around changes the way in which
P-states are selected and in particular "performance" causes the
driver to always request the highest P-state it is allowed to ask
for for the given CPU.
Fixes: a240c4aa5d0f (cpufreq: intel_pstate: Do not reinit performance limits in ->setpolicy)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-18 02:57:39 +03:00
global . turbo_disabled =
2014-10-13 19:37:41 +04:00
( misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE | |
cpu - > pstate . max_pstate = = cpu - > pstate . turbo_pstate ) ;
}
cpufreq: intel_pstate: Active mode P-state limits rework
The coordination of P-state limits used by intel_pstate in the active
mode (ie. by default) is problematic, because it synchronizes all of
the limits (ie. the global ones and the per-policy ones) so as to use
one common pair of P-state limits (min and max) across all CPUs in
the system. The drawbacks of that are as follows:
- If P-states are coordinated in hardware, it is not necessary
to coordinate them in software on top of that, so in that case
all of the above activity is in vain.
- If P-states are not coordinated in hardware, then the processor
is actually capable of setting different P-states for different
CPUs and coordinating them at the software level simply doesn't
allow that capability to be utilized.
- The coordination works in such a way that setting a per-policy
limit (eg. scaling_max_freq) for one CPU causes the common
effective limit to change (and it will affect all of the other
CPUs too), but subsequent reads from the corresponding sysfs
attributes for the other CPUs will return stale values (which
is confusing).
- Reads from the global P-state limit attributes, min_perf_pct and
max_perf_pct, return the effective common values and not the last
values set through these attributes. However, the last values
set through these attributes become hard limits that cannot be
exceeded by writes to scaling_min_freq and scaling_max_freq,
respectively, and they are not exposed, so essentially users
have to remember what they are.
All of that is painful enough to warrant a change of the management
of P-state limits in the active mode.
To that end, redesign the active mode P-state limits management in
intel_pstate in accordance with the following rules:
(1) All CPUs are affected by the global limits (that is, none of
them can be requested to run faster than the global max and
none of them can be requested to run slower than the global
min).
(2) Each individual CPU is affected by its own per-policy limits
(that is, it cannot be requested to run faster than its own
per-policy max and it cannot be requested to run slower than
its own per-policy min).
(3) The global and per-policy limits can be set independently.
Also, the global maximum and minimum P-state limits will be always
expressed as percentages of the maximum supported turbo P-state.
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-23 01:58:57 +03:00
static int min_perf_pct_min ( void )
{
struct cpudata * cpu = all_cpu_data [ 0 ] ;
2017-06-05 15:51:18 +03:00
int turbo_pstate = cpu - > pstate . turbo_pstate ;
cpufreq: intel_pstate: Active mode P-state limits rework
The coordination of P-state limits used by intel_pstate in the active
mode (ie. by default) is problematic, because it synchronizes all of
the limits (ie. the global ones and the per-policy ones) so as to use
one common pair of P-state limits (min and max) across all CPUs in
the system. The drawbacks of that are as follows:
- If P-states are coordinated in hardware, it is not necessary
to coordinate them in software on top of that, so in that case
all of the above activity is in vain.
- If P-states are not coordinated in hardware, then the processor
is actually capable of setting different P-states for different
CPUs and coordinating them at the software level simply doesn't
allow that capability to be utilized.
- The coordination works in such a way that setting a per-policy
limit (eg. scaling_max_freq) for one CPU causes the common
effective limit to change (and it will affect all of the other
CPUs too), but subsequent reads from the corresponding sysfs
attributes for the other CPUs will return stale values (which
is confusing).
- Reads from the global P-state limit attributes, min_perf_pct and
max_perf_pct, return the effective common values and not the last
values set through these attributes. However, the last values
set through these attributes become hard limits that cannot be
exceeded by writes to scaling_min_freq and scaling_max_freq,
respectively, and they are not exposed, so essentially users
have to remember what they are.
All of that is painful enough to warrant a change of the management
of P-state limits in the active mode.
To that end, redesign the active mode P-state limits management in
intel_pstate in accordance with the following rules:
(1) All CPUs are affected by the global limits (that is, none of
them can be requested to run faster than the global max and
none of them can be requested to run slower than the global
min).
(2) Each individual CPU is affected by its own per-policy limits
(that is, it cannot be requested to run faster than its own
per-policy max and it cannot be requested to run slower than
its own per-policy min).
(3) The global and per-policy limits can be set independently.
Also, the global maximum and minimum P-state limits will be always
expressed as percentages of the maximum supported turbo P-state.
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-23 01:58:57 +03:00
2017-06-05 15:51:18 +03:00
return turbo_pstate ?
2017-07-11 02:23:52 +03:00
( cpu - > pstate . min_pstate * 100 / turbo_pstate ) : 0 ;
cpufreq: intel_pstate: Active mode P-state limits rework
The coordination of P-state limits used by intel_pstate in the active
mode (ie. by default) is problematic, because it synchronizes all of
the limits (ie. the global ones and the per-policy ones) so as to use
one common pair of P-state limits (min and max) across all CPUs in
the system. The drawbacks of that are as follows:
- If P-states are coordinated in hardware, it is not necessary
to coordinate them in software on top of that, so in that case
all of the above activity is in vain.
- If P-states are not coordinated in hardware, then the processor
is actually capable of setting different P-states for different
CPUs and coordinating them at the software level simply doesn't
allow that capability to be utilized.
- The coordination works in such a way that setting a per-policy
limit (eg. scaling_max_freq) for one CPU causes the common
effective limit to change (and it will affect all of the other
CPUs too), but subsequent reads from the corresponding sysfs
attributes for the other CPUs will return stale values (which
is confusing).
- Reads from the global P-state limit attributes, min_perf_pct and
max_perf_pct, return the effective common values and not the last
values set through these attributes. However, the last values
set through these attributes become hard limits that cannot be
exceeded by writes to scaling_min_freq and scaling_max_freq,
respectively, and they are not exposed, so essentially users
have to remember what they are.
All of that is painful enough to warrant a change of the management
of P-state limits in the active mode.
To that end, redesign the active mode P-state limits management in
intel_pstate in accordance with the following rules:
(1) All CPUs are affected by the global limits (that is, none of
them can be requested to run faster than the global max and
none of them can be requested to run slower than the global
min).
(2) Each individual CPU is affected by its own per-policy limits
(that is, it cannot be requested to run faster than its own
per-policy max and it cannot be requested to run slower than
its own per-policy min).
(3) The global and per-policy limits can be set independently.
Also, the global maximum and minimum P-state limits will be always
expressed as percentages of the maximum supported turbo P-state.
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-23 01:58:57 +03:00
}
2016-11-25 03:07:10 +03:00
static s16 intel_pstate_get_epb ( struct cpudata * cpu_data )
{
u64 epb ;
int ret ;
if ( ! static_cpu_has ( X86_FEATURE_EPB ) )
return - ENXIO ;
ret = rdmsrl_on_cpu ( cpu_data - > cpu , MSR_IA32_ENERGY_PERF_BIAS , & epb ) ;
if ( ret )
return ( s16 ) ret ;
return ( s16 ) ( epb & 0x0f ) ;
}
/*
 * Get the energy/performance preference of @cpu_data's CPU.
 *
 * @hwp_req_data: a previously read MSR_HWP_REQUEST value, or 0 to have the
 *		  register read here.
 *
 * With X86_FEATURE_HWP_EPP the result is bits 31:24 of the request value;
 * without it the EPB value is returned instead.  A negative value is an
 * error from the underlying read.
 */
static s16 intel_pstate_get_epp(struct cpudata *cpu_data, u64 hwp_req_data)
{
	/* When there is no EPP present, HWP uses EPB settings */
	if (!static_cpu_has(X86_FEATURE_HWP_EPP))
		return intel_pstate_get_epb(cpu_data);

	/*
	 * A zero hwp_req_data means the caller did not read MSR_HWP_REQUEST,
	 * so read it now to extract the EPP field.
	 */
	if (!hwp_req_data) {
		s16 err = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST,
					&hwp_req_data);

		if (err)
			return err;
	}

	return (hwp_req_data >> 24) & 0xff;
}
2016-12-07 00:32:16 +03:00
/*
 * Write the energy/performance bias of @cpu.
 *
 * @cpu:  CPU whose MSR_IA32_ENERGY_PERF_BIAS is updated.
 * @pref: new bias value; must fit the 4-bit field (0x0..0xf).
 *
 * Only the low four bits of the register are replaced; the remaining bits
 * are preserved from the value read back first.
 *
 * Returns 0 on success, -ENXIO if the CPU lacks X86_FEATURE_EPB, -EINVAL if
 * @pref does not fit the field, or the error from the MSR read or write.
 */
static int intel_pstate_set_epb(int cpu, s16 pref)
{
	u64 epb;
	int ret;

	if (!static_cpu_has(X86_FEATURE_EPB))
		return -ENXIO;

	/*
	 * A negative value (e.g. a propagated error code) would be
	 * sign-extended and ORed into every upper bit of the register, and a
	 * value above 0xf would spill outside the field, so reject both.
	 */
	if (pref < 0 || pref > 0x0f)
		return -EINVAL;

	ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
	if (ret)
		return ret;

	epb = (epb & ~0x0fULL) | pref;

	/* Report a failed write instead of claiming success. */
	return wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, epb);
}
2016-12-07 00:32:16 +03:00
/*
 * EPP/EPB display strings, indexed by the energy preference index:
 *
 *	index	string
 *	-----	-------------------
 *	0	default
 *	1	performance
 *	2	balance_performance
 *	3	balance_power
 *	4	power
 *
 * The array is NULL-terminated; the lookups below stop at the NULL entry.
 */
static const char * const energy_perf_strings [ ] = {
" default " ,
" performance " ,
" balance_performance " ,
" balance_power " ,
" power " ,
NULL
} ;
2017-05-02 06:06:08 +03:00
/*
 * EPP values for the non-default preferences.  The setter below indexes this
 * table with (preference index - 1), so entry i pairs with
 * energy_perf_strings[i + 1].
 */
static const unsigned int epp_values [ ] = {
HWP_EPP_PERFORMANCE ,
HWP_EPP_BALANCE_PERFORMANCE ,
HWP_EPP_BALANCE_POWERSAVE ,
HWP_EPP_POWERSAVE
} ;
2016-12-07 00:32:16 +03:00
static int intel_pstate_get_energy_pref_index ( struct cpudata * cpu_data )
{
s16 epp ;
int index = - EINVAL ;
epp = intel_pstate_get_epp ( cpu_data , 0 ) ;
if ( epp < 0 )
return epp ;
if ( static_cpu_has ( X86_FEATURE_HWP_EPP ) ) {
2017-05-02 06:06:08 +03:00
if ( epp = = HWP_EPP_PERFORMANCE )
return 1 ;
if ( epp < = HWP_EPP_BALANCE_PERFORMANCE )
return 2 ;
if ( epp < = HWP_EPP_BALANCE_POWERSAVE )
return 3 ;
else
return 4 ;
2016-12-07 00:32:16 +03:00
} else if ( static_cpu_has ( X86_FEATURE_EPB ) ) {
/*
* Range :
* 0x00 - 0x03 : Performance
* 0x04 - 0x07 : Balance performance
* 0x08 - 0x0B : Balance power
* 0x0C - 0x0F : Power
* The EPB is a 4 bit value , but our ranges restrict the
* value which can be set . Here only using top two bits
* effectively .
*/
index = ( epp > > 2 ) + 1 ;
}
return index ;
}
static int intel_pstate_set_energy_pref_index ( struct cpudata * cpu_data ,
int pref_index )
{
int epp = - EINVAL ;
int ret ;
if ( ! pref_index )
epp = cpu_data - > epp_default ;
mutex_lock ( & intel_pstate_limits_lock ) ;
if ( static_cpu_has ( X86_FEATURE_HWP_EPP ) ) {
u64 value ;
ret = rdmsrl_on_cpu ( cpu_data - > cpu , MSR_HWP_REQUEST , & value ) ;
if ( ret )
goto return_pref ;
value & = ~ GENMASK_ULL ( 31 , 24 ) ;
if ( epp = = - EINVAL )
2017-05-02 06:06:08 +03:00
epp = epp_values [ pref_index - 1 ] ;
2016-12-07 00:32:16 +03:00
value | = ( u64 ) epp < < 24 ;
ret = wrmsrl_on_cpu ( cpu_data - > cpu , MSR_HWP_REQUEST , value ) ;
} else {
if ( epp = = - EINVAL )
epp = ( pref_index - 1 ) < < 2 ;
ret = intel_pstate_set_epb ( cpu_data - > cpu , epp ) ;
}
return_pref :
mutex_unlock ( & intel_pstate_limits_lock ) ;
return ret ;
}
static ssize_t show_energy_performance_available_preferences (
struct cpufreq_policy * policy , char * buf )
{
int i = 0 ;
int ret = 0 ;
while ( energy_perf_strings [ i ] ! = NULL )
ret + = sprintf ( & buf [ ret ] , " %s " , energy_perf_strings [ i + + ] ) ;
ret + = sprintf ( & buf [ ret ] , " \n " ) ;
return ret ;
}
cpufreq_freq_attr_ro ( energy_performance_available_preferences ) ;
static ssize_t store_energy_performance_preference (
struct cpufreq_policy * policy , const char * buf , size_t count )
{
struct cpudata * cpu_data = all_cpu_data [ policy - > cpu ] ;
char str_preference [ 21 ] ;
int ret , i = 0 ;
ret = sscanf ( buf , " %20s " , str_preference ) ;
if ( ret ! = 1 )
return - EINVAL ;
while ( energy_perf_strings [ i ] ! = NULL ) {
if ( ! strcmp ( str_preference , energy_perf_strings [ i ] ) ) {
intel_pstate_set_energy_pref_index ( cpu_data , i ) ;
return count ;
}
+ + i ;
}
return - EINVAL ;
}
static ssize_t show_energy_performance_preference (
struct cpufreq_policy * policy , char * buf )
{
struct cpudata * cpu_data = all_cpu_data [ policy - > cpu ] ;
int preference ;
preference = intel_pstate_get_energy_pref_index ( cpu_data ) ;
if ( preference < 0 )
return preference ;
return sprintf ( buf , " %s \n " , energy_perf_strings [ preference ] ) ;
}
cpufreq_freq_attr_rw ( energy_performance_preference ) ;
/*
 * NULL-terminated list of the two energy-performance attributes defined
 * above.  NOTE(review): presumably installed as the driver's attribute list
 * when HWP is in use; the user of this array is not visible here.
 */
static struct freq_attr * hwp_cpufreq_attrs [ ] = {
& energy_performance_preference ,
& energy_performance_available_preferences ,
NULL ,
} ;
2017-06-13 02:30:27 +03:00
static void intel_pstate_get_hwp_max ( unsigned int cpu , int * phy_max ,
int * current_max )
2014-11-06 20:40:47 +03:00
{
2017-06-13 02:30:27 +03:00
u64 cap ;
2015-09-09 21:41:22 +03:00
2017-03-28 01:22:16 +03:00
rdmsrl_on_cpu ( cpu , MSR_HWP_CAPABILITIES , & cap ) ;
if ( global . no_turbo )
2017-06-13 02:30:27 +03:00
* current_max = HWP_GUARANTEED_PERF ( cap ) ;
2017-03-28 01:22:16 +03:00
else
2017-06-13 02:30:27 +03:00
* current_max = HWP_HIGHEST_PERF ( cap ) ;
* phy_max = HWP_HIGHEST_PERF ( cap ) ;
}
static void intel_pstate_hwp_set ( unsigned int cpu )
{
struct cpudata * cpu_data = all_cpu_data [ cpu ] ;
int max , min ;
u64 value ;
s16 epp ;
max = cpu_data - > max_perf_ratio ;
min = cpu_data - > min_perf_ratio ;
2016-10-25 23:20:40 +03:00
2017-03-28 01:22:16 +03:00
if ( cpu_data - > policy = = CPUFREQ_POLICY_PERFORMANCE )
min = max ;
2017-03-14 04:30:12 +03:00
2017-03-28 01:22:16 +03:00
rdmsrl_on_cpu ( cpu , MSR_HWP_REQUEST , & value ) ;
2014-11-06 20:40:47 +03:00
2017-03-28 01:22:16 +03:00
value & = ~ HWP_MIN_PERF ( ~ 0L ) ;
value | = HWP_MIN_PERF ( min ) ;
2016-11-25 03:07:10 +03:00
2017-03-28 01:22:16 +03:00
value & = ~ HWP_MAX_PERF ( ~ 0L ) ;
value | = HWP_MAX_PERF ( max ) ;
2016-11-25 03:07:10 +03:00
2017-03-28 01:22:16 +03:00
if ( cpu_data - > epp_policy = = cpu_data - > policy )
goto skip_epp ;
2016-11-25 03:07:10 +03:00
2017-03-28 01:22:16 +03:00
cpu_data - > epp_policy = cpu_data - > policy ;
2016-12-07 00:32:16 +03:00
2017-03-28 01:22:16 +03:00
if ( cpu_data - > epp_saved > = 0 ) {
epp = cpu_data - > epp_saved ;
cpu_data - > epp_saved = - EINVAL ;
goto update_epp ;
}
2016-11-25 03:07:10 +03:00
2017-03-28 01:22:16 +03:00
if ( cpu_data - > policy = = CPUFREQ_POLICY_PERFORMANCE ) {
epp = intel_pstate_get_epp ( cpu_data , value ) ;
cpu_data - > epp_powersave = epp ;
/* If EPP read was failed, then don't try to write */
if ( epp < 0 )
goto skip_epp ;
2016-11-25 03:07:10 +03:00
2017-03-28 01:22:16 +03:00
epp = 0 ;
} else {
/* skip setting EPP, when saved value is invalid */
if ( cpu_data - > epp_powersave < 0 )
goto skip_epp ;
2016-11-25 03:07:10 +03:00
2017-03-28 01:22:16 +03:00
/*
* No need to restore EPP when it is not zero . This
* means :
* - Policy is not changed
* - user has manually changed
* - Error reading EPB
*/
epp = intel_pstate_get_epp ( cpu_data , value ) ;
if ( epp )
goto skip_epp ;
2016-11-25 03:07:10 +03:00
2017-03-28 01:22:16 +03:00
epp = cpu_data - > epp_powersave ;
}
2016-12-07 00:32:16 +03:00
update_epp :
2017-03-28 01:22:16 +03:00
if ( static_cpu_has ( X86_FEATURE_HWP_EPP ) ) {
value & = ~ GENMASK_ULL ( 31 , 24 ) ;
value | = ( u64 ) epp < < 24 ;
} else {
intel_pstate_set_epb ( cpu , epp ) ;
2014-11-06 20:40:47 +03:00
}
2017-03-28 01:22:16 +03:00
skip_epp :
wrmsrl_on_cpu ( cpu , MSR_HWP_REQUEST , value ) ;
intel_pstate: Update frequencies of policy->cpus only from ->set_policy()
The intel-pstate driver is using intel_pstate_hwp_set() from two
separate paths, i.e. ->set_policy() callback and sysfs update path for
the files present in /sys/devices/system/cpu/intel_pstate/ directory.
While an update to the sysfs path applies to all the CPUs being managed
by the driver (which essentially means all the online CPUs), the update
via the ->set_policy() callback applies to a smaller group of CPUs
managed by the policy for which ->set_policy() is called.
And so, intel_pstate_hwp_set() should update frequencies of only the
CPUs that are part of policy->cpus mask, while it is called from
->set_policy() callback.
In order to do that, add a parameter (cpumask) to intel_pstate_hwp_set()
and apply the frequency changes only to the concerned CPUs.
For ->set_policy() path, we are only concerned about policy->cpus, and
so policy->rwsem lock taken by the core prior to calling ->set_policy()
is enough to take care of any races. The larger lock acquired by
get_online_cpus() is required only for the updates to sysfs files.
Add another routine, intel_pstate_hwp_set_online_cpus(), and call it
from the sysfs update paths.
This also fixes a lockdep reported recently, where policy->rwsem and
get_online_cpus() could have been acquired in any order causing an ABBA
deadlock. The sequence of events leading to that was:
intel_pstate_init(...)
...cpufreq_online(...)
down_write(&policy->rwsem); // Locks policy->rwsem
...
cpufreq_init_policy(policy);
...intel_pstate_hwp_set();
get_online_cpus(); // Temporarily locks cpu_hotplug.lock
...
up_write(&policy->rwsem);
pm_suspend(...)
...disable_nonboot_cpus()
_cpu_down()
cpu_hotplug_begin(); // Locks cpu_hotplug.lock
__cpu_notify(CPU_DOWN_PREPARE, ...);
...cpufreq_offline_prepare();
down_write(&policy->rwsem); // Locks policy->rwsem
Reported-and-tested-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-02-22 07:57:46 +03:00
}
2014-11-06 20:40:47 +03:00
2016-12-07 00:32:16 +03:00
static int intel_pstate_hwp_save_state ( struct cpufreq_policy * policy )
{
struct cpudata * cpu_data = all_cpu_data [ policy - > cpu ] ;
if ( ! hwp_active )
return 0 ;
cpu_data - > epp_saved = intel_pstate_get_epp ( cpu_data , 0 ) ;
return 0 ;
}
2018-01-29 05:27:57 +03:00
static void intel_pstate_hwp_enable ( struct cpudata * cpudata ) ;
2016-11-25 03:07:10 +03:00
static int intel_pstate_resume ( struct cpufreq_policy * policy )
{
if ( ! hwp_active )
return 0 ;
2016-12-30 17:56:14 +03:00
mutex_lock ( & intel_pstate_limits_lock ) ;
2018-01-29 05:27:57 +03:00
if ( policy - > cpu = = 0 )
intel_pstate_hwp_enable ( all_cpu_data [ policy - > cpu ] ) ;
2016-11-25 03:07:10 +03:00
all_cpu_data [ policy - > cpu ] - > epp_policy = 0 ;
2017-03-28 01:22:16 +03:00
intel_pstate_hwp_set ( policy - > cpu ) ;
2016-12-30 17:56:14 +03:00
mutex_unlock ( & intel_pstate_limits_lock ) ;
2017-03-09 18:30:38 +03:00
return 0 ;
2016-11-25 03:07:10 +03:00
}
2016-12-30 17:58:21 +03:00
static void intel_pstate_update_policies ( void )
intel_pstate: Update frequencies of policy->cpus only from ->set_policy()
The intel-pstate driver is using intel_pstate_hwp_set() from two
separate paths, i.e. ->set_policy() callback and sysfs update path for
the files present in /sys/devices/system/cpu/intel_pstate/ directory.
While an update to the sysfs path applies to all the CPUs being managed
by the driver (which essentially means all the online CPUs), the update
via the ->set_policy() callback applies to a smaller group of CPUs
managed by the policy for which ->set_policy() is called.
And so, intel_pstate_hwp_set() should update frequencies of only the
CPUs that are part of policy->cpus mask, while it is called from
->set_policy() callback.
In order to do that, add a parameter (cpumask) to intel_pstate_hwp_set()
and apply the frequency changes only to the concerned CPUs.
For ->set_policy() path, we are only concerned about policy->cpus, and
so policy->rwsem lock taken by the core prior to calling ->set_policy()
is enough to take care of any races. The larger lock acquired by
get_online_cpus() is required only for the updates to sysfs files.
Add another routine, intel_pstate_hwp_set_online_cpus(), and call it
from the sysfs update paths.
This also fixes a lockdep reported recently, where policy->rwsem and
get_online_cpus() could have been acquired in any order causing an ABBA
deadlock. The sequence of events leading to that was:
intel_pstate_init(...)
...cpufreq_online(...)
down_write(&policy->rwsem); // Locks policy->rwsem
...
cpufreq_init_policy(policy);
...intel_pstate_hwp_set();
get_online_cpus(); // Temporarily locks cpu_hotplug.lock
...
up_write(&policy->rwsem);
pm_suspend(...)
...disable_nonboot_cpus()
_cpu_down()
cpu_hotplug_begin(); // Locks cpu_hotplug.lock
__cpu_notify(CPU_DOWN_PREPARE, ...);
...cpufreq_offline_prepare();
down_write(&policy->rwsem); // Locks policy->rwsem
Reported-and-tested-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-02-22 07:57:46 +03:00
{
2016-12-30 17:58:21 +03:00
int cpu ;
for_each_possible_cpu ( cpu )
cpufreq_update_policy ( cpu ) ;
2014-11-06 20:40:47 +03:00
}
2013-02-06 21:02:13 +04:00
/************************** sysfs begin ************************/
/*
 * show_one(file_name, object) - define show_<file_name>(), a sysfs show
 * handler that prints the unsigned value of global.<object> into the buffer.
 */
#define show_one(file_name, object)					\
	static ssize_t show_##file_name					\
	(struct kobject *kobj, struct attribute *attr, char *buf)	\
	{								\
		return sprintf(buf, " %u \n ", global.object);		\
	}
2017-01-05 04:53:12 +03:00
static ssize_t intel_pstate_show_status(char *buf);
static int intel_pstate_update_status(const char *buf, size_t size);

/*
 * sysfs show handler for "status": returns whatever
 * intel_pstate_show_status() produces, called under intel_pstate_driver_lock.
 */
static ssize_t show_status(struct kobject *kobj,
			   struct attribute *attr, char *buf)
{
	ssize_t len;

	mutex_lock(&intel_pstate_driver_lock);
	len = intel_pstate_show_status(buf);
	mutex_unlock(&intel_pstate_driver_lock);

	return len;
}
/*
 * sysfs store handler for "status": passes the input to
 * intel_pstate_update_status() under intel_pstate_driver_lock.
 *
 * Returns count on success or the negative error from the update.
 */
static ssize_t store_status ( struct kobject * a , struct attribute * b ,
const char * buf , size_t count )
{
/* p marks the first occurrence of the searched character in buf, if any. */
char * p = memchr ( buf , ' \n ' , count ) ;
int ret ;
mutex_lock ( & intel_pstate_driver_lock ) ;
/* Hand over only the part before p when one was found, else all of buf. */
ret = intel_pstate_update_status ( buf , p ? p - buf : count ) ;
mutex_unlock ( & intel_pstate_driver_lock ) ;
return ret < 0 ? ret : count ;
}
2015-01-29 02:03:27 +03:00
static ssize_t show_turbo_pct ( struct kobject * kobj ,
struct attribute * attr , char * buf )
{
struct cpudata * cpu ;
int total , no_turbo , turbo_pct ;
uint32_t turbo_fp ;
2017-01-11 06:12:16 +03:00
mutex_lock ( & intel_pstate_driver_lock ) ;
2017-03-28 01:13:00 +03:00
if ( ! intel_pstate_driver ) {
2017-01-11 06:12:16 +03:00
mutex_unlock ( & intel_pstate_driver_lock ) ;
return - EAGAIN ;
}
2015-01-29 02:03:27 +03:00
cpu = all_cpu_data [ 0 ] ;
total = cpu - > pstate . turbo_pstate - cpu - > pstate . min_pstate + 1 ;
no_turbo = cpu - > pstate . max_pstate - cpu - > pstate . min_pstate + 1 ;
2016-04-09 02:25:58 +03:00
turbo_fp = div_fp ( no_turbo , total ) ;
2015-01-29 02:03:27 +03:00
turbo_pct = 100 - fp_toint ( mul_fp ( turbo_fp , int_tofp ( 100 ) ) ) ;
2017-01-11 06:12:16 +03:00
mutex_unlock ( & intel_pstate_driver_lock ) ;
2015-01-29 02:03:27 +03:00
return sprintf ( buf , " %u \n " , turbo_pct ) ;
}
2015-01-29 02:03:28 +03:00
static ssize_t show_num_pstates ( struct kobject * kobj ,
struct attribute * attr , char * buf )
{
struct cpudata * cpu ;
int total ;
2017-01-11 06:12:16 +03:00
mutex_lock ( & intel_pstate_driver_lock ) ;
2017-03-28 01:13:00 +03:00
if ( ! intel_pstate_driver ) {
2017-01-11 06:12:16 +03:00
mutex_unlock ( & intel_pstate_driver_lock ) ;
return - EAGAIN ;
}
2015-01-29 02:03:28 +03:00
cpu = all_cpu_data [ 0 ] ;
total = cpu - > pstate . turbo_pstate - cpu - > pstate . min_pstate + 1 ;
2017-01-11 06:12:16 +03:00
mutex_unlock ( & intel_pstate_driver_lock ) ;
2015-01-29 02:03:28 +03:00
return sprintf ( buf , " %u \n " , total ) ;
}
2014-10-13 19:37:41 +04:00
static ssize_t show_no_turbo ( struct kobject * kobj ,
struct attribute * attr , char * buf )
{
ssize_t ret ;
2017-01-11 06:12:16 +03:00
mutex_lock ( & intel_pstate_driver_lock ) ;
2017-03-28 01:13:00 +03:00
if ( ! intel_pstate_driver ) {
2017-01-11 06:12:16 +03:00
mutex_unlock ( & intel_pstate_driver_lock ) ;
return - EAGAIN ;
}
2014-10-13 19:37:41 +04:00
update_turbo_state ( ) ;
cpufreq: intel_pstate: One set of global limits in active mode
In the active mode intel_pstate currently uses two sets of global
limits, each associated with one of the possible scaling_governor
settings in that mode: "powersave" or "performance".
The driver switches over from one of those sets to the other
depending on the scaling_governor setting for the last CPU whose
per-policy cpufreq interface in sysfs was last used to change
parameters exposed in there. That obviously leads to no end of
issues when the scaling_governor settings differ between CPUs.
The most recent issue was introduced by commit a240c4aa5d0f (cpufreq:
intel_pstate: Do not reinit performance limits in ->setpolicy)
that eliminated the reinitialization of "performance" limits in
intel_pstate_set_policy() preventing the max limit from being set
to anything below 100, among other things.
Namely, an undesirable side effect of commit a240c4aa5d0f is that
now, after setting scaling_governor to "performance" in the active
mode, the per-policy limits for the CPU in question go to the highest
level and stay there even when it is switched back to "powersave"
later.
As it turns out, some distributions set scaling_governor to
"performance" temporarily for all CPUs to speed-up system
initialization, so that change causes them to misbehave later.
To fix that, get rid of the performance/powersave global limits
split and use just one set of global limits for everything.
From the user's persepctive, after this modification, when
scaling_governor is switched from "performance" to "powersave"
or the other way around on one CPU, the limits settings (ie. the
global max/min_perf_pct and per-policy scaling_max/min_freq for
any CPUs) will not change. Still, switching from "performance"
to "powersave" or the other way around changes the way in which
P-states are selected and in particular "performance" causes the
driver to always request the highest P-state it is allowed to ask
for for the given CPU.
Fixes: a240c4aa5d0f (cpufreq: intel_pstate: Do not reinit performance limits in ->setpolicy)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-18 02:57:39 +03:00
if ( global . turbo_disabled )
ret = sprintf ( buf , " %u \n " , global . turbo_disabled ) ;
2014-10-13 19:37:41 +04:00
else
cpufreq: intel_pstate: One set of global limits in active mode
In the active mode intel_pstate currently uses two sets of global
limits, each associated with one of the possible scaling_governor
settings in that mode: "powersave" or "performance".
The driver switches over from one of those sets to the other
depending on the scaling_governor setting for the last CPU whose
per-policy cpufreq interface in sysfs was last used to change
parameters exposed in there. That obviously leads to no end of
issues when the scaling_governor settings differ between CPUs.
The most recent issue was introduced by commit a240c4aa5d0f (cpufreq:
intel_pstate: Do not reinit performance limits in ->setpolicy)
that eliminated the reinitialization of "performance" limits in
intel_pstate_set_policy() preventing the max limit from being set
to anything below 100, among other things.
Namely, an undesirable side effect of commit a240c4aa5d0f is that
now, after setting scaling_governor to "performance" in the active
mode, the per-policy limits for the CPU in question go to the highest
level and stay there even when it is switched back to "powersave"
later.
As it turns out, some distributions set scaling_governor to
"performance" temporarily for all CPUs to speed-up system
initialization, so that change causes them to misbehave later.
To fix that, get rid of the performance/powersave global limits
split and use just one set of global limits for everything.
From the user's persepctive, after this modification, when
scaling_governor is switched from "performance" to "powersave"
or the other way around on one CPU, the limits settings (ie. the
global max/min_perf_pct and per-policy scaling_max/min_freq for
any CPUs) will not change. Still, switching from "performance"
to "powersave" or the other way around changes the way in which
P-states are selected and in particular "performance" causes the
driver to always request the highest P-state it is allowed to ask
for for the given CPU.
Fixes: a240c4aa5d0f (cpufreq: intel_pstate: Do not reinit performance limits in ->setpolicy)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-18 02:57:39 +03:00
ret = sprintf ( buf , " %u \n " , global . no_turbo ) ;
2014-10-13 19:37:41 +04:00
2017-01-11 06:12:16 +03:00
mutex_unlock ( & intel_pstate_driver_lock ) ;
2014-10-13 19:37:41 +04:00
return ret ;
}
2013-02-06 21:02:13 +04:00
/*
 * sysfs write handler for "no_turbo".
 *
 * Parses one unsigned integer from @buf and stores it, clamped to 0..1,
 * in global.no_turbo. When the resulting value is non-zero,
 * global.min_perf_pct is lowered (if needed) to
 * max_pstate * 100 / turbo_pstate of all_cpu_data[0]. Finally
 * intel_pstate_update_policies() is called.
 *
 * Locking: intel_pstate_driver_lock is held for the whole operation;
 * intel_pstate_limits_lock is additionally held while the global limits
 * are modified and is released before intel_pstate_update_policies().
 *
 * Returns @count on success, -EINVAL when no number can be parsed,
 * -EAGAIN when intel_pstate_driver is not set, and -EPERM when
 * global.turbo_disabled is set after update_turbo_state().
 */
static ssize_t store_no_turbo(struct kobject *a, struct attribute *b,
			      const char *buf, size_t count)
{
	unsigned int input;
	ssize_t result = count;

	if (sscanf(buf, "%u", &input) != 1)
		return -EINVAL;

	mutex_lock(&intel_pstate_driver_lock);

	if (!intel_pstate_driver) {
		result = -EAGAIN;
		goto unlock_driver;
	}

	mutex_lock(&intel_pstate_limits_lock);

	update_turbo_state();
	if (global.turbo_disabled) {
		pr_warn("Turbo disabled by BIOS or unavailable on processor\n");
		mutex_unlock(&intel_pstate_limits_lock);
		result = -EPERM;
		goto unlock_driver;
	}

	global.no_turbo = clamp_t(int, input, 0, 1);

	if (global.no_turbo) {
		struct cpudata *cpu0 = all_cpu_data[0];
		int cap = cpu0->pstate.max_pstate * 100 / cpu0->pstate.turbo_pstate;

		/* Squash the global minimum into the permitted range. */
		if (global.min_perf_pct > cap)
			global.min_perf_pct = cap;
	}

	mutex_unlock(&intel_pstate_limits_lock);

	intel_pstate_update_policies();

unlock_driver:
	mutex_unlock(&intel_pstate_driver_lock);

	return result;
}
/*
 * sysfs write handler for "max_perf_pct".
 *
 * Parses one unsigned integer from @buf and stores it in
 * global.max_perf_pct, clamped to the range global.min_perf_pct..100,
 * then calls intel_pstate_update_policies().
 *
 * Locking: intel_pstate_driver_lock is held for the whole operation;
 * intel_pstate_limits_lock is held only around the assignment.
 *
 * Returns @count on success, -EINVAL when no number can be parsed, and
 * -EAGAIN when intel_pstate_driver is not set.
 */
static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
				  const char *buf, size_t count)
{
	unsigned int input;
	ssize_t result = count;

	if (sscanf(buf, "%u", &input) != 1)
		return -EINVAL;

	mutex_lock(&intel_pstate_driver_lock);

	if (!intel_pstate_driver) {
		result = -EAGAIN;
		goto unlock_driver;
	}

	mutex_lock(&intel_pstate_limits_lock);
	global.max_perf_pct = clamp_t(int, input, global.min_perf_pct, 100);
	mutex_unlock(&intel_pstate_limits_lock);

	intel_pstate_update_policies();

unlock_driver:
	mutex_unlock(&intel_pstate_driver_lock);

	return result;
}
/*
 * sysfs write handler for "min_perf_pct".
 *
 * Parses one unsigned integer from @buf and stores it in
 * global.min_perf_pct, clamped to the range
 * min_perf_pct_min()..global.max_perf_pct, then calls
 * intel_pstate_update_policies().
 *
 * Locking: intel_pstate_driver_lock is held for the whole operation;
 * intel_pstate_limits_lock is held only around the assignment.
 *
 * Returns @count on success, -EINVAL when no number can be parsed, and
 * -EAGAIN when intel_pstate_driver is not set.
 */
static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
				  const char *buf, size_t count)
{
	unsigned int input;
	ssize_t result = count;

	if (sscanf(buf, "%u", &input) != 1)
		return -EINVAL;

	mutex_lock(&intel_pstate_driver_lock);

	if (!intel_pstate_driver) {
		result = -EAGAIN;
		goto unlock_driver;
	}

	mutex_lock(&intel_pstate_limits_lock);
	global.min_perf_pct = clamp_t(int, input,
				      min_perf_pct_min(), global.max_perf_pct);
	mutex_unlock(&intel_pstate_limits_lock);

	intel_pstate_update_policies();

unlock_driver:
	mutex_unlock(&intel_pstate_driver_lock);

	return result;
}
/*
 * Read handlers for the two global percentage limits, generated by the
 * show_one() macro defined earlier in this file (its expansion prints the
 * named member of "global" with sprintf()).
 */
show_one ( max_perf_pct , max_perf_pct ) ;
show_one ( min_perf_pct , min_perf_pct ) ;
2017-01-05 04:53:12 +03:00
/*
 * Attribute objects used by the sysfs registration code in this file.
 * NOTE(review): define_one_global_rw()/define_one_global_ro() are not
 * defined in this file; presumably they bind the show_<name>()/store_<name>()
 * handlers above by name - confirm against the cpufreq header.
 */
define_one_global_rw ( status ) ;
2013-02-06 21:02:13 +04:00
define_one_global_rw ( no_turbo ) ;
define_one_global_rw ( max_perf_pct ) ;
define_one_global_rw ( min_perf_pct ) ;
2015-01-29 02:03:27 +03:00
define_one_global_ro ( turbo_pct ) ;
2015-01-29 02:03:28 +03:00
define_one_global_ro ( num_pstates ) ;
2013-02-06 21:02:13 +04:00
/*
 * NULL-terminated list of the attributes that are always created as a
 * group. max_perf_pct and min_perf_pct are deliberately absent: they are
 * created individually by intel_pstate_sysfs_expose_params(), and only
 * when per_cpu_limits is not set.
 */
static struct attribute * intel_pstate_attributes [ ] = {
2017-01-05 04:53:12 +03:00
& status . attr ,
2013-02-06 21:02:13 +04:00
& no_turbo . attr ,
2015-01-29 02:03:27 +03:00
& turbo_pct . attr ,
2015-01-29 02:03:28 +03:00
& num_pstates . attr ,
2013-02-06 21:02:13 +04:00
NULL
} ;
2017-07-03 11:10:33 +03:00
/*
 * Attribute group wrapping intel_pstate_attributes; passed to
 * sysfs_create_group() in intel_pstate_sysfs_expose_params().
 */
static const struct attribute_group intel_pstate_attr_group = {
2013-02-06 21:02:13 +04:00
. attrs = intel_pstate_attributes ,
} ;
2014-07-18 19:37:17 +04:00
/*
 * Create the "intel_pstate" sysfs directory under the CPU subsystem root
 * and populate it.
 *
 * The attributes in intel_pstate_attr_group are always created. The
 * max_perf_pct and min_perf_pct files are added only when per_cpu_limits
 * is not set. Every failure is reported through WARN_ON(); the function
 * itself returns nothing.
 */
static void __init intel_pstate_sysfs_expose_params(void)
{
	struct kobject *intel_pstate_kobject;
	int rc;

	intel_pstate_kobject = kobject_create_and_add("intel_pstate",
						&cpu_subsys.dev_root->kobj);
	if (WARN_ON(!intel_pstate_kobject))
		return;

	rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group);
	if (WARN_ON(rc)) {
		/*
		 * The local pointer is the only reference to the kobject,
		 * so drop it here instead of leaking an empty directory.
		 */
		kobject_put(intel_pstate_kobject);
		return;
	}

	/*
	 * If per cpu limits are enforced there are no global limits, so
	 * return without creating max/min_perf_pct attributes
	 */
	if (per_cpu_limits)
		return;

	rc = sysfs_create_file(intel_pstate_kobject, &max_perf_pct.attr);
	WARN_ON(rc);

	rc = sysfs_create_file(intel_pstate_kobject, &min_perf_pct.attr);
	WARN_ON(rc);
}
/************************** sysfs end ************************/
2014-11-06 20:40:47 +03:00
2015-07-14 19:46:23 +03:00
static void intel_pstate_hwp_enable ( struct cpudata * cpudata )
2014-11-06 20:40:47 +03:00
{
2016-02-26 02:09:31 +03:00
/* First disable HWP notification interrupt as we don't process them */
2016-07-20 02:52:01 +03:00
if ( static_cpu_has ( X86_FEATURE_HWP_NOTIFY ) )
wrmsrl_on_cpu ( cpudata - > cpu , MSR_HWP_INTERRUPT , 0x00 ) ;
2016-02-26 02:09:31 +03:00
2015-07-14 19:46:23 +03:00
wrmsrl_on_cpu ( cpudata - > cpu , MSR_PM_ENABLE , 0x1 ) ;
2016-11-25 03:07:10 +03:00
cpudata - > epp_policy = 0 ;
2016-12-07 00:32:16 +03:00
if ( cpudata - > epp_default = = - EINVAL )
cpudata - > epp_default = intel_pstate_get_epp ( cpudata , 0 ) ;
2014-11-06 20:40:47 +03:00
}
cpufreq: intel_pstate: Disable energy efficiency optimization
Some Kabylake desktop processors may not reach max turbo when running in
HWP mode, even if running under sustained 100% utilization.
This occurs when the HWP.EPP (Energy Performance Preference) is set to
"balance_power" (0x80) -- the default on most systems.
It occurs because the platform BIOS may erroneously enable an
energy-efficiency setting -- MSR_IA32_POWER_CTL BIT-EE, which is not
recommended to be enabled on this SKU.
On the failing systems, this BIOS issue was not discovered when the
desktop motherboard was tested with Windows, because the BIOS also
neglects to provide the ACPI/CPPC table, that Windows requires to enable
HWP, and so Windows runs in legacy P-state mode, where this setting has
no effect.
Linux' intel_pstate driver does not require ACPI/CPPC to enable HWP, and
so it runs in HWP mode, exposing this incorrect BIOS configuration.
There are several ways to address this problem.
First, Linux can also run in legacy P-state mode on this system.
As intel_pstate is how Linux enables HWP, booting with
"intel_pstate=disable"
will run in acpi-cpufreq/ondemand legacy p-state mode.
Or second, the "performance" governor can be used with intel_pstate,
which will modify HWP.EPP to 0.
Or third, starting in 4.10, the
/sys/devices/system/cpu/cpufreq/policy*/energy_performance_preference
attribute can be updated from "balance_power" to "performance".
Or fourth, apply this patch, which fixes the erroneous setting of
MSR_IA32_POWER_CTL BIT_EE on this model, allowing the default
configuration to function as designed.
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Reviewed-by: Len Brown <len.brown@intel.com>
Cc: 4.6+ <stable@vger.kernel.org> # 4.6+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-02-04 01:18:39 +03:00
#define MSR_IA32_POWER_CTL_BIT_EE	19

/*
 * Disable energy efficiency optimization.
 *
 * Reads MSR_IA32_POWER_CTL on @cpu and, if bit MSR_IA32_POWER_CTL_BIT_EE
 * is clear, logs a message and writes the value back with that bit set.
 * Nothing is done when the read fails or the bit is already set.
 */
static void intel_pstate_disable_ee(int cpu)
{
	u64 power_ctl;

	if (rdmsrl_on_cpu(cpu, MSR_IA32_POWER_CTL, &power_ctl))
		return;

	/* Bit already set: nothing to change. */
	if (power_ctl & BIT(MSR_IA32_POWER_CTL_BIT_EE))
		return;

	pr_info("Disabling energy efficiency optimization\n");
	wrmsrl_on_cpu(cpu, MSR_IA32_POWER_CTL,
		      power_ctl | BIT(MSR_IA32_POWER_CTL_BIT_EE));
}
2015-11-10 04:40:46 +03:00
static int atom_get_min_pstate ( void )
2013-10-21 20:20:35 +04:00
{
u64 value ;
2014-07-18 19:37:19 +04:00
2017-02-26 00:55:17 +03:00
rdmsrl ( MSR_ATOM_CORE_RATIOS , value ) ;
2014-06-20 18:27:58 +04:00
return ( value > > 8 ) & 0x7F ;
2013-10-21 20:20:35 +04:00
}
2015-11-10 04:40:46 +03:00
static int atom_get_max_pstate ( void )
2013-10-21 20:20:35 +04:00
{
u64 value ;
2014-07-18 19:37:19 +04:00
2017-02-26 00:55:17 +03:00
rdmsrl ( MSR_ATOM_CORE_RATIOS , value ) ;
2014-06-20 18:27:58 +04:00
return ( value > > 16 ) & 0x7F ;
2013-10-21 20:20:35 +04:00
}
2013-02-06 21:02:13 +04:00
2015-11-10 04:40:46 +03:00
static int atom_get_turbo_pstate ( void )
2014-02-12 22:01:07 +04:00
{
u64 value ;
2014-07-18 19:37:19 +04:00
2017-02-26 00:55:17 +03:00
rdmsrl ( MSR_ATOM_CORE_TURBO_RATIOS , value ) ;
2014-06-20 18:27:58 +04:00
return value & 0x7F ;
2014-02-12 22:01:07 +04:00
}
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts
After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with
utilization update callbacks) wrmsrl_on_cpu() cannot be called in the
intel_pstate_adjust_busy_pstate() path as that is executed with
disabled interrupts. However, atom_set_pstate() called from there
via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the
IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in
smp_call_function_single().
The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is
because intel_pstate_set_pstate() calling it is also invoked during
the initialization and cleanup of the driver and in those cases it is
not guaranteed to be run on the CPU that is being updated. However,
in the case when intel_pstate_set_pstate() is called by
intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update
the register safely. Moreover, intel_pstate_set_pstate() already
contains code that only is executed if the function is called by
intel_pstate_adjust_busy_pstate() and there is a special argument
passed to it because of that.
To fix the problem at hand, rearrange the code taking the above
observations into account.
First, replace the ->set() callback in struct pstate_funcs with a
->get_val() one that will return the value to be written to the
IA32_PERF_CTL MSR without updating the register.
Second, split intel_pstate_set_pstate() into two functions,
intel_pstate_update_pstate() to be called by
intel_pstate_adjust_busy_pstate() that will contain all of the
intel_pstate_set_pstate() code which only needs to be executed in
that case and will use wrmsrl() to update the MSR (after obtaining
the value to write to it from the ->get_val() callback), and
intel_pstate_set_min_pstate() to be invoked during the
initialization and cleanup that will set the P-state to the
minimum one and will update the MSR using wrmsrl_on_cpu().
Finally, move the code shared between intel_pstate_update_pstate()
and intel_pstate_set_min_pstate() to a new static inline function
intel_pstate_record_pstate() and make them both call it.
Of course, that unifies the handling of the IA32_PERF_CTL MSR writes
between Atom and Core.
Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks)
Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 01:20:02 +03:00
static u64 atom_get_val ( struct cpudata * cpudata , int pstate )
2013-12-18 22:32:39 +04:00
{
u64 val ;
int32_t vid_fp ;
u32 vid ;
2015-07-29 18:53:10 +03:00
val = ( u64 ) pstate < < 8 ;
cpufreq: intel_pstate: One set of global limits in active mode
In the active mode intel_pstate currently uses two sets of global
limits, each associated with one of the possible scaling_governor
settings in that mode: "powersave" or "performance".
The driver switches over from one of those sets to the other
depending on the scaling_governor setting for the last CPU whose
per-policy cpufreq interface in sysfs was last used to change
parameters exposed in there. That obviously leads to no end of
issues when the scaling_governor settings differ between CPUs.
The most recent issue was introduced by commit a240c4aa5d0f (cpufreq:
intel_pstate: Do not reinit performance limits in ->setpolicy)
that eliminated the reinitialization of "performance" limits in
intel_pstate_set_policy() preventing the max limit from being set
to anything below 100, among other things.
Namely, an undesirable side effect of commit a240c4aa5d0f is that
now, after setting scaling_governor to "performance" in the active
mode, the per-policy limits for the CPU in question go to the highest
level and stay there even when it is switched back to "powersave"
later.
As it turns out, some distributions set scaling_governor to
"performance" temporarily for all CPUs to speed-up system
initialization, so that change causes them to misbehave later.
To fix that, get rid of the performance/powersave global limits
split and use just one set of global limits for everything.
From the user's persepctive, after this modification, when
scaling_governor is switched from "performance" to "powersave"
or the other way around on one CPU, the limits settings (ie. the
global max/min_perf_pct and per-policy scaling_max/min_freq for
any CPUs) will not change. Still, switching from "performance"
to "powersave" or the other way around changes the way in which
P-states are selected and in particular "performance" causes the
driver to always request the highest P-state it is allowed to ask
for for the given CPU.
Fixes: a240c4aa5d0f (cpufreq: intel_pstate: Do not reinit performance limits in ->setpolicy)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-18 02:57:39 +03:00
if ( global . no_turbo & & ! global . turbo_disabled )
2013-12-18 22:32:39 +04:00
val | = ( u64 ) 1 < < 32 ;
vid_fp = cpudata - > vid . min + mul_fp (
int_tofp ( pstate - cpudata - > pstate . min_pstate ) ,
cpudata - > vid . ratio ) ;
vid_fp = clamp_t ( int32_t , vid_fp , cpudata - > vid . min , cpudata - > vid . max ) ;
2014-10-13 19:37:44 +04:00
vid = ceiling_fp ( vid_fp ) ;
2013-12-18 22:32:39 +04:00
2014-05-08 23:57:23 +04:00
if ( pstate > cpudata - > pstate . max_pstate )
vid = cpudata - > vid . turbo ;
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts
After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with
utilization update callbacks) wrmsrl_on_cpu() cannot be called in the
intel_pstate_adjust_busy_pstate() path as that is executed with
disabled interrupts. However, atom_set_pstate() called from there
via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the
IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in
smp_call_function_single().
The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is
because intel_pstate_set_pstate() calling it is also invoked during
the initialization and cleanup of the driver and in those cases it is
not guaranteed to be run on the CPU that is being updated. However,
in the case when intel_pstate_set_pstate() is called by
intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update
the register safely. Moreover, intel_pstate_set_pstate() already
contains code that only is executed if the function is called by
intel_pstate_adjust_busy_pstate() and there is a special argument
passed to it because of that.
To fix the problem at hand, rearrange the code taking the above
observations into account.
First, replace the ->set() callback in struct pstate_funcs with a
->get_val() one that will return the value to be written to the
IA32_PERF_CTL MSR without updating the register.
Second, split intel_pstate_set_pstate() into two functions,
intel_pstate_update_pstate() to be called by
intel_pstate_adjust_busy_pstate() that will contain all of the
intel_pstate_set_pstate() code which only needs to be executed in
that case and will use wrmsrl() to update the MSR (after obtaining
the value to write to it from the ->get_val() callback), and
intel_pstate_set_min_pstate() to be invoked during the
initialization and cleanup that will set the P-state to the
minimum one and will update the MSR using wrmsrl_on_cpu().
Finally, move the code shared between intel_pstate_update_pstate()
and intel_pstate_set_min_pstate() to a new static inline function
intel_pstate_record_pstate() and make them both call it.
Of course, that unifies the handling of the IA32_PERF_CTL MSR writes
between Atom and Core.
Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks)
Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 01:20:02 +03:00
return val | vid ;
2013-12-18 22:32:39 +04:00
}
2015-11-10 04:40:47 +03:00
static int silvermont_get_scaling ( void )
2014-10-13 19:37:43 +04:00
{
u64 value ;
int i ;
2015-11-10 04:40:47 +03:00
/* Defined in Table 35-6 from SDM (Sept 2015) */
static int silvermont_freq_table [ ] = {
83300 , 100000 , 133300 , 116700 , 80000 } ;
2014-10-13 19:37:43 +04:00
rdmsrl ( MSR_FSB_FREQ , value ) ;
2015-11-10 04:40:47 +03:00
i = value & 0x7 ;
WARN_ON ( i > 4 ) ;
2014-10-13 19:37:43 +04:00
2015-11-10 04:40:47 +03:00
return silvermont_freq_table [ i ] ;
}
2014-10-13 19:37:43 +04:00
2015-11-10 04:40:47 +03:00
/*
 * Return the P-state scaling factor selected by the low four bits of
 * MSR_FSB_FREQ.
 *
 * The 4-bit field can hold 0-15 but only encodings 0-8 have table entries.
 * An out-of-range encoding triggers a warning and falls back to 100000
 * (the value core_get_scaling() returns) rather than indexing past the end
 * of the table.
 */
static int airmont_get_scaling(void)
{
	/* Defined in Table 35-10 from SDM (Sept 2015) */
	static const int airmont_freq_table[] = {
		83300, 100000, 133300, 116700, 80000,
		93300, 90000, 88900, 87500 };
	u64 value;
	int i;

	rdmsrl(MSR_FSB_FREQ, value);
	i = value & 0xF;
	if (WARN_ON(i >= ARRAY_SIZE(airmont_freq_table)))
		return 100000;

	return airmont_freq_table[i];
}
2015-11-10 04:40:46 +03:00
static void atom_get_vid ( struct cpudata * cpudata )
2013-12-18 22:32:39 +04:00
{
u64 value ;
2017-02-26 00:55:17 +03:00
rdmsrl ( MSR_ATOM_CORE_VIDS , value ) ;
2014-06-20 18:27:58 +04:00
cpudata - > vid . min = int_tofp ( ( value > > 8 ) & 0x7f ) ;
cpudata - > vid . max = int_tofp ( ( value > > 16 ) & 0x7f ) ;
2013-12-18 22:32:39 +04:00
cpudata - > vid . ratio = div_fp (
cpudata - > vid . max - cpudata - > vid . min ,
int_tofp ( cpudata - > pstate . max_pstate -
cpudata - > pstate . min_pstate ) ) ;
2014-05-08 23:57:23 +04:00
2017-02-26 00:55:17 +03:00
rdmsrl ( MSR_ATOM_CORE_TURBO_VIDS , value ) ;
2014-05-08 23:57:23 +04:00
cpudata - > vid . turbo = value & 0x7f ;
2013-12-18 22:32:39 +04:00
}
2013-10-21 20:20:34 +04:00
static int core_get_min_pstate ( void )
2013-02-06 21:02:13 +04:00
{
u64 value ;
2014-07-18 19:37:19 +04:00
2013-03-20 18:21:10 +04:00
rdmsrl ( MSR_PLATFORM_INFO , value ) ;
2013-02-06 21:02:13 +04:00
return ( value > > 40 ) & 0xFF ;
}
2015-10-15 02:12:00 +03:00
static int core_get_max_pstate_physical ( void )
2013-02-06 21:02:13 +04:00
{
u64 value ;
2014-07-18 19:37:19 +04:00
2013-03-20 18:21:10 +04:00
rdmsrl ( MSR_PLATFORM_INFO , value ) ;
2013-02-06 21:02:13 +04:00
return ( value > > 8 ) & 0xFF ;
}
2017-01-20 02:03:14 +03:00
/*
 * Return the ratio of the currently selected configurable-TDP level.
 *
 * @plat_info: previously read MSR_PLATFORM_INFO value.
 *
 * Returns -ENXIO when neither of bits 34:33 of @plat_info is set, the
 * error code of a failing rdmsrl_safe(), or otherwise the 8-bit ratio.
 */
static int core_get_tdp_ratio(u64 plat_info)
{
	u64 tdp_ctrl;
	u64 tdp_ratio;
	int tdp_msr;
	int level;
	int err;

	/* Check how many TDP levels present */
	if (!(plat_info & 0x600000000))
		return -ENXIO;

	/* Get the TDP level (0, 1, 2) to get ratios */
	err = rdmsrl_safe(MSR_CONFIG_TDP_CONTROL, &tdp_ctrl);
	if (err)
		return err;
	level = tdp_ctrl & 0x03;

	/* TDP MSR are continuous starting at 0x648 */
	tdp_msr = MSR_CONFIG_TDP_NOMINAL + level;
	err = rdmsrl_safe(tdp_msr, &tdp_ratio);
	if (err)
		return err;

	/* For level 1 and 2, bits[23:16] contain the ratio */
	if (level)
		tdp_ratio >>= 16;

	/* ratios are only 8 bits long */
	tdp_ratio &= 0xff;
	pr_debug("tdp_ratio %x\n", (int)tdp_ratio);

	return (int)tdp_ratio;
}
2013-10-21 20:20:34 +04:00
static int core_get_max_pstate ( void )
2013-02-06 21:02:13 +04:00
{
2015-10-15 02:11:59 +03:00
u64 tar ;
u64 plat_info ;
int max_pstate ;
2017-01-20 02:03:14 +03:00
int tdp_ratio ;
2015-10-15 02:11:59 +03:00
int err ;
rdmsrl ( MSR_PLATFORM_INFO , plat_info ) ;
max_pstate = ( plat_info > > 8 ) & 0xFF ;
2017-01-20 02:03:14 +03:00
tdp_ratio = core_get_tdp_ratio ( plat_info ) ;
if ( tdp_ratio < = 0 )
return max_pstate ;
if ( hwp_active ) {
/* Turbo activation ratio is not used on HWP platforms */
return tdp_ratio ;
}
2015-10-15 02:11:59 +03:00
err = rdmsrl_safe ( MSR_TURBO_ACTIVATION_RATIO , & tar ) ;
if ( ! err ) {
2017-01-20 02:03:14 +03:00
int tar_levels ;
2015-10-15 02:11:59 +03:00
/* Do some sanity checking for safety */
2017-01-20 02:03:14 +03:00
tar_levels = tar & 0xff ;
if ( tdp_ratio - 1 = = tar_levels ) {
max_pstate = tar_levels ;
pr_debug ( " max_pstate=TAC %x \n " , max_pstate ) ;
2015-10-15 02:11:59 +03:00
}
}
2014-07-18 19:37:19 +04:00
2015-10-15 02:11:59 +03:00
return max_pstate ;
2013-02-06 21:02:13 +04:00
}
2013-10-21 20:20:34 +04:00
static int core_get_turbo_pstate ( void )
2013-02-06 21:02:13 +04:00
{
u64 value ;
int nont , ret ;
2014-07-18 19:37:19 +04:00
2016-07-07 02:07:55 +03:00
rdmsrl ( MSR_TURBO_RATIO_LIMIT , value ) ;
2013-10-21 20:20:34 +04:00
nont = core_get_max_pstate ( ) ;
2014-07-18 19:37:21 +04:00
ret = ( value ) & 255 ;
2013-02-06 21:02:13 +04:00
if ( ret < = nont )
ret = nont ;
return ret ;
}
2014-10-13 19:37:43 +04:00
/*
 * Return the fixed P-state scaling factor for Core parts; a P-state
 * multiplied by this value gives the corresponding frequency figure.
 */
static inline int core_get_scaling(void)
{
	const int scaling = 100000;

	return scaling;
}
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts
After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with
utilization update callbacks) wrmsrl_on_cpu() cannot be called in the
intel_pstate_adjust_busy_pstate() path as that is executed with
disabled interrupts. However, atom_set_pstate() called from there
via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the
IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in
smp_call_function_single().
The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is
because intel_pstate_set_pstate() calling it is also invoked during
the initialization and cleanup of the driver and in those cases it is
not guaranteed to be run on the CPU that is being updated. However,
in the case when intel_pstate_set_pstate() is called by
intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update
the register safely. Moreover, intel_pstate_set_pstate() already
contains code that only is executed if the function is called by
intel_pstate_adjust_busy_pstate() and there is a special argument
passed to it because of that.
To fix the problem at hand, rearrange the code taking the above
observations into account.
First, replace the ->set() callback in struct pstate_funcs with a
->get_val() one that will return the value to be written to the
IA32_PERF_CTL MSR without updating the register.
Second, split intel_pstate_set_pstate() into two functions,
intel_pstate_update_pstate() to be called by
intel_pstate_adjust_busy_pstate() that will contain all of the
intel_pstate_set_pstate() code which only needs to be executed in
that case and will use wrmsrl() to update the MSR (after obtaining
the value to write to it from the ->get_val() callback), and
intel_pstate_set_min_pstate() to be invoked during the
initialization and cleanup that will set the P-state to the
minimum one and will update the MSR using wrmsrl_on_cpu().
Finally, move the code shared between intel_pstate_update_pstate()
and intel_pstate_set_min_pstate() to a new static inline function
intel_pstate_record_pstate() and make them both call it.
Of course, that unifies the handling of the IA32_PERF_CTL MSR writes
between Atom and Core.
Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks)
Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 01:20:02 +03:00
static u64 core_get_val ( struct cpudata * cpudata , int pstate )
2013-10-21 20:20:34 +04:00
{
u64 val ;
2015-07-29 18:53:10 +03:00
val = ( u64 ) pstate < < 8 ;
cpufreq: intel_pstate: One set of global limits in active mode
In the active mode intel_pstate currently uses two sets of global
limits, each associated with one of the possible scaling_governor
settings in that mode: "powersave" or "performance".
The driver switches over from one of those sets to the other
depending on the scaling_governor setting for the last CPU whose
per-policy cpufreq interface in sysfs was last used to change
parameters exposed in there. That obviously leads to no end of
issues when the scaling_governor settings differ between CPUs.
The most recent issue was introduced by commit a240c4aa5d0f (cpufreq:
intel_pstate: Do not reinit performance limits in ->setpolicy)
that eliminated the reinitialization of "performance" limits in
intel_pstate_set_policy() preventing the max limit from being set
to anything below 100, among other things.
Namely, an undesirable side effect of commit a240c4aa5d0f is that
now, after setting scaling_governor to "performance" in the active
mode, the per-policy limits for the CPU in question go to the highest
level and stay there even when it is switched back to "powersave"
later.
As it turns out, some distributions set scaling_governor to
"performance" temporarily for all CPUs to speed-up system
initialization, so that change causes them to misbehave later.
To fix that, get rid of the performance/powersave global limits
split and use just one set of global limits for everything.
From the user's persepctive, after this modification, when
scaling_governor is switched from "performance" to "powersave"
or the other way around on one CPU, the limits settings (ie. the
global max/min_perf_pct and per-policy scaling_max/min_freq for
any CPUs) will not change. Still, switching from "performance"
to "powersave" or the other way around changes the way in which
P-states are selected and in particular "performance" causes the
driver to always request the highest P-state it is allowed to ask
for for the given CPU.
Fixes: a240c4aa5d0f (cpufreq: intel_pstate: Do not reinit performance limits in ->setpolicy)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-18 02:57:39 +03:00
if ( global . no_turbo & & ! global . turbo_disabled )
2013-10-21 20:20:34 +04:00
val | = ( u64 ) 1 < < 32 ;
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts
After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with
utilization update callbacks) wrmsrl_on_cpu() cannot be called in the
intel_pstate_adjust_busy_pstate() path as that is executed with
disabled interrupts. However, atom_set_pstate() called from there
via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the
IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in
smp_call_function_single().
The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is
because intel_pstate_set_pstate() calling it is also invoked during
the initialization and cleanup of the driver and in those cases it is
not guaranteed to be run on the CPU that is being updated. However,
in the case when intel_pstate_set_pstate() is called by
intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update
the register safely. Moreover, intel_pstate_set_pstate() already
contains code that only is executed if the function is called by
intel_pstate_adjust_busy_pstate() and there is a special argument
passed to it because of that.
To fix the problem at hand, rearrange the code taking the above
observations into account.
First, replace the ->set() callback in struct pstate_funcs with a
->get_val() one that will return the value to be written to the
IA32_PERF_CTL MSR without updating the register.
Second, split intel_pstate_set_pstate() into two functions,
intel_pstate_update_pstate() to be called by
intel_pstate_adjust_busy_pstate() that will contain all of the
intel_pstate_set_pstate() code which only needs to be executed in
that case and will use wrmsrl() to update the MSR (after obtaining
the value to write to it from the ->get_val() callback), and
intel_pstate_set_min_pstate() to be invoked during the
initialization and cleanup that will set the P-state to the
minimum one and will update the MSR using wrmsrl_on_cpu().
Finally, move the code shared between intel_pstate_update_pstate()
and intel_pstate_set_min_pstate() to a new static inline function
intel_pstate_record_pstate() and make them both call it.
Of course, that unifies the handling of the IA32_PERF_CTL MSR writes
between Atom and Core.
Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks)
Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 01:20:02 +03:00
return val ;
2013-10-21 20:20:34 +04:00
}
2017-07-14 01:03:51 +03:00
/*
 * Return the APERF/MPERF shift for KNL; the caller stores it in
 * cpu->aperf_mperf_shift.
 */
static int knl_get_aperf_mperf_shift(void)
{
	const int shift = 10;

	return shift;
}
2015-04-10 20:22:18 +03:00
static int knl_get_turbo_pstate ( void )
{
u64 value ;
int nont , ret ;
2016-07-07 02:07:55 +03:00
rdmsrl ( MSR_TURBO_RATIO_LIMIT , value ) ;
2015-04-10 20:22:18 +03:00
nont = core_get_max_pstate ( ) ;
ret = ( ( ( value ) > > 8 ) & 0xFF ) ;
if ( ret < = nont )
ret = nont ;
return ret ;
}
2017-03-28 01:24:26 +03:00
static int intel_pstate_get_base_pstate ( struct cpudata * cpu )
2013-02-06 21:02:13 +04:00
{
2017-03-28 01:24:26 +03:00
return global . no_turbo | | global . turbo_disabled ?
cpu - > pstate . max_pstate : cpu - > pstate . turbo_pstate ;
2013-02-06 21:02:13 +04:00
}
2016-10-19 03:57:22 +03:00
static void intel_pstate_set_pstate ( struct cpudata * cpu , int pstate )
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts
After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with
utilization update callbacks) wrmsrl_on_cpu() cannot be called in the
intel_pstate_adjust_busy_pstate() path as that is executed with
disabled interrupts. However, atom_set_pstate() called from there
via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the
IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in
smp_call_function_single().
The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is
because intel_pstate_set_pstate() calling it is also invoked during
the initialization and cleanup of the driver and in those cases it is
not guaranteed to be run on the CPU that is being updated. However,
in the case when intel_pstate_set_pstate() is called by
intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update
the register safely. Moreover, intel_pstate_set_pstate() already
contains code that only is executed if the function is called by
intel_pstate_adjust_busy_pstate() and there is a special argument
passed to it because of that.
To fix the problem at hand, rearrange the code taking the above
observations into account.
First, replace the ->set() callback in struct pstate_funcs with a
->get_val() one that will return the value to be written to the
IA32_PERF_CTL MSR without updating the register.
Second, split intel_pstate_set_pstate() into two functions,
intel_pstate_update_pstate() to be called by
intel_pstate_adjust_busy_pstate() that will contain all of the
intel_pstate_set_pstate() code which only needs to be executed in
that case and will use wrmsrl() to update the MSR (after obtaining
the value to write to it from the ->get_val() callback), and
intel_pstate_set_min_pstate() to be invoked during the
initialization and cleanup that will set the P-state to the
minimum one and will update the MSR using wrmsrl_on_cpu().
Finally, move the code shared between intel_pstate_update_pstate()
and intel_pstate_set_min_pstate() to a new static inline function
intel_pstate_record_pstate() and make them both call it.
Of course, that unifies the handling of the IA32_PERF_CTL MSR writes
between Atom and Core.
Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks)
Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 01:20:02 +03:00
{
2016-07-19 16:10:37 +03:00
trace_cpu_frequency ( pstate * cpu - > pstate . scaling , cpu - > cpu ) ;
cpu - > pstate . current_pstate = pstate ;
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts
After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with
utilization update callbacks) wrmsrl_on_cpu() cannot be called in the
intel_pstate_adjust_busy_pstate() path as that is executed with
disabled interrupts. However, atom_set_pstate() called from there
via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the
IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in
smp_call_function_single().
The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is
because intel_pstate_set_pstate() calling it is also invoked during
the initialization and cleanup of the driver and in those cases it is
not guaranteed to be run on the CPU that is being updated. However,
in the case when intel_pstate_set_pstate() is called by
intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update
the register safely. Moreover, intel_pstate_set_pstate() already
contains code that only is executed if the function is called by
intel_pstate_adjust_busy_pstate() and there is a special argument
passed to it because of that.
To fix the problem at hand, rearrange the code taking the above
observations into account.
First, replace the ->set() callback in struct pstate_funcs with a
->get_val() one that will return the value to be written to the
IA32_PERF_CTL MSR without updating the register.
Second, split intel_pstate_set_pstate() into two functions,
intel_pstate_update_pstate() to be called by
intel_pstate_adjust_busy_pstate() that will contain all of the
intel_pstate_set_pstate() code which only needs to be executed in
that case and will use wrmsrl() to update the MSR (after obtaining
the value to write to it from the ->get_val() callback), and
intel_pstate_set_min_pstate() to be invoked during the
initialization and cleanup that will set the P-state to the
minimum one and will update the MSR using wrmsrl_on_cpu().
Finally, move the code shared between intel_pstate_update_pstate()
and intel_pstate_set_min_pstate() to a new static inline function
intel_pstate_record_pstate() and make them both call it.
Of course, that unifies the handling of the IA32_PERF_CTL MSR writes
between Atom and Core.
Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks)
Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 01:20:02 +03:00
/*
* Generally , there is no guarantee that this code will always run on
* the CPU being updated , so force the register update to run on the
* right CPU .
*/
wrmsrl_on_cpu ( cpu - > cpu , MSR_IA32_PERF_CTL ,
pstate_funcs . get_val ( cpu , pstate ) ) ;
2013-02-06 21:02:13 +04:00
}
2016-10-19 03:57:22 +03:00
static void intel_pstate_set_min_pstate ( struct cpudata * cpu )
{
intel_pstate_set_pstate ( cpu , cpu - > pstate . min_pstate ) ;
}
static void intel_pstate_max_within_limits ( struct cpudata * cpu )
{
2017-03-28 01:24:26 +03:00
int pstate ;
2016-10-19 03:57:22 +03:00
update_turbo_state ( ) ;
2017-03-28 01:24:26 +03:00
pstate = intel_pstate_get_base_pstate ( cpu ) ;
2017-06-13 02:30:27 +03:00
pstate = max ( cpu - > pstate . min_pstate , cpu - > max_perf_ratio ) ;
2017-03-28 01:24:26 +03:00
intel_pstate_set_pstate ( cpu , pstate ) ;
2016-10-19 03:57:22 +03:00
}
2013-02-06 21:02:13 +04:00
static void intel_pstate_get_cpu_pstates ( struct cpudata * cpu )
{
2013-10-21 20:20:34 +04:00
cpu - > pstate . min_pstate = pstate_funcs . get_min ( ) ;
cpu - > pstate . max_pstate = pstate_funcs . get_max ( ) ;
2015-10-15 02:12:00 +03:00
cpu - > pstate . max_pstate_physical = pstate_funcs . get_max_physical ( ) ;
2013-10-21 20:20:34 +04:00
cpu - > pstate . turbo_pstate = pstate_funcs . get_turbo ( ) ;
2014-10-13 19:37:43 +04:00
cpu - > pstate . scaling = pstate_funcs . get_scaling ( ) ;
2016-11-18 01:34:17 +03:00
cpu - > pstate . max_freq = cpu - > pstate . max_pstate * cpu - > pstate . scaling ;
cpu - > pstate . turbo_freq = cpu - > pstate . turbo_pstate * cpu - > pstate . scaling ;
2013-02-06 21:02:13 +04:00
2017-07-14 01:03:51 +03:00
if ( pstate_funcs . get_aperf_mperf_shift )
cpu - > aperf_mperf_shift = pstate_funcs . get_aperf_mperf_shift ( ) ;
2013-12-18 22:32:39 +04:00
if ( pstate_funcs . get_vid )
pstate_funcs . get_vid ( cpu ) ;
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts
After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with
utilization update callbacks) wrmsrl_on_cpu() cannot be called in the
intel_pstate_adjust_busy_pstate() path as that is executed with
disabled interrupts. However, atom_set_pstate() called from there
via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the
IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in
smp_call_function_single().
The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is
because intel_pstate_set_pstate() calling it is also invoked during
the initialization and cleanup of the driver and in those cases it is
not guaranteed to be run on the CPU that is being updated. However,
in the case when intel_pstate_set_pstate() is called by
intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update
the register safely. Moreover, intel_pstate_set_pstate() already
contains code that only is executed if the function is called by
intel_pstate_adjust_busy_pstate() and there is a special argument
passed to it because of that.
To fix the problem at hand, rearrange the code taking the above
observations into account.
First, replace the ->set() callback in struct pstate_funcs with a
->get_val() one that will return the value to be written to the
IA32_PERF_CTL MSR without updating the register.
Second, split intel_pstate_set_pstate() into two functions,
intel_pstate_update_pstate() to be called by
intel_pstate_adjust_busy_pstate() that will contain all of the
intel_pstate_set_pstate() code which only needs to be executed in
that case and will use wrmsrl() to update the MSR (after obtaining
the value to write to it from the ->get_val() callback), and
intel_pstate_set_min_pstate() to be invoked during the
initialization and cleanup that will set the P-state to the
minimum one and will update the MSR using wrmsrl_on_cpu().
Finally, move the code shared between intel_pstate_update_pstate()
and intel_pstate_set_min_pstate() to a new static inline function
intel_pstate_record_pstate() and make them both call it.
Of course, that unifies the handling of the IA32_PERF_CTL MSR writes
between Atom and Core.
Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks)
Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 01:20:02 +03:00
intel_pstate_set_min_pstate ( cpu ) ;
2013-02-06 21:02:13 +04:00
}
2016-05-11 20:09:12 +03:00
static inline void intel_pstate_calc_avg_perf ( struct cpudata * cpu )
2013-02-06 21:02:13 +04:00
{
2014-04-29 21:53:49 +04:00
struct sample * sample = & cpu - > sample ;
2014-02-25 22:35:37 +04:00
2016-05-11 20:09:12 +03:00
sample - > core_avg_perf = div_ext_fp ( sample - > aperf , sample - > mperf ) ;
2013-02-06 21:02:13 +04:00
}
2016-03-11 01:45:19 +03:00
/*
 * Take a new APERF/MPERF/TSC sample for @cpu, stamped with @time.
 *
 * The three counters are read with local interrupts disabled.  If MPERF or
 * the TSC equals its previous reading, nothing is recorded and false is
 * returned.  Otherwise cpu->sample receives the deltas against the previous
 * readings and the raw readings become the new baseline.
 *
 * Returns true only when a previous sample time existed, in which case the
 * average performance has also been recomputed.
 */
static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time)
{
	unsigned long flags;
	u64 aperf, mperf, tsc;
	bool unchanged;

	local_irq_save(flags);
	rdmsrl(MSR_IA32_APERF, aperf);
	rdmsrl(MSR_IA32_MPERF, mperf);
	tsc = rdtsc();
	unchanged = cpu->prev_mperf == mperf || cpu->prev_tsc == tsc;
	local_irq_restore(flags);

	if (unchanged)
		return false;

	cpu->last_sample_time = cpu->sample.time;
	cpu->sample.time = time;

	/* The sample holds deltas since the previous readings. */
	cpu->sample.aperf = aperf - cpu->prev_aperf;
	cpu->sample.mperf = mperf - cpu->prev_mperf;
	cpu->sample.tsc = tsc - cpu->prev_tsc;

	cpu->prev_aperf = aperf;
	cpu->prev_mperf = mperf;
	cpu->prev_tsc = tsc;

	/*
	 * First time this function is invoked in a given cycle, all of the
	 * previous sample data fields are equal to zero or stale and they
	 * must be populated with meaningful numbers for things to work, so
	 * assume that sample.time will always be reset before setting the
	 * utilization update hook and make the caller skip the sample then.
	 */
	if (!cpu->last_sample_time)
		return false;

	intel_pstate_calc_avg_perf(cpu);

	return true;
}
2016-03-06 10:34:06 +03:00
/*
 * Return the core_avg_perf value of the current sample of @cpu multiplied by
 * cpu_khz, computed with mul_ext_fp().
 */
static inline int32_t get_avg_frequency ( struct cpudata * cpu )
{
2017-08-09 00:05:12 +03:00
return mul_ext_fp ( cpu - > sample . core_avg_perf , cpu_khz ) ;
2016-03-06 10:34:06 +03:00
}
2016-04-22 21:46:09 +03:00
/*
 * Return pstate.max_pstate_physical of @cpu scaled by the core_avg_perf value
 * of its current sample, computed with mul_ext_fp().
 */
static inline int32_t get_avg_pstate ( struct cpudata * cpu )
{
2016-05-11 20:10:42 +03:00
return mul_ext_fp ( cpu - > pstate . max_pstate_physical ,
cpu - > sample . core_avg_perf ) ;
2016-04-22 21:46:09 +03:00
}
2017-08-10 02:09:16 +03:00
/*
 * Compute the P-state to request for @cpu from its current sample.
 *
 * The busy fraction is the mperf field of the sample (shifted left by
 * aperf_mperf_shift) divided by its tsc field, using div_fp().  The pending
 * iowait_boost value replaces the busy fraction when it is larger, and
 * iowait_boost itself is halved on every call.  busy_scaled is recorded as
 * the resulting fraction times 100.
 *
 * The target starts from pstate.max_pstate when global.no_turbo or
 * global.turbo_disabled is set and from pstate.turbo_pstate otherwise.  It is
 * raised by a quarter of itself, multiplied by the busy fraction with
 * mul_fp() and bounded from below by pstate.min_pstate.  Finally, if
 * get_avg_pstate() is above the target, half of the difference is added.
 */
static inline int32_t get_target_pstate ( struct cpudata * cpu )
2015-12-04 19:40:32 +03:00
{
struct sample * sample = & cpu - > sample ;
2016-09-14 03:28:13 +03:00
int32_t busy_frac , boost ;
2016-10-06 15:07:51 +03:00
int target , avg_pstate ;
2015-12-04 19:40:32 +03:00
2017-07-14 01:03:51 +03:00
busy_frac = div_fp ( sample - > mperf < < cpu - > aperf_mperf_shift ,
sample - > tsc ) ;
2015-12-04 19:40:35 +03:00
2016-09-14 03:28:13 +03:00
boost = cpu - > iowait_boost ;
cpu - > iowait_boost > > = 1 ;
2015-12-04 19:40:35 +03:00
2016-09-14 03:28:13 +03:00
if ( busy_frac < boost )
busy_frac = boost ;
2015-12-04 19:40:35 +03:00
2016-09-14 03:28:13 +03:00
sample - > busy_scaled = busy_frac * 100 ;
2016-10-06 15:07:51 +03:00
cpufreq: intel_pstate: One set of global limits in active mode
In the active mode intel_pstate currently uses two sets of global
limits, each associated with one of the possible scaling_governor
settings in that mode: "powersave" or "performance".
The driver switches over from one of those sets to the other
depending on the scaling_governor setting for the last CPU whose
per-policy cpufreq interface in sysfs was last used to change
parameters exposed in there. That obviously leads to no end of
issues when the scaling_governor settings differ between CPUs.
The most recent issue was introduced by commit a240c4aa5d0f (cpufreq:
intel_pstate: Do not reinit performance limits in ->setpolicy)
that eliminated the reinitialization of "performance" limits in
intel_pstate_set_policy() preventing the max limit from being set
to anything below 100, among other things.
Namely, an undesirable side effect of commit a240c4aa5d0f is that
now, after setting scaling_governor to "performance" in the active
mode, the per-policy limits for the CPU in question go to the highest
level and stay there even when it is switched back to "powersave"
later.
As it turns out, some distributions set scaling_governor to
"performance" temporarily for all CPUs to speed-up system
initialization, so that change causes them to misbehave later.
To fix that, get rid of the performance/powersave global limits
split and use just one set of global limits for everything.
From the user's persepctive, after this modification, when
scaling_governor is switched from "performance" to "powersave"
or the other way around on one CPU, the limits settings (ie. the
global max/min_perf_pct and per-policy scaling_max/min_freq for
any CPUs) will not change. Still, switching from "performance"
to "powersave" or the other way around changes the way in which
P-states are selected and in particular "performance" causes the
driver to always request the highest P-state it is allowed to ask
for for the given CPU.
Fixes: a240c4aa5d0f (cpufreq: intel_pstate: Do not reinit performance limits in ->setpolicy)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-18 02:57:39 +03:00
target = global . no_turbo | | global . turbo_disabled ?
2016-10-06 15:07:51 +03:00
cpu - > pstate . max_pstate : cpu - > pstate . turbo_pstate ;
target + = target > > 2 ;
target = mul_fp ( target , busy_frac ) ;
if ( target < cpu - > pstate . min_pstate )
target = cpu - > pstate . min_pstate ;
/*
* If the average P - state during the previous cycle was higher than the
* current target , add 50 % of the difference to the target to reduce
* possible performance oscillations and offset possible performance
* loss related to moving the workload from one CPU to another within
* a package / module .
*/
avg_pstate = get_avg_pstate ( cpu ) ;
if ( avg_pstate > target )
target + = ( avg_pstate - target ) > > 1 ;
return target ;
2015-12-04 19:40:32 +03:00
}
2016-11-18 01:34:17 +03:00
/*
 * Clamp @pstate to the range currently allowed for @cpu and return the
 * result.
 *
 * The lower bound is the larger of pstate.min_pstate and min_perf_ratio.  The
 * upper bound is the larger of that lower bound and max_perf_ratio, so the
 * upper bound is never below the lower one.
 *
 * NOTE(review): max_pstate is initialized from intel_pstate_get_base_pstate()
 * but is reassigned before it is ever read, so that initial value is unused
 * in this function - confirm whether the call can be dropped.
 */
static int intel_pstate_prepare_request ( struct cpudata * cpu , int pstate )
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts
After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with
utilization update callbacks) wrmsrl_on_cpu() cannot be called in the
intel_pstate_adjust_busy_pstate() path as that is executed with
disabled interrupts. However, atom_set_pstate() called from there
via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the
IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in
smp_call_function_single().
The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is
because intel_pstate_set_pstate() calling it is also invoked during
the initialization and cleanup of the driver and in those cases it is
not guaranteed to be run on the CPU that is being updated. However,
in the case when intel_pstate_set_pstate() is called by
intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update
the register safely. Moreover, intel_pstate_set_pstate() already
contains code that only is executed if the function is called by
intel_pstate_adjust_busy_pstate() and there is a special argument
passed to it because of that.
To fix the problem at hand, rearrange the code taking the above
observations into account.
First, replace the ->set() callback in struct pstate_funcs with a
->get_val() one that will return the value to be written to the
IA32_PERF_CTL MSR without updating the register.
Second, split intel_pstate_set_pstate() into two functions,
intel_pstate_update_pstate() to be called by
intel_pstate_adjust_busy_pstate() that will contain all of the
intel_pstate_set_pstate() code which only needs to be executed in
that case and will use wrmsrl() to update the MSR (after obtaining
the value to write to it from the ->get_val() callback), and
intel_pstate_set_min_pstate() to be invoked during the
initialization and cleanup that will set the P-state to the
minimum one and will update the MSR using wrmsrl_on_cpu().
Finally, move the code shared between intel_pstate_update_pstate()
and intel_pstate_set_min_pstate() to a new static inline function
intel_pstate_record_pstate() and make them both call it.
Of course, that unifies the handling of the IA32_PERF_CTL MSR writes
between Atom and Core.
Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks)
Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 01:20:02 +03:00
{
2017-03-28 01:24:26 +03:00
int max_pstate = intel_pstate_get_base_pstate ( cpu ) ;
int min_pstate ;
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts
After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with
utilization update callbacks) wrmsrl_on_cpu() cannot be called in the
intel_pstate_adjust_busy_pstate() path as that is executed with
disabled interrupts. However, atom_set_pstate() called from there
via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the
IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in
smp_call_function_single().
The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is
because intel_pstate_set_pstate() calling it is also invoked during
the initialization and cleanup of the driver and in those cases it is
not guaranteed to be run on the CPU that is being updated. However,
in the case when intel_pstate_set_pstate() is called by
intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update
the register safely. Moreover, intel_pstate_set_pstate() already
contains code that only is executed if the function is called by
intel_pstate_adjust_busy_pstate() and there is a special argument
passed to it because of that.
To fix the problem at hand, rearrange the code taking the above
observations into account.
First, replace the ->set() callback in struct pstate_funcs with a
->get_val() one that will return the value to be written to the
IA32_PERF_CTL MSR without updating the register.
Second, split intel_pstate_set_pstate() into two functions,
intel_pstate_update_pstate() to be called by
intel_pstate_adjust_busy_pstate() that will contain all of the
intel_pstate_set_pstate() code which only needs to be executed in
that case and will use wrmsrl() to update the MSR (after obtaining
the value to write to it from the ->get_val() callback), and
intel_pstate_set_min_pstate() to be invoked during the
initialization and cleanup that will set the P-state to the
minimum one and will update the MSR using wrmsrl_on_cpu().
Finally, move the code shared between intel_pstate_update_pstate()
and intel_pstate_set_min_pstate() to a new static inline function
intel_pstate_record_pstate() and make them both call it.
Of course, that unifies the handling of the IA32_PERF_CTL MSR writes
between Atom and Core.
Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks)
Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 01:20:02 +03:00
2017-06-13 02:30:27 +03:00
min_pstate = max ( cpu - > pstate . min_pstate , cpu - > min_perf_ratio ) ;
max_pstate = max ( min_pstate , cpu - > max_perf_ratio ) ;
2017-03-28 01:24:26 +03:00
return clamp_t ( int , pstate , min_pstate , max_pstate ) ;
2016-11-18 01:34:17 +03:00
}
/*
 * Request @pstate for @cpu.
 *
 * Nothing is done when @pstate already equals pstate.current_pstate.
 * Otherwise current_pstate is updated and the value returned by
 * pstate_funcs.get_val() for @pstate is written to MSR_IA32_PERF_CTL with
 * wrmsrl().
 */
static void intel_pstate_update_pstate ( struct cpudata * cpu , int pstate )
{
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts
After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with
utilization update callbacks) wrmsrl_on_cpu() cannot be called in the
intel_pstate_adjust_busy_pstate() path as that is executed with
disabled interrupts. However, atom_set_pstate() called from there
via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the
IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in
smp_call_function_single().
The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is
because intel_pstate_set_pstate() calling it is also invoked during
the initialization and cleanup of the driver and in those cases it is
not guaranteed to be run on the CPU that is being updated. However,
in the case when intel_pstate_set_pstate() is called by
intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update
the register safely. Moreover, intel_pstate_set_pstate() already
contains code that only is executed if the function is called by
intel_pstate_adjust_busy_pstate() and there is a special argument
passed to it because of that.
To fix the problem at hand, rearrange the code taking the above
observations into account.
First, replace the ->set() callback in struct pstate_funcs with a
->get_val() one that will return the value to be written to the
IA32_PERF_CTL MSR without updating the register.
Second, split intel_pstate_set_pstate() into two functions,
intel_pstate_update_pstate() to be called by
intel_pstate_adjust_busy_pstate() that will contain all of the
intel_pstate_set_pstate() code which only needs to be executed in
that case and will use wrmsrl() to update the MSR (after obtaining
the value to write to it from the ->get_val() callback), and
intel_pstate_set_min_pstate() to be invoked during the
initialization and cleanup that will set the P-state to the
minimum one and will update the MSR using wrmsrl_on_cpu().
Finally, move the code shared between intel_pstate_update_pstate()
and intel_pstate_set_min_pstate() to a new static inline function
intel_pstate_record_pstate() and make them both call it.
Of course, that unifies the handling of the IA32_PERF_CTL MSR writes
between Atom and Core.
Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks)
Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 01:20:02 +03:00
if ( pstate = = cpu - > pstate . current_pstate )
return ;
2016-07-19 16:10:37 +03:00
cpu - > pstate . current_pstate = pstate ;
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts
After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with
utilization update callbacks) wrmsrl_on_cpu() cannot be called in the
intel_pstate_adjust_busy_pstate() path as that is executed with
disabled interrupts. However, atom_set_pstate() called from there
via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the
IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in
smp_call_function_single().
The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is
because intel_pstate_set_pstate() calling it is also invoked during
the initialization and cleanup of the driver and in those cases it is
not guaranteed to be run on the CPU that is being updated. However,
in the case when intel_pstate_set_pstate() is called by
intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update
the register safely. Moreover, intel_pstate_set_pstate() already
contains code that only is executed if the function is called by
intel_pstate_adjust_busy_pstate() and there is a special argument
passed to it because of that.
To fix the problem at hand, rearrange the code taking the above
observations into account.
First, replace the ->set() callback in struct pstate_funcs with a
->get_val() one that will return the value to be written to the
IA32_PERF_CTL MSR without updating the register.
Second, split intel_pstate_set_pstate() into two functions,
intel_pstate_update_pstate() to be called by
intel_pstate_adjust_busy_pstate() that will contain all of the
intel_pstate_set_pstate() code which only needs to be executed in
that case and will use wrmsrl() to update the MSR (after obtaining
the value to write to it from the ->get_val() callback), and
intel_pstate_set_min_pstate() to be invoked during the
initialization and cleanup that will set the P-state to the
minimum one and will update the MSR using wrmsrl_on_cpu().
Finally, move the code shared between intel_pstate_update_pstate()
and intel_pstate_set_min_pstate() to a new static inline function
intel_pstate_record_pstate() and make them both call it.
Of course, that unifies the handling of the IA32_PERF_CTL MSR writes
between Atom and Core.
Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks)
Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 01:20:02 +03:00
wrmsrl ( MSR_IA32_PERF_CTL , pstate_funcs . get_val ( cpu , pstate ) ) ;
}
2017-08-10 02:08:56 +03:00
/*
 * Pick a new P-state for @cpu from its current sample and apply it.
 *
 * After update_turbo_state(), the value from get_target_pstate() is passed
 * through intel_pstate_prepare_request(), reported with
 * trace_cpu_frequency() (multiplied by pstate.scaling) and handed to
 * intel_pstate_update_pstate().  A pstate_sample trace event is then emitted
 * with the P-state seen on entry, the current P-state, the sample counters,
 * get_avg_frequency() and the iowait boost.
 */
static void intel_pstate_adjust_pstate ( struct cpudata * cpu )
2013-02-06 21:02:13 +04:00
{
2017-03-28 01:17:10 +03:00
int from = cpu - > pstate . current_pstate ;
2015-04-12 07:10:26 +03:00
struct sample * sample ;
2017-08-10 02:08:56 +03:00
int target_pstate ;
2015-04-12 07:10:26 +03:00
2016-11-18 01:34:17 +03:00
update_turbo_state ( ) ;
2017-08-10 02:09:16 +03:00
target_pstate = get_target_pstate ( cpu ) ;
2017-03-04 01:51:31 +03:00
target_pstate = intel_pstate_prepare_request ( cpu , target_pstate ) ;
trace_cpu_frequency ( target_pstate * cpu - > pstate . scaling , cpu - > cpu ) ;
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts
After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with
utilization update callbacks) wrmsrl_on_cpu() cannot be called in the
intel_pstate_adjust_busy_pstate() path as that is executed with
disabled interrupts. However, atom_set_pstate() called from there
via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the
IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in
smp_call_function_single().
The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is
because intel_pstate_set_pstate() calling it is also invoked during
the initialization and cleanup of the driver and in those cases it is
not guaranteed to be run on the CPU that is being updated. However,
in the case when intel_pstate_set_pstate() is called by
intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update
the register safely. Moreover, intel_pstate_set_pstate() already
contains code that only is executed if the function is called by
intel_pstate_adjust_busy_pstate() and there is a special argument
passed to it because of that.
To fix the problem at hand, rearrange the code taking the above
observations into account.
First, replace the ->set() callback in struct pstate_funcs with a
->get_val() one that will return the value to be written to the
IA32_PERF_CTL MSR without updating the register.
Second, split intel_pstate_set_pstate() into two functions,
intel_pstate_update_pstate() to be called by
intel_pstate_adjust_busy_pstate() that will contain all of the
intel_pstate_set_pstate() code which only needs to be executed in
that case and will use wrmsrl() to update the MSR (after obtaining
the value to write to it from the ->get_val() callback), and
intel_pstate_set_min_pstate() to be invoked during the
initialization and cleanup that will set the P-state to the
minimum one and will update the MSR using wrmsrl_on_cpu().
Finally, move the code shared between intel_pstate_update_pstate()
and intel_pstate_set_min_pstate() to a new static inline function
intel_pstate_record_pstate() and make them both call it.
Of course, that unifies the handling of the IA32_PERF_CTL MSR writes
between Atom and Core.
Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks)
Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 01:20:02 +03:00
intel_pstate_update_pstate ( cpu , target_pstate ) ;
2015-04-12 07:10:26 +03:00
sample = & cpu - > sample ;
2016-05-11 20:09:12 +03:00
trace_pstate_sample ( mul_ext_fp ( 100 , sample - > core_avg_perf ) ,
2015-12-04 19:40:30 +03:00
fp_toint ( sample - > busy_scaled ) ,
2015-04-12 07:10:26 +03:00
from ,
cpu - > pstate . current_pstate ,
sample - > mperf ,
sample - > aperf ,
sample - > tsc ,
2016-09-14 03:41:33 +03:00
get_avg_frequency ( cpu ) ,
fp_toint ( cpu - > iowait_boost * 100 ) ) ;
2013-02-06 21:02:13 +04:00
}
2016-02-05 03:45:30 +03:00
/*
 * Utilization update callback.  @data is the update_util member embedded in
 * a struct cpudata, @time is the timestamp of the update and @flags carries
 * the SCHED_CPUFREQ_* bits.
 *
 * Calls made on a CPU other than cpu->cpu are ignored.
 *
 * With SCHED_CPUFREQ_IOWAIT set, iowait_boost is set to int_tofp(1) and
 * last_update to @time; the function then returns early if fp_toint() of the
 * last busy_scaled value equals 100 and otherwise goes straight to sampling,
 * bypassing the sampling interval check.
 *
 * Without that flag, a pending iowait_boost is cleared when more than
 * TICK_NSEC has passed since last_update, last_update is set to @time, and
 * sampling only happens when the time since the previous sample is not below
 * INTEL_PSTATE_SAMPLING_INTERVAL.
 *
 * The P-state is adjusted only when intel_pstate_sample() returns true.
 */
static void intel_pstate_update_util ( struct update_util_data * data , u64 time ,
2016-08-16 23:14:55 +03:00
unsigned int flags )
2013-02-06 21:02:13 +04:00
{
2016-02-05 03:45:30 +03:00
struct cpudata * cpu = container_of ( data , struct cpudata , update_util ) ;
2016-09-14 03:28:13 +03:00
u64 delta_ns ;
sched: cpufreq: Allow remote cpufreq callbacks
With Android UI and benchmarks the latency of cpufreq response to
certain scheduling events can become very critical. Currently, callbacks
into cpufreq governors are only made from the scheduler if the target
CPU of the event is the same as the current CPU. This means there are
certain situations where a target CPU may not run the cpufreq governor
for some time.
One testcase to show this behavior is where a task starts running on
CPU0, then a new task is also spawned on CPU0 by a task on CPU1. If the
system is configured such that the new tasks should receive maximum
demand initially, this should result in CPU0 increasing frequency
immediately. But because of the above mentioned limitation though, this
does not occur.
This patch updates the scheduler core to call the cpufreq callbacks for
remote CPUs as well.
The schedutil, ondemand and conservative governors are updated to
process cpufreq utilization update hooks called for remote CPUs where
the remote CPU is managed by the cpufreq policy of the local CPU.
The intel_pstate driver is updated to always reject remote callbacks.
This is tested with couple of usecases (Android: hackbench, recentfling,
galleryfling, vellamo, Ubuntu: hackbench) on ARM hikey board (64 bit
octa-core, single policy). Only galleryfling showed minor improvements,
while others didn't had much deviation.
The reason being that this patch only targets a corner case, where
following are required to be true to improve performance and that
doesn't happen too often with these tests:
- Task is migrated to another CPU.
- The task has high demand, and should take the target CPU to higher
OPPs.
- And the target CPU doesn't call into the cpufreq governor until the
next tick.
Based on initial work from Steve Muckle.
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Saravana Kannan <skannan@codeaurora.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-07-28 09:46:38 +03:00
/* Don't allow remote callbacks */
if ( smp_processor_id ( ) ! = cpu - > cpu )
return ;
2017-03-28 01:15:37 +03:00
if ( flags & SCHED_CPUFREQ_IOWAIT ) {
cpu - > iowait_boost = int_tofp ( 1 ) ;
2017-08-04 05:03:14 +03:00
cpu - > last_update = time ;
/*
* The last time the busy was 100 % so P - state was max anyway
* so avoid overhead of computation .
*/
if ( fp_toint ( cpu - > sample . busy_scaled ) = = 100 )
return ;
goto set_pstate ;
2017-03-28 01:15:37 +03:00
} else if ( cpu - > iowait_boost ) {
/* Clear iowait_boost if the CPU may have been idle. */
delta_ns = time - cpu - > last_update ;
if ( delta_ns > TICK_NSEC )
cpu - > iowait_boost = 0 ;
2016-09-14 03:28:13 +03:00
}
2017-03-28 01:15:37 +03:00
cpu - > last_update = time ;
2016-09-14 03:28:13 +03:00
delta_ns = time - cpu - > sample . time ;
2017-08-10 02:09:16 +03:00
if ( ( s64 ) delta_ns < INTEL_PSTATE_SAMPLING_INTERVAL )
2017-03-28 01:15:37 +03:00
return ;
2016-03-11 01:45:19 +03:00
2017-08-04 05:03:14 +03:00
set_pstate :
2017-08-10 02:08:56 +03:00
if ( intel_pstate_sample ( cpu , time ) )
intel_pstate_adjust_pstate ( cpu ) ;
2017-03-28 01:17:10 +03:00
}
2017-03-28 01:15:37 +03:00
2017-03-28 01:19:03 +03:00
/*
 * P-state callback table built entirely from the core_* helpers.
 *
 * NOTE(review): unlike the silvermont, airmont and knl pstate_funcs tables in
 * this file, this one is not const-qualified - confirm whether that is
 * intentional (for instance because it is passed to a non-const parameter
 * somewhere) before changing it.
 */
static struct pstate_funcs core_funcs = {
. get_max = core_get_max_pstate ,
. get_max_physical = core_get_max_pstate_physical ,
. get_min = core_get_min_pstate ,
. get_turbo = core_get_turbo_pstate ,
. get_scaling = core_get_scaling ,
. get_val = core_get_val ,
2017-03-28 01:18:02 +03:00
} ;
2017-03-28 01:19:03 +03:00
/*
 * P-state callback table using the atom_* helpers together with
 * silvermont_get_scaling().  atom_get_max_pstate serves as both get_max and
 * get_max_physical, and a get_vid callback is provided.
 */
static const struct pstate_funcs silvermont_funcs = {
. get_max = atom_get_max_pstate ,
. get_max_physical = atom_get_max_pstate ,
. get_min = atom_get_min_pstate ,
. get_turbo = atom_get_turbo_pstate ,
. get_val = atom_get_val ,
. get_scaling = silvermont_get_scaling ,
. get_vid = atom_get_vid ,
2017-03-28 01:18:02 +03:00
} ;
2017-03-28 01:19:03 +03:00
/*
 * P-state callback table using the atom_* helpers together with
 * airmont_get_scaling().  atom_get_max_pstate serves as both get_max and
 * get_max_physical, and a get_vid callback is provided.
 */
static const struct pstate_funcs airmont_funcs = {
. get_max = atom_get_max_pstate ,
. get_max_physical = atom_get_max_pstate ,
. get_min = atom_get_min_pstate ,
. get_turbo = atom_get_turbo_pstate ,
. get_val = atom_get_val ,
. get_scaling = airmont_get_scaling ,
. get_vid = atom_get_vid ,
2017-03-28 01:18:02 +03:00
} ;
2017-03-28 01:19:03 +03:00
/*
 * P-state callback table for the Xeon Phi entries of the CPU ID table: the
 * core_* helpers, except that get_turbo is knl_get_turbo_pstate and a
 * get_aperf_mperf_shift callback (knl_get_aperf_mperf_shift) is provided.
 */
static const struct pstate_funcs knl_funcs = {
. get_max = core_get_max_pstate ,
. get_max_physical = core_get_max_pstate_physical ,
. get_min = core_get_min_pstate ,
. get_turbo = knl_get_turbo_pstate ,
2017-07-14 01:03:51 +03:00
. get_aperf_mperf_shift = knl_get_aperf_mperf_shift ,
2017-03-28 01:19:03 +03:00
. get_scaling = core_get_scaling ,
. get_val = core_get_val ,
2017-03-28 01:18:02 +03:00
} ;
2013-02-06 21:02:13 +04:00
/*
 * Build one CPU ID table entry: vendor X86_VENDOR_INTEL, the literal 6
 * (presumably the CPU family field - confirm against struct x86_cpu_id),
 * @model, the X86_FEATURE_APERFMPERF feature, and the address of @policy
 * cast to unsigned long as the last initializer.
 */
# define ICPU(model, policy) \
2014-01-06 22:59:16 +04:00
{ X86_VENDOR_INTEL , 6 , model , X86_FEATURE_APERFMPERF , \
( unsigned long ) & policy }
2013-02-06 21:02:13 +04:00
/*
 * CPU model table for this driver.  Each ICPU() entry pairs a model with the
 * pstate_funcs table to use for it; the list ends with an empty entry and is
 * registered as the x86cpu module device table.
 */
static const struct x86_cpu_id intel_pstate_cpu_ids [ ] = {
2017-03-28 01:19:03 +03:00
ICPU ( INTEL_FAM6_SANDYBRIDGE , core_funcs ) ,
ICPU ( INTEL_FAM6_SANDYBRIDGE_X , core_funcs ) ,
ICPU ( INTEL_FAM6_ATOM_SILVERMONT1 , silvermont_funcs ) ,
ICPU ( INTEL_FAM6_IVYBRIDGE , core_funcs ) ,
ICPU ( INTEL_FAM6_HASWELL_CORE , core_funcs ) ,
ICPU ( INTEL_FAM6_BROADWELL_CORE , core_funcs ) ,
ICPU ( INTEL_FAM6_IVYBRIDGE_X , core_funcs ) ,
ICPU ( INTEL_FAM6_HASWELL_X , core_funcs ) ,
ICPU ( INTEL_FAM6_HASWELL_ULT , core_funcs ) ,
ICPU ( INTEL_FAM6_HASWELL_GT3E , core_funcs ) ,
ICPU ( INTEL_FAM6_BROADWELL_GT3E , core_funcs ) ,
ICPU ( INTEL_FAM6_ATOM_AIRMONT , airmont_funcs ) ,
ICPU ( INTEL_FAM6_SKYLAKE_MOBILE , core_funcs ) ,
ICPU ( INTEL_FAM6_BROADWELL_X , core_funcs ) ,
ICPU ( INTEL_FAM6_SKYLAKE_DESKTOP , core_funcs ) ,
ICPU ( INTEL_FAM6_BROADWELL_XEON_D , core_funcs ) ,
ICPU ( INTEL_FAM6_XEON_PHI_KNL , knl_funcs ) ,
ICPU ( INTEL_FAM6_XEON_PHI_KNM , knl_funcs ) ,
2018-01-10 22:38:51 +03:00
ICPU ( INTEL_FAM6_ATOM_GOLDMONT , core_funcs ) ,
ICPU ( INTEL_FAM6_ATOM_GEMINI_LAKE , core_funcs ) ,
2018-01-10 22:38:52 +03:00
ICPU ( INTEL_FAM6_SKYLAKE_X , core_funcs ) ,
2013-02-06 21:02:13 +04:00
{ }
} ;
MODULE_DEVICE_TABLE ( x86cpu , intel_pstate_cpu_ids ) ;
2016-06-27 13:07:17 +03:00
/*
 * Init-time only (__initconst) list of three server models: Broadwell Xeon D,
 * Broadwell X and Skylake X.
 *
 * NOTE(review): the name suggests these are the models checked for
 * out-of-band P-state control; the code consulting this table is not visible
 * here - confirm.
 */
static const struct x86_cpu_id intel_pstate_cpu_oob_ids [ ] __initconst = {
2017-03-28 01:19:03 +03:00
ICPU ( INTEL_FAM6_BROADWELL_XEON_D , core_funcs ) ,
ICPU ( INTEL_FAM6_BROADWELL_X , core_funcs ) ,
ICPU ( INTEL_FAM6_SKYLAKE_X , core_funcs ) ,
2014-11-06 20:40:47 +03:00
{ }
} ;
cpufreq: intel_pstate: Disable energy efficiency optimization
Some Kabylake desktop processors may not reach max turbo when running in
HWP mode, even if running under sustained 100% utilization.
This occurs when the HWP.EPP (Energy Performance Preference) is set to
"balance_power" (0x80) -- the default on most systems.
It occurs because the platform BIOS may erroneously enable an
energy-efficiency setting -- MSR_IA32_POWER_CTL BIT-EE, which is not
recommended to be enabled on this SKU.
On the failing systems, this BIOS issue was not discovered when the
desktop motherboard was tested with Windows, because the BIOS also
neglects to provide the ACPI/CPPC table, that Windows requires to enable
HWP, and so Windows runs in legacy P-state mode, where this setting has
no effect.
Linux' intel_pstate driver does not require ACPI/CPPC to enable HWP, and
so it runs in HWP mode, exposing this incorrect BIOS configuration.
There are several ways to address this problem.
First, Linux can also run in legacy P-state mode on this system.
As intel_pstate is how Linux enables HWP, booting with
"intel_pstate=disable"
will run in acpi-cpufreq/ondemand legacy p-state mode.
Or second, the "performance" governor can be used with intel_pstate,
which will modify HWP.EPP to 0.
Or third, starting in 4.10, the
/sys/devices/system/cpu/cpufreq/policy*/energy_performance_preference
attribute can be updated from "balance_power" to "performance".
Or fourth, apply this patch, which fixes the erroneous setting of
MSR_IA32_POWER_CTL BIT_EE on this model, allowing the default
configuration to function as designed.
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Reviewed-by: Len Brown <len.brown@intel.com>
Cc: 4.6+ <stable@vger.kernel.org> # 4.6+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-02-04 01:18:39 +03:00
/*
 * List of CPU models singled out for the energy-efficiency workaround; it
 * currently holds only INTEL_FAM6_KABYLAKE_DESKTOP.
 *
 * NOTE(review): presumably consulted to decide whether the energy-efficiency
 * bit in MSR_IA32_POWER_CTL should be cleared; the code using this table is
 * not visible here - confirm.
 */
static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids [ ] = {
2017-03-28 01:19:03 +03:00
ICPU ( INTEL_FAM6_KABYLAKE_DESKTOP , core_funcs ) ,
cpufreq: intel_pstate: Disable energy efficiency optimization
Some Kabylake desktop processors may not reach max turbo when running in
HWP mode, even if running under sustained 100% utilization.
This occurs when the HWP.EPP (Energy Performance Preference) is set to
"balance_power" (0x80) -- the default on most systems.
It occurs because the platform BIOS may erroneously enable an
energy-efficiency setting -- MSR_IA32_POWER_CTL BIT-EE, which is not
recommended to be enabled on this SKU.
On the failing systems, this BIOS issue was not discovered when the
desktop motherboard was tested with Windows, because the BIOS also
neglects to provide the ACPI/CPPC table, that Windows requires to enable
HWP, and so Windows runs in legacy P-state mode, where this setting has
no effect.
Linux' intel_pstate driver does not require ACPI/CPPC to enable HWP, and
so it runs in HWP mode, exposing this incorrect BIOS configuration.
There are several ways to address this problem.
First, Linux can also run in legacy P-state mode on this system.
As intel_pstate is how Linux enables HWP, booting with
"intel_pstate=disable"
will run in acpi-cpufreq/ondemand legacy p-state mode.
Or second, the "performance" governor can be used with intel_pstate,
which will modify HWP.EPP to 0.
Or third, starting in 4.10, the
/sys/devices/system/cpu/cpufreq/policy*/energy_performance_preference
attribute in can be updated from "balance_power" to "performance".
Or fourth, apply this patch, which fixes the erroneous setting of
MSR_IA32_POWER_CTL BIT_EE on this model, allowing the default
configuration to function as designed.
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Reviewed-by: Len Brown <len.brown@intel.com>
Cc: 4.6+ <stable@vger.kernel.org> # 4.6+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-02-04 01:18:39 +03:00
{ }
} ;
2013-02-06 21:02:13 +04:00
/*
 * intel_pstate_init_cpu - prepare the all_cpu_data[] entry for @cpunum.
 * @cpunum: index into all_cpu_data[] of the CPU to set up.
 *
 * On first use the entry is allocated with kzalloc() and its three EPP
 * fields are preset to -EINVAL. The CPU number is then stored in the entry.
 * When hwp_active is set, intel_pstate_disable_ee() is called for CPUs that
 * match intel_pstate_cpu_ee_disable_ids and intel_pstate_hwp_enable() is
 * called afterwards. Finally intel_pstate_get_cpu_pstates() is run on the
 * entry.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int intel_pstate_init_cpu ( unsigned int cpunum )
{
struct cpudata * cpu ;
2016-10-25 23:20:40 +03:00
cpu = all_cpu_data [ cpunum ] ;
/* No entry for this CPU yet: allocate a zeroed one and store it in the array. */
if ( ! cpu ) {
cpufreq: intel_pstate: Active mode P-state limits rework
The coordination of P-state limits used by intel_pstate in the active
mode (ie. by default) is problematic, because it synchronizes all of
the limits (ie. the global ones and the per-policy ones) so as to use
one common pair of P-state limits (min and max) across all CPUs in
the system. The drawbacks of that are as follows:
- If P-states are coordinated in hardware, it is not necessary
to coordinate them in software on top of that, so in that case
all of the above activity is in vain.
- If P-states are not coordinated in hardware, then the processor
is actually capable of setting different P-states for different
CPUs and coordinating them at the software level simply doesn't
allow that capability to be utilized.
- The coordination works in such a way that setting a per-policy
limit (eg. scaling_max_freq) for one CPU causes the common
effective limit to change (and it will affect all of the other
CPUs too), but subsequent reads from the corresponding sysfs
attributes for the other CPUs will return stale values (which
is confusing).
- Reads from the global P-state limit attributes, min_perf_pct and
max_perf_pct, return the effective common values and not the last
values set through these attributes. However, the last values
set through these attributes become hard limits that cannot be
exceeded by writes to scaling_min_freq and scaling_max_freq,
respectively, and they are not exposed, so essentially users
have to remember what they are.
All of that is painful enough to warrant a change of the management
of P-state limits in the active mode.
To that end, redesign the active mode P-state limits management in
intel_pstate in accordance with the following rules:
(1) All CPUs are affected by the global limits (that is, none of
them can be requested to run faster than the global max and
none of them can be requested to run slower than the global
min).
(2) Each individual CPU is affected by its own per-policy limits
(that is, it cannot be requested to run faster than its own
per-policy max and it cannot be requested to run slower than
its own per-policy min).
(3) The global and per-policy limits can be set independently.
Also, the global maximum and minimum P-state limits will be always
expressed as percentages of the maximum supported turbo P-state.
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-23 01:58:57 +03:00
cpu = kzalloc ( sizeof ( * cpu ) , GFP_KERNEL ) ;
2016-10-25 23:20:40 +03:00
if ( ! cpu )
return - ENOMEM ;
all_cpu_data [ cpunum ] = cpu ;
2016-12-07 00:32:16 +03:00
/* NOTE(review): -EINVAL presumably marks "no EPP value recorded yet" - confirm. */
cpu - > epp_default = - EINVAL ;
cpu - > epp_powersave = - EINVAL ;
cpu - > epp_saved = - EINVAL ;
2016-10-25 23:20:40 +03:00
}
2013-02-06 21:02:13 +04:00
/* Re-read the slot; it is non-NULL on both paths that reach this point. */
cpu = all_cpu_data [ cpunum ] ;
cpu - > cpu = cpunum ;
2015-07-14 19:46:23 +03:00
2016-02-05 03:45:30 +03:00
/*
 * HWP path: on CPU models listed in intel_pstate_cpu_ee_disable_ids call
 * intel_pstate_disable_ee() before calling intel_pstate_hwp_enable().
 */
if ( hwp_active ) {
cpufreq: intel_pstate: Disable energy efficiency optimization
Some Kabylake desktop processors may not reach max turbo when running in
HWP mode, even if running under sustained 100% utilization.
This occurs when the HWP.EPP (Energy Performance Preference) is set to
"balance_power" (0x80) -- the default on most systems.
It occurs because the platform BIOS may erroneously enable an
energy-efficiency setting -- MSR_IA32_POWER_CTL BIT-EE, which is not
recommended to be enabled on this SKU.
On the failing systems, this BIOS issue was not discovered when the
desktop motherboard was tested with Windows, because the BIOS also
neglects to provide the ACPI/CPPC table, that Windows requires to enable
HWP, and so Windows runs in legacy P-state mode, where this setting has
no effect.
Linux' intel_pstate driver does not require ACPI/CPPC to enable HWP, and
so it runs in HWP mode, exposing this incorrect BIOS configuration.
There are several ways to address this problem.
First, Linux can also run in legacy P-state mode on this system.
As intel_pstate is how Linux enables HWP, booting with
"intel_pstate=disable"
will run in acpi-cpufreq/ondemand legacy p-state mode.
Or second, the "performance" governor can be used with intel_pstate,
which will modify HWP.EPP to 0.
Or third, starting in 4.10, the
/sys/devices/system/cpu/cpufreq/policy*/energy_performance_preference
attribute in can be updated from "balance_power" to "performance".
Or fourth, apply this patch, which fixes the erroneous setting of
MSR_IA32_POWER_CTL BIT_EE on this model, allowing the default
configuration to function as designed.
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Reviewed-by: Len Brown <len.brown@intel.com>
Cc: 4.6+ <stable@vger.kernel.org> # 4.6+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-02-04 01:18:39 +03:00
const struct x86_cpu_id * id ;
id = x86_match_cpu ( intel_pstate_cpu_ee_disable_ids ) ;
if ( id )
intel_pstate_disable_ee ( cpunum ) ;
2015-07-14 19:46:23 +03:00
intel_pstate_hwp_enable ( cpu ) ;
2016-02-05 03:45:30 +03:00
}
2015-07-14 19:46:23 +03:00
2014-07-05 03:51:33 +04:00
intel_pstate_get_cpu_pstates ( cpu ) ;
2013-10-21 20:20:34 +04:00
2016-04-05 23:28:23 +03:00
pr_debug ( " controlling: cpu %d \n " , cpunum ) ;
2013-02-06 21:02:13 +04:00
return 0 ;
}
2016-04-02 02:06:21 +03:00
static void intel_pstate_set_update_util_hook ( unsigned int cpu_num )
2016-03-31 18:42:15 +03:00
{
2016-04-02 02:06:21 +03:00
struct cpudata * cpu = all_cpu_data [ cpu_num ] ;
2017-06-24 08:11:53 +03:00
if ( hwp_active )
return ;
2016-06-28 00:47:15 +03:00
if ( cpu - > update_util_set )
return ;
2016-04-02 02:06:21 +03:00
/* Prevent intel_pstate_update_util() from using stale data. */
cpu - > sample . time = 0 ;
2017-03-28 01:17:10 +03:00
cpufreq_add_update_util_hook ( cpu_num , & cpu - > update_util ,
2017-07-25 01:12:20 +03:00
intel_pstate_update_util ) ;
2016-05-11 09:33:08 +03:00
cpu - > update_util_set = true ;
2016-03-31 18:42:15 +03:00
}
static void intel_pstate_clear_update_util_hook ( unsigned int cpu )
{
2016-05-11 09:33:08 +03:00
struct cpudata * cpu_data = all_cpu_data [ cpu ] ;
if ( ! cpu_data - > update_util_set )
return ;
2016-04-02 02:08:43 +03:00
cpufreq_remove_update_util_hook ( cpu ) ;
2016-05-11 09:33:08 +03:00
cpu_data - > update_util_set = false ;
2016-03-31 18:42:15 +03:00
synchronize_sched ( ) ;
}
2017-03-23 02:00:47 +03:00
static int intel_pstate_get_max_freq ( struct cpudata * cpu )
{
return global . turbo_disabled | | global . no_turbo ?
cpu - > pstate . max_freq : cpu - > pstate . turbo_freq ;
}
2016-10-25 23:20:40 +03:00
/*
 * intel_pstate_update_perf_limits - recompute the performance limits of @cpu.
 * @policy: cpufreq policy whose min and max frequencies are translated.
 * @cpu: per-CPU data that receives min_perf_ratio and max_perf_ratio.
 *
 * policy->min and policy->max are scaled by max_state / max_freq, where
 * max_freq comes from intel_pstate_get_max_freq() and max_state from
 * intel_pstate_get_hwp_max() (HWP active) or intel_pstate_get_base_pstate()
 * (otherwise). Unless per_cpu_limits is set, the results are additionally
 * combined with global.min_perf_pct and global.max_perf_pct, which are
 * applied as percentages of turbo_max.
 */
static void intel_pstate_update_perf_limits ( struct cpufreq_policy * policy ,
cpufreq: intel_pstate: Active mode P-state limits rework
The coordination of P-state limits used by intel_pstate in the active
mode (ie. by default) is problematic, because it synchronizes all of
the limits (ie. the global ones and the per-policy ones) so as to use
one common pair of P-state limits (min and max) across all CPUs in
the system. The drawbacks of that are as follows:
- If P-states are coordinated in hardware, it is not necessary
to coordinate them in software on top of that, so in that case
all of the above activity is in vain.
- If P-states are not coordinated in hardware, then the processor
is actually capable of setting different P-states for different
CPUs and coordinating them at the software level simply doesn't
allow that capability to be utilized.
- The coordination works in such a way that setting a per-policy
limit (eg. scaling_max_freq) for one CPU causes the common
effective limit to change (and it will affect all of the other
CPUs too), but subsequent reads from the corresponding sysfs
attributes for the other CPUs will return stale values (which
is confusing).
- Reads from the global P-state limit attributes, min_perf_pct and
max_perf_pct, return the effective common values and not the last
values set through these attributes. However, the last values
set through these attributes become hard limits that cannot be
exceeded by writes to scaling_min_freq and scaling_max_freq,
respectively, and they are not exposed, so essentially users
have to remember what they are.
All of that is painful enough to warrant a change of the management
of P-state limits in the active mode.
To that end, redesign the active mode P-state limits management in
intel_pstate in accordance with the following rules:
(1) All CPUs are affected by the global limits (that is, none of
them can be requested to run faster than the global max and
none of them can be requested to run slower than the global
min).
(2) Each individual CPU is affected by its own per-policy limits
(that is, it cannot be requested to run faster than its own
per-policy max and it cannot be requested to run slower than
its own per-policy min).
(3) The global and per-policy limits can be set independently.
Also, the global maximum and minimum P-state limits will be always
expressed as percentages of the maximum supported turbo P-state.
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-23 01:58:57 +03:00
struct cpudata * cpu )
2016-10-25 23:20:40 +03:00
{
2017-03-23 02:00:47 +03:00
int max_freq = intel_pstate_get_max_freq ( cpu ) ;
2017-03-14 18:18:34 +03:00
int32_t max_policy_perf , min_policy_perf ;
2017-06-13 02:30:27 +03:00
int max_state , turbo_max ;
2016-10-28 20:44:52 +03:00
2017-06-13 02:30:27 +03:00
/*
 * HWP needs some special consideration, because on BDX the
 * HWP_REQUEST uses abstract value to represent performance
 * rather than pure ratios.
 */
if ( hwp_active ) {
intel_pstate_get_hwp_max ( cpu - > cpu , & turbo_max , & max_state ) ;
} else {
max_state = intel_pstate_get_base_pstate ( cpu ) ;
turbo_max = cpu - > pstate . turbo_pstate ;
}
/* Scale the policy maximum frequency to a performance level (integer division). */
max_policy_perf = max_state * policy - > max / max_freq ;
2016-10-25 23:20:41 +03:00
/* Equal policy limits: reuse the maximum so both levels are identical. */
if ( policy - > max = = policy - > min ) {
2017-03-14 18:18:34 +03:00
min_policy_perf = max_policy_perf ;
2016-10-25 23:20:41 +03:00
} else {
2017-06-13 02:30:27 +03:00
/* Scale the policy minimum the same way, then keep it within [0, max_policy_perf]. */
min_policy_perf = max_state * policy - > min / max_freq ;
2017-03-14 18:18:34 +03:00
min_policy_perf = clamp_t ( int32_t , min_policy_perf ,
0 , max_policy_perf ) ;
2016-10-25 23:20:41 +03:00
}
2016-10-25 23:20:40 +03:00
2017-06-13 02:30:27 +03:00
pr_debug ( " cpu:%d max_state %d min_policy_perf:%d max_policy_perf:%d \n " ,
policy - > cpu , max_state ,
min_policy_perf , max_policy_perf ) ;
2017-03-14 18:18:34 +03:00
/* Normalize user input to [min_perf, max_perf] */
cpufreq: intel_pstate: Active mode P-state limits rework
The coordination of P-state limits used by intel_pstate in the active
mode (ie. by default) is problematic, because it synchronizes all of
the limits (ie. the global ones and the per-policy ones) so as to use
one common pair of P-state limits (min and max) across all CPUs in
the system. The drawbacks of that are as follows:
- If P-states are coordinated in hardware, it is not necessary
to coordinate them in software on top of that, so in that case
all of the above activity is in vain.
- If P-states are not coordinated in hardware, then the processor
is actually capable of setting different P-states for different
CPUs and coordinating them at the software level simply doesn't
allow that capability to be utilized.
- The coordination works in such a way that setting a per-policy
limit (eg. scaling_max_freq) for one CPU causes the common
effective limit to change (and it will affect all of the other
CPUs too), but subsequent reads from the corresponding sysfs
attributes for the other CPUs will return stale values (which
is confusing).
- Reads from the global P-state limit attributes, min_perf_pct and
max_perf_pct, return the effective common values and not the last
values set through these attributes. However, the last values
set through these attributes become hard limits that cannot be
exceeded by writes to scaling_min_freq and scaling_max_freq,
respectively, and they are not exposed, so essentially users
have to remember what they are.
All of that is painful enough to warrant a change of the management
of P-state limits in the active mode.
To that end, redesign the active mode P-state limits management in
intel_pstate in accordance with the following rules:
(1) All CPUs are affected by the global limits (that is, none of
them can be requested to run faster than the global max and
none of them can be requested to run slower than the global
min).
(2) Each individual CPU is affected by its own per-policy limits
(that is, it cannot be requested to run faster than its own
per-policy max and it cannot be requested to run slower than
its own per-policy min).
(3) The global and per-policy limits can be set independently.
Also, the global maximum and minimum P-state limits will be always
expressed as percentages of the maximum supported turbo P-state.
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-23 01:58:57 +03:00
if ( per_cpu_limits ) {
2017-06-13 02:30:27 +03:00
/* Per-CPU limits: take the policy-derived levels as they are; the global percentages are not consulted. */
cpu - > min_perf_ratio = min_policy_perf ;
cpu - > max_perf_ratio = max_policy_perf ;
cpufreq: intel_pstate: Active mode P-state limits rework
The coordination of P-state limits used by intel_pstate in the active
mode (ie. by default) is problematic, because it synchronizes all of
the limits (ie. the global ones and the per-policy ones) so as to use
one common pair of P-state limits (min and max) across all CPUs in
the system. The drawbacks of that are as follows:
- If P-states are coordinated in hardware, it is not necessary
to coordinate them in software on top of that, so in that case
all of the above activity is in vain.
- If P-states are not coordinated in hardware, then the processor
is actually capable of setting different P-states for different
CPUs and coordinating them at the software level simply doesn't
allow that capability to be utilized.
- The coordination works in such a way that setting a per-policy
limit (eg. scaling_max_freq) for one CPU causes the common
effective limit to change (and it will affect all of the other
CPUs too), but subsequent reads from the corresponding sysfs
attributes for the other CPUs will return stale values (which
is confusing).
- Reads from the global P-state limit attributes, min_perf_pct and
max_perf_pct, return the effective common values and not the last
values set through these attributes. However, the last values
set through these attributes become hard limits that cannot be
exceeded by writes to scaling_min_freq and scaling_max_freq,
respectively, and they are not exposed, so essentially users
have to remember what they are.
All of that is painful enough to warrant a change of the management
of P-state limits in the active mode.
To that end, redesign the active mode P-state limits management in
intel_pstate in accordance with the following rules:
(1) All CPUs are affected by the global limits (that is, none of
them can be requested to run faster than the global max and
none of them can be requested to run slower than the global
min).
(2) Each individual CPU is affected by its own per-policy limits
(that is, it cannot be requested to run faster than its own
per-policy max and it cannot be requested to run slower than
its own per-policy min).
(3) The global and per-policy limits can be set independently.
Also, the global maximum and minimum P-state limits will be always
expressed as percentages of the maximum supported turbo P-state.
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-23 01:58:57 +03:00
} else {
int32_t global_min , global_max ;
/* Global limits are in percent of the maximum turbo P-state. */
2017-06-13 02:30:27 +03:00
/* Both percentages are converted with DIV_ROUND_UP, i.e. rounded up. */
global_max = DIV_ROUND_UP ( turbo_max * global . max_perf_pct , 100 ) ;
global_min = DIV_ROUND_UP ( turbo_max * global . min_perf_pct , 100 ) ;
cpufreq: intel_pstate: Active mode P-state limits rework
The coordination of P-state limits used by intel_pstate in the active
mode (ie. by default) is problematic, because it synchronizes all of
the limits (ie. the global ones and the per-policy ones) so as to use
one common pair of P-state limits (min and max) across all CPUs in
the system. The drawbacks of that are as follows:
- If P-states are coordinated in hardware, it is not necessary
to coordinate them in software on top of that, so in that case
all of the above activity is in vain.
- If P-states are not coordinated in hardware, then the processor
is actually capable of setting different P-states for different
CPUs and coordinating them at the software level simply doesn't
allow that capability to be utilized.
- The coordination works in such a way that setting a per-policy
limit (eg. scaling_max_freq) for one CPU causes the common
effective limit to change (and it will affect all of the other
CPUs too), but subsequent reads from the corresponding sysfs
attributes for the other CPUs will return stale values (which
is confusing).
- Reads from the global P-state limit attributes, min_perf_pct and
max_perf_pct, return the effective common values and not the last
values set through these attributes. However, the last values
set through these attributes become hard limits that cannot be
exceeded by writes to scaling_min_freq and scaling_max_freq,
respectively, and they are not exposed, so essentially users
have to remember what they are.
All of that is painful enough to warrant a change of the management
of P-state limits in the active mode.
To that end, redesign the active mode P-state limits management in
intel_pstate in accordance with the following rules:
(1) All CPUs are affected by the global limits (that is, none of
them can be requested to run faster than the global max and
none of them can be requested to run slower than the global
min).
(2) Each individual CPU is affected by its own per-policy limits
(that is, it cannot be requested to run faster than its own
per-policy max and it cannot be requested to run slower than
its own per-policy min).
(3) The global and per-policy limits can be set independently.
Also, the global maximum and minimum P-state limits will be always
expressed as percentages of the maximum supported turbo P-state.
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-23 01:58:57 +03:00
/* Keep the global minimum within [0, global_max]. */
global_min = clamp_t ( int32_t , global_min , 0 , global_max ) ;
2016-10-25 23:20:40 +03:00
2017-06-13 02:30:27 +03:00
pr_debug ( " cpu:%d global_min:%d global_max:%d \n " , policy - > cpu ,
global_min , global_max ) ;
cpufreq: intel_pstate: Active mode P-state limits rework
The coordination of P-state limits used by intel_pstate in the active
mode (ie. by default) is problematic, because it synchronizes all of
the limits (ie. the global ones and the per-policy ones) so as to use
one common pair of P-state limits (min and max) across all CPUs in
the system. The drawbacks of that are as follows:
- If P-states are coordinated in hardware, it is not necessary
to coordinate them in software on top of that, so in that case
all of the above activity is in vain.
- If P-states are not coordinated in hardware, then the processor
is actually capable of setting different P-states for different
CPUs and coordinating them at the software level simply doesn't
allow that capability to be utilized.
- The coordination works in such a way that setting a per-policy
limit (eg. scaling_max_freq) for one CPU causes the common
effective limit to change (and it will affect all of the other
CPUs too), but subsequent reads from the corresponding sysfs
attributes for the other CPUs will return stale values (which
is confusing).
- Reads from the global P-state limit attributes, min_perf_pct and
max_perf_pct, return the effective common values and not the last
values set through these attributes. However, the last values
set through these attributes become hard limits that cannot be
exceeded by writes to scaling_min_freq and scaling_max_freq,
respectively, and they are not exposed, so essentially users
have to remember what they are.
All of that is painful enough to warrant a change of the management
of P-state limits in the active mode.
To that end, redesign the active mode P-state limits management in
intel_pstate in accordance with the following rules:
(1) All CPUs are affected by the global limits (that is, none of
them can be requested to run faster than the global max and
none of them can be requested to run slower than the global
min).
(2) Each individual CPU is affected by its own per-policy limits
(that is, it cannot be requested to run faster than its own
per-policy max and it cannot be requested to run slower than
its own per-policy min).
(3) The global and per-policy limits can be set independently.
Also, the global maximum and minimum P-state limits will be always
expressed as percentages of the maximum supported turbo P-state.
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-23 01:58:57 +03:00
2017-06-13 02:30:27 +03:00
/*
 * Minimum: the larger of the policy and global minimum, capped at the
 * policy maximum. Maximum: the smaller of the policy and global maximum,
 * but not below the policy minimum.
 */
cpu - > min_perf_ratio = max ( min_policy_perf , global_min ) ;
cpu - > min_perf_ratio = min ( cpu - > min_perf_ratio , max_policy_perf ) ;
cpu - > max_perf_ratio = min ( max_policy_perf , global_max ) ;
cpu - > max_perf_ratio = max ( min_policy_perf , cpu - > max_perf_ratio ) ;
2016-10-25 23:20:40 +03:00
2017-06-13 02:30:27 +03:00
/* Make sure min_perf <= max_perf */
cpu - > min_perf_ratio = min ( cpu - > min_perf_ratio ,
cpu - > max_perf_ratio ) ;
2016-10-25 23:20:40 +03:00
2017-06-13 02:30:27 +03:00
}
pr_debug ( " cpu:%d max_perf_ratio:%d min_perf_ratio:%d \n " , policy - > cpu ,
cpu - > max_perf_ratio ,
cpu - > min_perf_ratio ) ;
2016-10-25 23:20:40 +03:00
}
2013-02-06 21:02:13 +04:00
/*
 * intel_pstate_set_policy - apply @policy to the CPU it refers to.
 * @policy: cpufreq policy to apply; policy->cpu selects the per-CPU data.
 *
 * The requested policy is stored in the per-CPU data. Then, with
 * intel_pstate_limits_lock held, the performance limits are recomputed via
 * intel_pstate_update_perf_limits() and the utilization update hook is
 * removed (CPUFREQ_POLICY_PERFORMANCE) or installed (any other policy).
 * When hwp_active is set, intel_pstate_hwp_set() is called for the CPU too.
 *
 * Returns -ENODEV if policy->cpuinfo.max_freq is zero, 0 otherwise.
 */
static int intel_pstate_set_policy ( struct cpufreq_policy * policy )
{
cpufreq: intel_pstate: Adjust policy->max
When policy->max is changed via _PPC or sysfs and is more than the max non
turbo frequency, it does not really change resulting performance in some
processors. When policy->max results in a P-State ratio more than the
turbo activation ratio, then processor can choose any P-State up to max
turbo. So the user or _PPC setting has no value, but this can cause
undesirable side effects like:
- Showing reduced max percentage in Intel P-State sysfs
- It can cause reduced max performance under certain boundary conditions:
The requested max scaling frequency either via _PPC or via cpufreq-sysfs,
will be converted into a fixed floating point max percent scale. In
majority of the cases this will result in correct max. But not 100% of the
time. If the _PPC is requested at a point where the calculation lead to a
lower max, this can result in a lower P-State then expected and it will
impact performance.
Example of this condition using a Broadwell laptop with config TDP.
ACPI _PSS table from a Broadwell laptop
2301000 2300000 2200000 2000000 1900000 1800000 1700000 1500000 1400000
1300000 1100000 1000000 900000 800000 600000 500000
The actual results by disabling config TDP so that we can get what is
requested on or below 2300000Khz.
scaling_max_freq Max Requested P-State Resultant scaling
max
---------------------------------------- ----------------------
2400000 18 2900000 (max
turbo)
2300000 17 2300000 (max
physical non turbo)
2200000 15 2100000
2100000 15 2100000
2000000 13 1900000
1900000 13 1900000
1800000 12 1800000
1700000 11 1700000
1600000 10 1600000
1500000 f 1500000
1400000 e 1400000
1300000 d 1300000
1200000 c 1200000
1100000 a 1000000
1000000 a 1000000
900000 9 900000
800000 8 800000
700000 7 700000
600000 6 600000
500000 5 500000
------------------------------------------------------------------
Now set the config TDP level 1 ratio as 0x0b (equivalent to 1100000KHz)
in BIOS (not every system will let you adjust this).
The turbo activation ratio will be set to one less than that, which will
be 0x0a (So any request above 1000000KHz should result in turbo region
assuming no thermal limits).
Here _PPC will request max to 1100000KHz (which basically should still
result in turbo as this is more than the turbo activation ratio up to
max allowable turbo frequency), but actual calculation resulted in a max
ceiling P-State which is 0x0a. So under any load condition, this driver
will not request turbo P-States. This will be a huge performance hit.
When config TDP feature is ON, if the _PPC points to a frequency above
turbo activation ratio, the performance can still reach max turbo. In this
case we don't need to treat this as the reduced frequency in set_policy
callback.
In this change when config TDP is active (by checking if the physical max
non turbo ratio is more than the current max non turbo ratio), any request
above current max non turbo is treated as full performance.
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
[ rjw : Minor cleanups ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-04-28 01:48:07 +03:00
struct cpudata * cpu ;
2013-03-06 02:15:26 +04:00
/* A policy without a known hardware maximum frequency cannot be applied. */
if ( ! policy - > cpuinfo . max_freq )
return - ENODEV ;
2016-06-08 03:38:52 +03:00
pr_debug ( " set_policy cpuinfo.max %u policy->max %u \n " ,
policy - > cpuinfo . max_freq , policy - > max ) ;
2016-10-19 03:57:22 +03:00
cpu = all_cpu_data [ policy - > cpu ] ;
2016-10-25 00:20:25 +03:00
/* Remember the requested policy; this store happens before the lock is taken. */
cpu - > policy = policy - > policy ;
2016-12-07 00:32:15 +03:00
mutex_lock ( & intel_pstate_limits_lock ) ;
cpufreq: intel_pstate: Active mode P-state limits rework
The coordination of P-state limits used by intel_pstate in the active
mode (ie. by default) is problematic, because it synchronizes all of
the limits (ie. the global ones and the per-policy ones) so as to use
one common pair of P-state limits (min and max) across all CPUs in
the system. The drawbacks of that are as follows:
- If P-states are coordinated in hardware, it is not necessary
to coordinate them in software on top of that, so in that case
all of the above activity is in vain.
- If P-states are not coordinated in hardware, then the processor
is actually capable of setting different P-states for different
CPUs and coordinating them at the software level simply doesn't
allow that capability to be utilized.
- The coordination works in such a way that setting a per-policy
limit (eg. scaling_max_freq) for one CPU causes the common
effective limit to change (and it will affect all of the other
CPUs too), but subsequent reads from the corresponding sysfs
attributes for the other CPUs will return stale values (which
is confusing).
- Reads from the global P-state limit attributes, min_perf_pct and
max_perf_pct, return the effective common values and not the last
values set through these attributes. However, the last values
set through these attributes become hard limits that cannot be
exceeded by writes to scaling_min_freq and scaling_max_freq,
respectively, and they are not exposed, so essentially users
have to remember what they are.
All of that is painful enough to warrant a change of the management
of P-state limits in the active mode.
To that end, redesign the active mode P-state limits management in
intel_pstate in accordance with the following rules:
(1) All CPUs are affected by the global limits (that is, none of
them can be requested to run faster than the global max and
none of them can be requested to run slower than the global
min).
(2) Each individual CPU is affected by its own per-policy limits
(that is, it cannot be requested to run faster than its own
per-policy max and it cannot be requested to run slower than
its own per-policy min).
(3) The global and per-policy limits can be set independently.
Also, the global maximum and minimum P-state limits will be always
expressed as percentages of the maximum supported turbo P-state.
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-23 01:58:57 +03:00
intel_pstate_update_perf_limits ( policy , cpu ) ;
cpufreq: intel_pstate: Do not reinit performance limits in ->setpolicy
If the current P-state selection algorithm is set to "performance"
in intel_pstate_set_policy(), the limits may be initialized from
scratch, but only if no_turbo is not set and the maximum frequency
allowed for the given CPU (i.e. the policy object representing it)
is at least equal to the max frequency supported by the CPU. In all
of the other cases, the limits will not be updated.
For example, the following can happen:
# cat intel_pstate/status
active
# echo performance > cpufreq/policy0/scaling_governor
# cat intel_pstate/min_perf_pct
100
# echo 94 > intel_pstate/min_perf_pct
# cat intel_pstate/min_perf_pct
100
# cat cpufreq/policy0/scaling_max_freq
3100000
echo 3000000 > cpufreq/policy0/scaling_max_freq
# cat intel_pstate/min_perf_pct
94
# echo 95 > intel_pstate/min_perf_pct
# cat intel_pstate/min_perf_pct
95
That is confusing for two reasons. First, the initial attempt to
change min_perf_pct to 94 seems to have no effect, even though
setting the global limits should always work. Second, after
changing scaling_max_freq for policy0 the global min_perf_pct
attribute shows 94, even though it should have not been affected
by that operation in principle.
Moreover, the final attempt to change min_perf_pct to 95 worked
as expected, because scaling_max_freq for the only policy with
scaling_governor equal to "performance" was different from the
maximum at that time.
To make all that confusion go away, modify intel_pstate_set_policy()
so that it doesn't reinitialize the limits at all.
At the same time, change intel_pstate_set_performance_limits() to
set min_sysfs_pct to 100 in the "performance" limits set so that
switching the P-state selection algorithm to "performance" causes
intel_pstate/min_perf_pct in sysfs to go to 100 (or whatever value
min_sysfs_pct in the "performance" limits is set to later).
That requires per-CPU limits to be initialized explicitly rather
than by copying the global limits to avoid setting min_sysfs_pct
in the per-CPU limits to 100.
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-03 01:29:12 +03:00
2016-10-25 00:20:25 +03:00
if ( cpu - > policy = = CPUFREQ_POLICY_PERFORMANCE ) {
2016-10-19 03:57:22 +03:00
/*
 * NOHZ_FULL CPUs need this as the governor callback may not
 * be invoked on them.
 */
intel_pstate_clear_update_util_hook ( policy - > cpu ) ;
intel_pstate_max_within_limits ( cpu ) ;
2017-06-24 08:11:54 +03:00
} else {
/* Any other policy: make sure the utilization update hook is installed. */
intel_pstate_set_update_util_hook ( policy - > cpu ) ;
2016-10-19 03:57:22 +03:00
}
2017-03-09 18:30:38 +03:00
if ( hwp_active )
2017-03-28 01:22:16 +03:00
intel_pstate_hwp_set ( policy - > cpu ) ;
2014-11-06 20:40:47 +03:00
2016-12-07 00:32:15 +03:00
mutex_unlock ( & intel_pstate_limits_lock ) ;
2013-02-06 21:02:13 +04:00
return 0 ;
}
2017-03-23 02:00:47 +03:00
/*
 * intel_pstate_adjust_policy_max - lift a policy maximum that sits in the
 * turbo range.
 * @policy: policy whose max may be rewritten.
 * @cpu: per-CPU data supplying the P-state figures.
 *
 * If pstate.max_pstate_physical exceeds pstate.max_pstate and policy->max
 * lies strictly between pstate.max_freq and policy->cpuinfo.max_freq, then
 * policy->max is raised to policy->cpuinfo.max_freq.
 */
static void intel_pstate_adjust_policy_max ( struct cpufreq_policy * policy ,
struct cpudata * cpu )
{
/* NOTE(review): the first condition presumably detects an active config TDP - confirm. */
if ( cpu - > pstate . max_pstate_physical > cpu - > pstate . max_pstate & &
policy - > max < policy - > cpuinfo . max_freq & &
policy - > max > cpu - > pstate . max_freq ) {
pr_debug ( " policy->max > max non turbo frequency \n " ) ;
policy - > max = policy - > cpuinfo . max_freq ;
}
}
2013-02-06 21:02:13 +04:00
static int intel_pstate_verify_policy ( struct cpufreq_policy * policy )
{
2017-01-18 21:48:23 +03:00
struct cpudata * cpu = all_cpu_data [ policy - > cpu ] ;
update_turbo_state ( ) ;
2017-03-23 02:00:47 +03:00
cpufreq_verify_within_limits ( policy , policy - > cpuinfo . min_freq ,
intel_pstate_get_max_freq ( cpu ) ) ;
2013-02-06 21:02:13 +04:00
2014-07-18 19:37:21 +04:00
if ( policy - > policy ! = CPUFREQ_POLICY_POWERSAVE & &
2014-07-18 19:37:23 +04:00
policy - > policy ! = CPUFREQ_POLICY_PERFORMANCE )
2013-02-06 21:02:13 +04:00
return - EINVAL ;
2017-03-23 02:00:47 +03:00
intel_pstate_adjust_policy_max ( policy , cpu ) ;
2013-02-06 21:02:13 +04:00
return 0 ;
}
2016-11-18 01:34:17 +03:00
static void intel_cpufreq_stop_cpu ( struct cpufreq_policy * policy )
{
intel_pstate_set_min_pstate ( all_cpu_data [ policy - > cpu ] ) ;
}
2014-03-19 19:45:54 +04:00
static void intel_pstate_stop_cpu ( struct cpufreq_policy * policy )
2013-02-06 21:02:13 +04:00
{
2016-11-18 01:34:17 +03:00
pr_debug ( " CPU %d exiting \n " , policy - > cpu ) ;
2013-02-06 21:02:13 +04:00
2016-11-18 01:34:17 +03:00
intel_pstate_clear_update_util_hook ( policy - > cpu ) ;
2016-12-07 00:32:16 +03:00
if ( hwp_active )
intel_pstate_hwp_save_state ( policy ) ;
else
2016-11-18 01:34:17 +03:00
intel_cpufreq_stop_cpu ( policy ) ;
}
2014-03-19 19:45:54 +04:00
2016-11-18 01:34:17 +03:00
static int intel_pstate_cpu_exit ( struct cpufreq_policy * policy )
{
intel_pstate_exit_perf_limits ( policy ) ;
2016-02-05 03:45:30 +03:00
2016-11-18 01:34:17 +03:00
policy - > fast_switch_possible = false ;
2014-11-06 20:40:47 +03:00
2016-11-18 01:34:17 +03:00
return 0 ;
2013-02-06 21:02:13 +04:00
}
2016-11-18 01:34:17 +03:00
static int __intel_pstate_cpu_init ( struct cpufreq_policy * policy )
2013-02-06 21:02:13 +04:00
{
struct cpudata * cpu ;
2013-10-15 22:06:14 +04:00
int rc ;
2013-02-06 21:02:13 +04:00
rc = intel_pstate_init_cpu ( policy - > cpu ) ;
if ( rc )
return rc ;
cpu = all_cpu_data [ policy - > cpu ] ;
2017-06-13 02:30:27 +03:00
cpu - > max_perf_ratio = 0xFF ;
cpu - > min_perf_ratio = 0 ;
2013-02-06 21:02:13 +04:00
2014-10-13 19:37:43 +04:00
policy - > min = cpu - > pstate . min_pstate * cpu - > pstate . scaling ;
policy - > max = cpu - > pstate . turbo_pstate * cpu - > pstate . scaling ;
2013-02-06 21:02:13 +04:00
/* cpuinfo and default policy values */
2014-10-13 19:37:43 +04:00
policy - > cpuinfo . min_freq = cpu - > pstate . min_pstate * cpu - > pstate . scaling ;
2016-06-08 03:38:53 +03:00
update_turbo_state ( ) ;
cpufreq: intel_pstate: One set of global limits in active mode
In the active mode intel_pstate currently uses two sets of global
limits, each associated with one of the possible scaling_governor
settings in that mode: "powersave" or "performance".
The driver switches over from one of those sets to the other
depending on the scaling_governor setting for the last CPU whose
per-policy cpufreq interface in sysfs was last used to change
parameters exposed in there. That obviously leads to no end of
issues when the scaling_governor settings differ between CPUs.
The most recent issue was introduced by commit a240c4aa5d0f (cpufreq:
intel_pstate: Do not reinit performance limits in ->setpolicy)
that eliminated the reinitialization of "performance" limits in
intel_pstate_set_policy() preventing the max limit from being set
to anything below 100, among other things.
Namely, an undesirable side effect of commit a240c4aa5d0f is that
now, after setting scaling_governor to "performance" in the active
mode, the per-policy limits for the CPU in question go to the highest
level and stay there even when it is switched back to "powersave"
later.
As it turns out, some distributions set scaling_governor to
"performance" temporarily for all CPUs to speed-up system
initialization, so that change causes them to misbehave later.
To fix that, get rid of the performance/powersave global limits
split and use just one set of global limits for everything.
From the user's persepctive, after this modification, when
scaling_governor is switched from "performance" to "powersave"
or the other way around on one CPU, the limits settings (ie. the
global max/min_perf_pct and per-policy scaling_max/min_freq for
any CPUs) will not change. Still, switching from "performance"
to "powersave" or the other way around changes the way in which
P-states are selected and in particular "performance" causes the
driver to always request the highest P-state it is allowed to ask
for for the given CPU.
Fixes: a240c4aa5d0f (cpufreq: intel_pstate: Do not reinit performance limits in ->setpolicy)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-18 02:57:39 +03:00
policy - > cpuinfo . max_freq = global . turbo_disabled ?
2016-06-08 03:38:53 +03:00
cpu - > pstate . max_pstate : cpu - > pstate . turbo_pstate ;
policy - > cpuinfo . max_freq * = cpu - > pstate . scaling ;
2016-04-28 01:48:06 +03:00
intel_pstate_init_acpi_perf_limits ( policy ) ;
2013-02-06 21:02:13 +04:00
2016-11-18 01:34:17 +03:00
policy - > fast_switch_possible = true ;
2013-02-06 21:02:13 +04:00
return 0 ;
}
2016-11-18 01:34:17 +03:00
static int intel_pstate_cpu_init ( struct cpufreq_policy * policy )
2016-04-28 01:48:06 +03:00
{
2016-11-18 01:34:17 +03:00
int ret = __intel_pstate_cpu_init ( policy ) ;
if ( ret )
return ret ;
cpufreq: intel_pstate: One set of global limits in active mode
In the active mode intel_pstate currently uses two sets of global
limits, each associated with one of the possible scaling_governor
settings in that mode: "powersave" or "performance".
The driver switches over from one of those sets to the other
depending on the scaling_governor setting for the last CPU whose
per-policy cpufreq interface in sysfs was last used to change
parameters exposed in there. That obviously leads to no end of
issues when the scaling_governor settings differ between CPUs.
The most recent issue was introduced by commit a240c4aa5d0f (cpufreq:
intel_pstate: Do not reinit performance limits in ->setpolicy)
that eliminated the reinitialization of "performance" limits in
intel_pstate_set_policy() preventing the max limit from being set
to anything below 100, among other things.
Namely, an undesirable side effect of commit a240c4aa5d0f is that
now, after setting scaling_governor to "performance" in the active
mode, the per-policy limits for the CPU in question go to the highest
level and stay there even when it is switched back to "powersave"
later.
As it turns out, some distributions set scaling_governor to
"performance" temporarily for all CPUs to speed-up system
initialization, so that change causes them to misbehave later.
To fix that, get rid of the performance/powersave global limits
split and use just one set of global limits for everything.
From the user's persepctive, after this modification, when
scaling_governor is switched from "performance" to "powersave"
or the other way around on one CPU, the limits settings (ie. the
global max/min_perf_pct and per-policy scaling_max/min_freq for
any CPUs) will not change. Still, switching from "performance"
to "powersave" or the other way around changes the way in which
P-states are selected and in particular "performance" causes the
driver to always request the highest P-state it is allowed to ask
for for the given CPU.
Fixes: a240c4aa5d0f (cpufreq: intel_pstate: Do not reinit performance limits in ->setpolicy)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-18 02:57:39 +03:00
if ( IS_ENABLED ( CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE ) )
2016-11-18 01:34:17 +03:00
policy - > policy = CPUFREQ_POLICY_PERFORMANCE ;
else
policy - > policy = CPUFREQ_POLICY_POWERSAVE ;
2016-04-28 01:48:06 +03:00
return 0 ;
}
2016-11-18 01:34:17 +03:00
static struct cpufreq_driver intel_pstate = {
2013-02-06 21:02:13 +04:00
. flags = CPUFREQ_CONST_LOOPS ,
. verify = intel_pstate_verify_policy ,
. setpolicy = intel_pstate_set_policy ,
2016-12-07 00:32:16 +03:00
. suspend = intel_pstate_hwp_save_state ,
2016-11-25 03:07:10 +03:00
. resume = intel_pstate_resume ,
2013-02-06 21:02:13 +04:00
. init = intel_pstate_cpu_init ,
2016-04-28 01:48:06 +03:00
. exit = intel_pstate_cpu_exit ,
2014-03-19 19:45:54 +04:00
. stop_cpu = intel_pstate_stop_cpu ,
2013-02-06 21:02:13 +04:00
. name = " intel_pstate " ,
} ;
2016-11-18 01:34:17 +03:00
static int intel_cpufreq_verify_policy ( struct cpufreq_policy * policy )
{
struct cpudata * cpu = all_cpu_data [ policy - > cpu ] ;
update_turbo_state ( ) ;
2017-03-23 02:00:47 +03:00
cpufreq_verify_within_limits ( policy , policy - > cpuinfo . min_freq ,
intel_pstate_get_max_freq ( cpu ) ) ;
2016-11-18 01:34:17 +03:00
2017-03-23 02:00:47 +03:00
intel_pstate_adjust_policy_max ( policy , cpu ) ;
2016-11-18 01:34:17 +03:00
cpufreq: intel_pstate: Active mode P-state limits rework
The coordination of P-state limits used by intel_pstate in the active
mode (ie. by default) is problematic, because it synchronizes all of
the limits (ie. the global ones and the per-policy ones) so as to use
one common pair of P-state limits (min and max) across all CPUs in
the system. The drawbacks of that are as follows:
- If P-states are coordinated in hardware, it is not necessary
to coordinate them in software on top of that, so in that case
all of the above activity is in vain.
- If P-states are not coordinated in hardware, then the processor
is actually capable of setting different P-states for different
CPUs and coordinating them at the software level simply doesn't
allow that capability to be utilized.
- The coordination works in such a way that setting a per-policy
limit (eg. scaling_max_freq) for one CPU causes the common
effective limit to change (and it will affect all of the other
CPUs too), but subsequent reads from the corresponding sysfs
attributes for the other CPUs will return stale values (which
is confusing).
- Reads from the global P-state limit attributes, min_perf_pct and
max_perf_pct, return the effective common values and not the last
values set through these attributes. However, the last values
set through these attributes become hard limits that cannot be
exceeded by writes to scaling_min_freq and scaling_max_freq,
respectively, and they are not exposed, so essentially users
have to remember what they are.
All of that is painful enough to warrant a change of the management
of P-state limits in the active mode.
To that end, redesign the active mode P-state limits management in
intel_pstate in accordance with the following rules:
(1) All CPUs are affected by the global limits (that is, none of
them can be requested to run faster than the global max and
none of them can be requested to run slower than the global
min).
(2) Each individual CPU is affected by its own per-policy limits
(that is, it cannot be requested to run faster than its own
per-policy max and it cannot be requested to run slower than
its own per-policy min).
(3) The global and per-policy limits can be set independently.
Also, the global maximum and minimum P-state limits will be always
expressed as percentages of the maximum supported turbo P-state.
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-23 01:58:57 +03:00
intel_pstate_update_perf_limits ( policy , cpu ) ;
2016-11-18 01:34:17 +03:00
return 0 ;
}
static int intel_cpufreq_target ( struct cpufreq_policy * policy ,
unsigned int target_freq ,
unsigned int relation )
{
struct cpudata * cpu = all_cpu_data [ policy - > cpu ] ;
struct cpufreq_freqs freqs ;
int target_pstate ;
2017-03-22 00:19:07 +03:00
update_turbo_state ( ) ;
2016-11-18 01:34:17 +03:00
freqs . old = policy - > cur ;
2017-03-22 00:19:07 +03:00
freqs . new = target_freq ;
2016-11-18 01:34:17 +03:00
cpufreq_freq_transition_begin ( policy , & freqs ) ;
switch ( relation ) {
case CPUFREQ_RELATION_L :
target_pstate = DIV_ROUND_UP ( freqs . new , cpu - > pstate . scaling ) ;
break ;
case CPUFREQ_RELATION_H :
target_pstate = freqs . new / cpu - > pstate . scaling ;
break ;
default :
target_pstate = DIV_ROUND_CLOSEST ( freqs . new , cpu - > pstate . scaling ) ;
break ;
}
target_pstate = intel_pstate_prepare_request ( cpu , target_pstate ) ;
if ( target_pstate ! = cpu - > pstate . current_pstate ) {
cpu - > pstate . current_pstate = target_pstate ;
wrmsrl_on_cpu ( policy - > cpu , MSR_IA32_PERF_CTL ,
pstate_funcs . get_val ( cpu , target_pstate ) ) ;
}
2017-03-04 01:51:31 +03:00
freqs . new = target_pstate * cpu - > pstate . scaling ;
2016-11-18 01:34:17 +03:00
cpufreq_freq_transition_end ( policy , & freqs , false ) ;
return 0 ;
}
static unsigned int intel_cpufreq_fast_switch ( struct cpufreq_policy * policy ,
unsigned int target_freq )
{
struct cpudata * cpu = all_cpu_data [ policy - > cpu ] ;
int target_pstate ;
2017-03-22 00:19:07 +03:00
update_turbo_state ( ) ;
2016-11-18 01:34:17 +03:00
target_pstate = DIV_ROUND_UP ( target_freq , cpu - > pstate . scaling ) ;
2017-03-04 01:51:31 +03:00
target_pstate = intel_pstate_prepare_request ( cpu , target_pstate ) ;
2016-11-18 01:34:17 +03:00
intel_pstate_update_pstate ( cpu , target_pstate ) ;
2017-03-04 01:51:31 +03:00
return target_pstate * cpu - > pstate . scaling ;
2016-11-18 01:34:17 +03:00
}
static int intel_cpufreq_cpu_init ( struct cpufreq_policy * policy )
{
int ret = __intel_pstate_cpu_init ( policy ) ;
if ( ret )
return ret ;
policy - > cpuinfo . transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY ;
2017-04-11 01:20:41 +03:00
policy - > transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY ;
2016-11-18 01:34:17 +03:00
/* This reflects the intel_pstate_get_cpu_pstates() setting. */
policy - > cur = policy - > cpuinfo . min_freq ;
return 0 ;
}
/*
 * cpufreq driver whose registration is reported as "passive" by
 * intel_pstate_show_status().  It provides ->target() and ->fast_switch()
 * instead of ->setpolicy(), and shares ->exit() with the intel_pstate
 * driver.
 */
static struct cpufreq_driver intel_cpufreq = {
	.flags		= CPUFREQ_CONST_LOOPS,
	.verify		= intel_cpufreq_verify_policy,
	.target		= intel_cpufreq_target,
	.fast_switch	= intel_cpufreq_fast_switch,
	.init		= intel_cpufreq_cpu_init,
	.exit		= intel_pstate_cpu_exit,
	.stop_cpu	= intel_cpufreq_stop_cpu,
	.name		= "intel_cpufreq",
};
2017-03-28 01:13:00 +03:00
/*
 * Driver registered by intel_pstate_init(); intel_pstate_setup() switches
 * it to &intel_cpufreq for the "passive" option.
 */
static struct cpufreq_driver *default_driver = &intel_pstate;
2016-11-18 01:34:17 +03:00
2017-01-05 04:53:12 +03:00
static void intel_pstate_driver_cleanup ( void )
{
unsigned int cpu ;
get_online_cpus ( ) ;
for_each_online_cpu ( cpu ) {
if ( all_cpu_data [ cpu ] ) {
if ( intel_pstate_driver = = & intel_pstate )
intel_pstate_clear_update_util_hook ( cpu ) ;
kfree ( all_cpu_data [ cpu ] ) ;
all_cpu_data [ cpu ] = NULL ;
}
}
put_online_cpus ( ) ;
2017-03-28 01:13:00 +03:00
intel_pstate_driver = NULL ;
2017-01-05 04:53:12 +03:00
}
2017-03-28 01:13:00 +03:00
static int intel_pstate_register_driver ( struct cpufreq_driver * driver )
2017-01-05 04:53:12 +03:00
{
int ret ;
cpufreq: intel_pstate: Active mode P-state limits rework
The coordination of P-state limits used by intel_pstate in the active
mode (ie. by default) is problematic, because it synchronizes all of
the limits (ie. the global ones and the per-policy ones) so as to use
one common pair of P-state limits (min and max) across all CPUs in
the system. The drawbacks of that are as follows:
- If P-states are coordinated in hardware, it is not necessary
to coordinate them in software on top of that, so in that case
all of the above activity is in vain.
- If P-states are not coordinated in hardware, then the processor
is actually capable of setting different P-states for different
CPUs and coordinating them at the software level simply doesn't
allow that capability to be utilized.
- The coordination works in such a way that setting a per-policy
limit (eg. scaling_max_freq) for one CPU causes the common
effective limit to change (and it will affect all of the other
CPUs too), but subsequent reads from the corresponding sysfs
attributes for the other CPUs will return stale values (which
is confusing).
- Reads from the global P-state limit attributes, min_perf_pct and
max_perf_pct, return the effective common values and not the last
values set through these attributes. However, the last values
set through these attributes become hard limits that cannot be
exceeded by writes to scaling_min_freq and scaling_max_freq,
respectively, and they are not exposed, so essentially users
have to remember what they are.
All of that is painful enough to warrant a change of the management
of P-state limits in the active mode.
To that end, redesign the active mode P-state limits management in
intel_pstate in accordance with the following rules:
(1) All CPUs are affected by the global limits (that is, none of
them can be requested to run faster than the global max and
none of them can be requested to run slower than the global
min).
(2) Each individual CPU is affected by its own per-policy limits
(that is, it cannot be requested to run faster than its own
per-policy max and it cannot be requested to run slower than
its own per-policy min).
(3) The global and per-policy limits can be set independently.
Also, the global maximum and minimum P-state limits will be always
expressed as percentages of the maximum supported turbo P-state.
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-23 01:58:57 +03:00
memset ( & global , 0 , sizeof ( global ) ) ;
global . max_perf_pct = 100 ;
2017-02-28 02:05:01 +03:00
2017-03-28 01:13:00 +03:00
intel_pstate_driver = driver ;
2017-01-05 04:53:12 +03:00
ret = cpufreq_register_driver ( intel_pstate_driver ) ;
if ( ret ) {
intel_pstate_driver_cleanup ( ) ;
return ret ;
}
cpufreq: intel_pstate: Active mode P-state limits rework
The coordination of P-state limits used by intel_pstate in the active
mode (ie. by default) is problematic, because it synchronizes all of
the limits (ie. the global ones and the per-policy ones) so as to use
one common pair of P-state limits (min and max) across all CPUs in
the system. The drawbacks of that are as follows:
- If P-states are coordinated in hardware, it is not necessary
to coordinate them in software on top of that, so in that case
all of the above activity is in vain.
- If P-states are not coordinated in hardware, then the processor
is actually capable of setting different P-states for different
CPUs and coordinating them at the software level simply doesn't
allow that capability to be utilized.
- The coordination works in such a way that setting a per-policy
limit (eg. scaling_max_freq) for one CPU causes the common
effective limit to change (and it will affect all of the other
CPUs too), but subsequent reads from the corresponding sysfs
attributes for the other CPUs will return stale values (which
is confusing).
- Reads from the global P-state limit attributes, min_perf_pct and
max_perf_pct, return the effective common values and not the last
values set through these attributes. However, the last values
set through these attributes become hard limits that cannot be
exceeded by writes to scaling_min_freq and scaling_max_freq,
respectively, and they are not exposed, so essentially users
have to remember what they are.
All of that is painful enough to warrant a change of the management
of P-state limits in the active mode.
To that end, redesign the active mode P-state limits management in
intel_pstate in accordance with the following rules:
(1) All CPUs are affected by the global limits (that is, none of
them can be requested to run faster than the global max and
none of them can be requested to run slower than the global
min).
(2) Each individual CPU is affected by its own per-policy limits
(that is, it cannot be requested to run faster than its own
per-policy max and it cannot be requested to run slower than
its own per-policy min).
(3) The global and per-policy limits can be set independently.
Also, the global maximum and minimum P-state limits will be always
expressed as percentages of the maximum supported turbo P-state.
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2017-03-23 01:58:57 +03:00
global . min_perf_pct = min_perf_pct_min ( ) ;
2017-01-05 04:53:12 +03:00
return 0 ;
}
/*
 * intel_pstate_unregister_driver - unregister the current driver.
 *
 * Refuses to do anything while hwp_active is set.  Otherwise unregisters
 * intel_pstate_driver from the cpufreq core and cleans up the per-CPU data.
 *
 * Return: 0 on success, -EBUSY when HWP is active.
 */
static int intel_pstate_unregister_driver(void)
{
	if (hwp_active)
		return -EBUSY;

	cpufreq_unregister_driver(intel_pstate_driver);
	intel_pstate_driver_cleanup();

	return 0;
}
/*
 * intel_pstate_show_status - format the driver status into @buf.
 * @buf: output buffer supplied by the caller.
 *
 * Writes "off" when no driver is registered, "active" when the registered
 * driver is intel_pstate and "passive" otherwise, each followed by a
 * newline.
 *
 * Return: the number of characters written, as returned by sprintf().
 */
static ssize_t intel_pstate_show_status(char *buf)
{
	if (!intel_pstate_driver)
		return sprintf(buf, "off\n");

	return sprintf(buf, "%s\n", intel_pstate_driver == &intel_pstate ?
					"active" : "passive");
}
static int intel_pstate_update_status ( const char * buf , size_t size )
{
int ret ;
if ( size = = 3 & & ! strncmp ( buf , " off " , size ) )
2017-03-28 01:13:00 +03:00
return intel_pstate_driver ?
2017-01-05 04:53:12 +03:00
intel_pstate_unregister_driver ( ) : - EINVAL ;
if ( size = = 6 & & ! strncmp ( buf , " active " , size ) ) {
2017-03-28 01:13:00 +03:00
if ( intel_pstate_driver ) {
2017-01-05 04:53:12 +03:00
if ( intel_pstate_driver = = & intel_pstate )
return 0 ;
ret = intel_pstate_unregister_driver ( ) ;
if ( ret )
return ret ;
}
2017-03-28 01:13:00 +03:00
return intel_pstate_register_driver ( & intel_pstate ) ;
2017-01-05 04:53:12 +03:00
}
if ( size = = 7 & & ! strncmp ( buf , " passive " , size ) ) {
2017-03-28 01:13:00 +03:00
if ( intel_pstate_driver ) {
2017-03-28 01:14:08 +03:00
if ( intel_pstate_driver = = & intel_cpufreq )
2017-01-05 04:53:12 +03:00
return 0 ;
ret = intel_pstate_unregister_driver ( ) ;
if ( ret )
return ret ;
}
2017-03-28 01:13:00 +03:00
return intel_pstate_register_driver ( & intel_cpufreq ) ;
2017-01-05 04:53:12 +03:00
}
return - EINVAL ;
}
2016-06-27 13:07:16 +03:00
/* "intel_pstate=disable": intel_pstate_init() returns -ENODEV right away. */
static int no_load __initdata;
/* "intel_pstate=no_hwp" or "passive": HWP is not activated during init. */
static int no_hwp __initdata;
/* "intel_pstate=hwp_only": init fails with -ENOTSUPP unless HWP is active. */
static int hwp_only __initdata;
/* "intel_pstate=force": a _PPC platform match no longer blocks loading. */
static unsigned int force_load __initdata;
2013-02-16 01:55:10 +04:00
2016-06-27 13:07:17 +03:00
/*
 * intel_pstate_msrs_not_valid - sanity-check the P-state callbacks.
 *
 * Calls ->get_max(), ->get_min() and ->get_turbo() in that order, stopping
 * at the first one that returns zero.
 *
 * Return: 0 when all three return non-zero, -ENODEV otherwise.
 */
static int __init intel_pstate_msrs_not_valid(void)
{
	if (pstate_funcs.get_max() &&
	    pstate_funcs.get_min() &&
	    pstate_funcs.get_turbo())
		return 0;

	return -ENODEV;
}
2013-10-21 20:20:34 +04:00
2016-06-27 13:07:17 +03:00
/*
 * copy_cpu_funcs - install a CPU model's callbacks into pstate_funcs.
 * @funcs: callback table to copy from.
 *
 * Copies the callbacks listed below one by one; any other member of
 * pstate_funcs is left as it was.
 */
static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
{
	pstate_funcs.get_max   = funcs->get_max;
	pstate_funcs.get_max_physical = funcs->get_max_physical;
	pstate_funcs.get_min   = funcs->get_min;
	pstate_funcs.get_turbo = funcs->get_turbo;
	pstate_funcs.get_scaling = funcs->get_scaling;
	pstate_funcs.get_val   = funcs->get_val;
	pstate_funcs.get_vid   = funcs->get_vid;
	pstate_funcs.get_aperf_mperf_shift = funcs->get_aperf_mperf_shift;
}
2016-04-28 01:48:06 +03:00
# ifdef CONFIG_ACPI
2013-10-31 19:24:05 +04:00
2016-06-27 13:07:17 +03:00
/*
 * intel_pstate_no_acpi_pss - check that no CPU exposes an ACPI _PSS package.
 *
 * Walks all possible CPUs that have an ACPI processor object and evaluates
 * _PSS on each.  CPUs without a processor object, or on which the
 * evaluation fails, are skipped.
 *
 * Return: false as soon as one evaluation yields a package object, true if
 * none does.
 */
static bool __init intel_pstate_no_acpi_pss(void)
{
	int i;

	for_each_possible_cpu(i) {
		acpi_status status;
		union acpi_object *pss;
		struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
		struct acpi_processor *pr = per_cpu(processors, i);

		if (!pr)
			continue;

		status = acpi_evaluate_object(pr->handle, "_PSS", NULL, &buffer);
		if (ACPI_FAILURE(status))
			continue;

		/* The result buffer is released on every path below. */
		pss = buffer.pointer;
		if (pss && pss->type == ACPI_TYPE_PACKAGE) {
			kfree(pss);
			return false;
		}

		kfree(pss);
	}

	return true;
}
2016-06-27 13:07:17 +03:00
/*
 * intel_pstate_has_acpi_ppc - check whether any CPU has an ACPI _PPC method.
 *
 * Return: true if at least one possible CPU with an ACPI processor object
 * has a _PPC method, false otherwise.
 */
static bool __init intel_pstate_has_acpi_ppc(void)
{
	int i;

	for_each_possible_cpu(i) {
		struct acpi_processor *pr = per_cpu(processors, i);

		if (!pr)
			continue;

		if (acpi_has_method(pr->handle, "_PPC"))
			return true;
	}

	return false;
}
/* Check selected by the .data field of a matching plat_info[] entry. */
enum {
	PSS,	/* decided by intel_pstate_no_acpi_pss() */
	PPC,	/* decided by intel_pstate_has_acpi_ppc() and force_load */
};
2013-10-31 19:24:05 +04:00
/*
 * Hardware vendor-specific info that has its own power management modes.
 *
 * Passed to acpi_match_platform_list() by
 * intel_pstate_platform_pwr_mgmt_exists(); the last field of the matching
 * entry (PSS or PPC) selects which ACPI check decides whether the driver
 * steps aside.  The table ends with an empty entry.
 *
 * NOTE(review): the whitespace inside the OEM ID / table ID string literals
 * below looks normalized in this copy of the file - confirm the exact
 * padding against the upstream table before relying on these values.
 */
2017-08-24 01:54:44 +03:00
static struct acpi_platform_list plat_info [ ] __initdata = {
{ " HP " , " ProLiant " , 0 , ACPI_SIG_FADT , all_versions , 0 , PSS } ,
{ " ORACLE " , " X4-2 " , 0 , ACPI_SIG_FADT , all_versions , 0 , PPC } ,
{ " ORACLE " , " X4-2L " , 0 , ACPI_SIG_FADT , all_versions , 0 , PPC } ,
{ " ORACLE " , " X4-2B " , 0 , ACPI_SIG_FADT , all_versions , 0 , PPC } ,
{ " ORACLE " , " X3-2 " , 0 , ACPI_SIG_FADT , all_versions , 0 , PPC } ,
{ " ORACLE " , " X3-2L " , 0 , ACPI_SIG_FADT , all_versions , 0 , PPC } ,
{ " ORACLE " , " X3-2B " , 0 , ACPI_SIG_FADT , all_versions , 0 , PPC } ,
{ " ORACLE " , " X4470M2 " , 0 , ACPI_SIG_FADT , all_versions , 0 , PPC } ,
{ " ORACLE " , " X4270M3 " , 0 , ACPI_SIG_FADT , all_versions , 0 , PPC } ,
{ " ORACLE " , " X4270M2 " , 0 , ACPI_SIG_FADT , all_versions , 0 , PPC } ,
{ " ORACLE " , " X4170M2 " , 0 , ACPI_SIG_FADT , all_versions , 0 , PPC } ,
{ " ORACLE " , " X4170 M3 " , 0 , ACPI_SIG_FADT , all_versions , 0 , PPC } ,
{ " ORACLE " , " X4275 M3 " , 0 , ACPI_SIG_FADT , all_versions , 0 , PPC } ,
{ " ORACLE " , " X6-2 " , 0 , ACPI_SIG_FADT , all_versions , 0 , PPC } ,
{ " ORACLE " , " Sudbury " , 0 , ACPI_SIG_FADT , all_versions , 0 , PPC } ,
{ } /* End */
2013-10-31 19:24:05 +04:00
} ;
2016-06-27 13:07:17 +03:00
/*
 * intel_pstate_platform_pwr_mgmt_exists - does firmware manage P-states?
 *
 * Two checks are made in order:
 *  1. On CPUs matching intel_pstate_cpu_oob_ids, bit 8 of
 *     MSR_MISC_PWR_MGMT being set means "yes".
 *  2. Otherwise the platform is looked up in plat_info[] and the check
 *     selected by the matching entry (PSS or PPC) decides.  A _PPC match is
 *     overridden by force_load.
 *
 * Return: true if the driver should not load on this platform.
 */
static bool __init intel_pstate_platform_pwr_mgmt_exists(void)
{
	u64 misc_pwr;
	int idx;

	if (x86_match_cpu(intel_pstate_cpu_oob_ids)) {
		rdmsrl(MSR_MISC_PWR_MGMT, misc_pwr);
		if (misc_pwr & (1 << 8))
			return true;
	}

	idx = acpi_match_platform_list(plat_info);
	if (idx < 0)
		return false;

	if (plat_info[idx].data == PSS)
		return intel_pstate_no_acpi_pss();

	if (plat_info[idx].data == PPC)
		return intel_pstate_has_acpi_ppc() && !force_load;

	return false;
}
2016-11-18 00:47:47 +03:00
/*
 * intel_pstate_request_control_from_smm - take over P-state control.
 *
 * It may be unsafe to request P-states control from SMM if _PPC support
 * has not been enabled, so the request is made only when acpi_ppc is set.
 */
static void intel_pstate_request_control_from_smm(void)
{
	if (!acpi_ppc)
		return;

	acpi_processor_pstate_control();
}
2013-10-31 19:24:05 +04:00
# else /* CONFIG_ACPI not enabled */
/* Without ACPI there is no firmware power management to yield to. */
static inline bool intel_pstate_platform_pwr_mgmt_exists(void)
{
	return false;
}

/* Without ACPI there is no _PPC method to find. */
static inline bool intel_pstate_has_acpi_ppc(void)
{
	return false;
}

/* Without ACPI there is nothing to request from SMM. */
static inline void intel_pstate_request_control_from_smm(void)
{
}
2013-10-31 19:24:05 +04:00
# endif /* CONFIG_ACPI */
2016-02-26 02:09:19 +03:00
/*
 * Match table used by intel_pstate_init(): Intel family 6 CPUs of any model
 * that advertise X86_FEATURE_HWP.
 */
static const struct x86_cpu_id hwp_support_ids[] __initconst = {
	{ X86_VENDOR_INTEL, 6, X86_MODEL_ANY, X86_FEATURE_HWP },
	{}
};
2013-02-06 21:02:13 +04:00
/*
 * intel_pstate_init - driver entry point, run as a device initcall.
 *
 * Sequence:
 *  - give up if loading was disabled on the command line (no_load);
 *  - on HWP-capable CPUs install core_funcs and, unless no_hwp is set, mark
 *    HWP active and skip the MSR sanity check; on other CPUs look up the
 *    model in intel_pstate_cpu_ids and install its callbacks;
 *  - give up if platform firmware has its own power management, or if
 *    hwp_only was requested but HWP is not active;
 *  - allocate all_cpu_data, request control from SMM, expose the sysfs
 *    parameters and register the default driver.
 *
 * Return: 0 on success; -ENODEV, -ENOTSUPP, -ENOMEM or the error from
 * intel_pstate_register_driver() otherwise.
 */
static int __init intel_pstate_init(void)
{
	int rc;

	if (no_load)
		return -ENODEV;

	if (x86_match_cpu(hwp_support_ids)) {
		copy_cpu_funcs(&core_funcs);
		if (!no_hwp) {
			hwp_active++;
			intel_pstate.attr = hwp_cpufreq_attrs;
			goto hwp_cpu_matched;
		}
	} else {
		const struct x86_cpu_id *id;

		id = x86_match_cpu(intel_pstate_cpu_ids);
		if (!id)
			return -ENODEV;

		copy_cpu_funcs((struct pstate_funcs *)id->driver_data);
	}

	if (intel_pstate_msrs_not_valid())
		return -ENODEV;

hwp_cpu_matched:
	/*
	 * The Intel pstate driver will be ignored if the platform
	 * firmware has its own power management modes.
	 */
	if (intel_pstate_platform_pwr_mgmt_exists())
		return -ENODEV;

	if (!hwp_active && hwp_only)
		return -ENOTSUPP;

	pr_info("Intel P-state driver initializing\n");

	all_cpu_data = vzalloc(sizeof(void *) * num_possible_cpus());
	if (!all_cpu_data)
		return -ENOMEM;

	intel_pstate_request_control_from_smm();

	intel_pstate_sysfs_expose_params();

	mutex_lock(&intel_pstate_driver_lock);
	rc = intel_pstate_register_driver(default_driver);
	mutex_unlock(&intel_pstate_driver_lock);
	if (rc)
		return rc;

	if (hwp_active)
		pr_info("HWP enabled\n");

	return 0;
}
device_initcall(intel_pstate_init);
2013-02-16 01:55:10 +04:00
/*
 * intel_pstate_setup - parse the "intel_pstate=" kernel command line option.
 * @str: option value, compared as a whole against each keyword.
 *
 * Recognized values:
 *   "disable"             - set no_load;
 *   "passive"             - make intel_cpufreq the default driver, set no_hwp;
 *   "no_hwp"              - set no_hwp;
 *   "force"               - set force_load;
 *   "hwp_only"            - set hwp_only;
 *   "per_cpu_perf_limits" - set per_cpu_limits;
 *   "support_acpi_ppc"    - set acpi_ppc (CONFIG_ACPI only).
 * Any other value is silently ignored.
 *
 * Return: 0, or -EINVAL when @str is NULL.
 */
static int __init intel_pstate_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "disable")) {
		no_load = 1;
	} else if (!strcmp(str, "passive")) {
		pr_info("Passive mode enabled\n");
		default_driver = &intel_cpufreq;
		no_hwp = 1;
	}
	if (!strcmp(str, "no_hwp")) {
		pr_info("HWP disabled\n");
		no_hwp = 1;
	}
	if (!strcmp(str, "force"))
		force_load = 1;
	if (!strcmp(str, "hwp_only"))
		hwp_only = 1;
	if (!strcmp(str, "per_cpu_perf_limits"))
		per_cpu_limits = true;

#ifdef CONFIG_ACPI
	if (!strcmp(str, "support_acpi_ppc"))
		acpi_ppc = true;
#endif

	return 0;
}
early_param("intel_pstate", intel_pstate_setup);
2013-02-06 21:02:13 +04:00
/* Module metadata. */
MODULE_AUTHOR("Dirk Brandewie <dirk.j.brandewie@intel.com>");
MODULE_DESCRIPTION("'intel_pstate' - P state driver Intel Core processors");
MODULE_LICENSE("GPL");