2019-06-04 11:11:33 +03:00
// SPDX-License-Identifier: GPL-2.0-only
2005-04-17 02:20:36 +04:00
/*
* drivers / cpufreq / cpufreq_ondemand . c
*
* Copyright ( C ) 2001 Russell King
* ( C ) 2003 Venkatesh Pallipadi < venkatesh . pallipadi @ intel . com > .
* Jun Nakajima < jun . nakajima @ intel . com >
*/
2012-10-26 02:47:42 +04:00
# define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2013-08-06 21:23:03 +04:00
# include <linux/cpu.h>
# include <linux/math64.h>
# include <linux/percpu-defs.h>
# include <linux/slab.h>
# include <linux/tick.h>
# include <linux/sched/cpufreq.h>

# include "cpufreq_ondemand.h"
2005-04-17 02:20:36 +04:00
2013-02-08 21:24:18 +04:00
/*
 * On-demand governor macros.
 *
 * NOTE(review): the code consuming these constants is outside this excerpt.
 * The DEF_/MIN_/MAX_ prefixes suggest defaults and tunable bounds, and the
 * *_UP_THRESHOLD values look like load percentages (od_update() compares the
 * load against dbs_data->up_threshold) -- confirm at the use sites.
 */
#define DEF_FREQUENCY_UP_THRESHOLD		(80)
#define DEF_SAMPLING_DOWN_FACTOR		(1)
#define MAX_SAMPLING_DOWN_FACTOR		(100000)
#define MICRO_FREQUENCY_UP_THRESHOLD		(95)
#define MICRO_FREQUENCY_MIN_SAMPLE_RATE		(10000)
#define MIN_FREQUENCY_UP_THRESHOLD		(1)
#define MAX_FREQUENCY_UP_THRESHOLD		(100)
2013-04-02 18:56:56 +04:00
/*
 * Governor callback table; dbs_freq_increase() dispatches through
 * od_ops.powersave_bias_target when powersave_bias is set.
 */
static struct od_ops od_ops;

/* NOTE(review): not referenced in the visible part of the file. */
static unsigned int default_powersave_bias;
2012-10-26 02:47:42 +04:00
/*
 * Not all CPUs want IO time to be accounted as busy; this depends on how
 * efficient idling at a higher frequency/voltage is.
 * Pavel Machek says this is not so for various generations of AMD and old
 * Intel systems.
 * Mike Chan (android.com) claims this is also not true for ARM.
 * Because of this, whitelist specific known (series) of CPUs by default, and
 * leave all others up to the user.
 *
 * Return: 1 when iowait should be counted as busy time by default on the
 * boot CPU, 0 otherwise (always 0 when CONFIG_X86 is not defined).
 */
static int should_io_be_busy(void)
{
#if defined(CONFIG_X86)
	/*
	 * For Intel, Core 2 (model 15) and later have an efficient idle.
	 */
	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
	    boot_cpu_data.x86 == 6 &&
	    boot_cpu_data.x86_model >= 15)
		return 1;
#endif
	return 0;
}
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, if ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8GHz is an exact match with a hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8GHz and next lower than 1.8GHz.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
/*
* Find right freq to be set now with powersave_bias on .
2016-02-15 04:22:13 +03:00
* Returns the freq_hi to be used right now and will set freq_hi_delay_us ,
* freq_lo , and freq_lo_delay_us in percpu area for averaging freqs .
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
*/
2013-04-02 18:56:56 +04:00
static unsigned int generic_powersave_bias_target ( struct cpufreq_policy * policy ,
2012-10-26 02:47:42 +04:00
unsigned int freq_next , unsigned int relation )
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
{
unsigned int freq_req , freq_reduc , freq_avg ;
unsigned int freq_hi , freq_lo ;
2016-06-03 08:28:51 +03:00
unsigned int index ;
2016-02-15 04:22:13 +03:00
unsigned int delay_hi_us ;
2016-02-07 18:24:26 +03:00
struct policy_dbs_info * policy_dbs = policy - > governor_data ;
2016-02-18 20:40:14 +03:00
struct od_policy_dbs_info * dbs_info = to_dbs_info ( policy_dbs ) ;
2016-02-07 18:24:26 +03:00
struct dbs_data * dbs_data = policy_dbs - > dbs_data ;
2013-03-27 19:58:58 +04:00
struct od_dbs_tuners * od_tuners = dbs_data - > tuners ;
2016-06-03 08:28:48 +03:00
struct cpufreq_frequency_table * freq_table = policy - > freq_table ;
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
2016-06-03 08:28:48 +03:00
if ( ! freq_table ) {
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
dbs_info - > freq_lo = 0 ;
2016-02-15 04:22:13 +03:00
dbs_info - > freq_lo_delay_us = 0 ;
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
return freq_next ;
}
2016-06-03 08:28:51 +03:00
index = cpufreq_frequency_table_target ( policy , freq_next , relation ) ;
2016-06-03 08:28:48 +03:00
freq_req = freq_table [ index ] . frequency ;
2013-03-27 19:58:58 +04:00
freq_reduc = freq_req * od_tuners - > powersave_bias / 1000 ;
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
freq_avg = freq_req - freq_reduc ;
/* Find freq bounds for freq_avg in freq_table */
2021-09-08 17:05:28 +03:00
index = cpufreq_table_find_index_h ( policy , freq_avg ,
relation & CPUFREQ_RELATION_E ) ;
2016-06-03 08:28:48 +03:00
freq_lo = freq_table [ index ] . frequency ;
2021-09-08 17:05:28 +03:00
index = cpufreq_table_find_index_l ( policy , freq_avg ,
relation & CPUFREQ_RELATION_E ) ;
2016-06-03 08:28:48 +03:00
freq_hi = freq_table [ index ] . frequency ;
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
/* Find out how long we have to be in hi and lo freqs */
if ( freq_hi = = freq_lo ) {
dbs_info - > freq_lo = 0 ;
2016-02-15 04:22:13 +03:00
dbs_info - > freq_lo_delay_us = 0 ;
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
return freq_lo ;
}
2016-02-15 04:22:13 +03:00
delay_hi_us = ( freq_avg - freq_lo ) * dbs_data - > sampling_rate ;
delay_hi_us + = ( freq_hi - freq_lo ) / 2 ;
delay_hi_us / = freq_hi - freq_lo ;
dbs_info - > freq_hi_delay_us = delay_hi_us ;
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
dbs_info - > freq_lo = freq_lo ;
2016-02-15 04:22:13 +03:00
dbs_info - > freq_lo_delay_us = dbs_data - > sampling_rate - delay_hi_us ;
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
return freq_hi ;
}
2016-02-18 04:28:24 +03:00
static void ondemand_powersave_bias_init ( struct cpufreq_policy * policy )
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
{
2016-02-18 20:40:14 +03:00
struct od_policy_dbs_info * dbs_info = to_dbs_info ( policy - > governor_data ) ;
2016-02-18 04:28:24 +03:00
dbs_info - > freq_lo = 0 ;
[CPUFREQ][2/2] ondemand: updated add powersave_bias tunable
ondemand selects the minimum frequency that can retire
a workload with negligible idle time -- ideally resulting in the highest
performance/power efficiency with negligible performance impact.
But on some systems and some workloads, this algorithm
is more performance biased than necessary, and
de-tuning it a bit to allow some performance impact
can save measurable power.
This patch adds a "powersave_bias" tunable to ondemand
to allow it to reduce its target frequency by a specified percent.
By default, the powersave_bias is 0 and has no effect.
powersave_bias is in units of 0.1%, so it has an effective range
of 1 through 1000, resulting in 0.1% to 100% impact.
In practice, users will not be able to detect a difference between
0.1% increments, but 1.0% increments turned out to be too large.
Also, the max value of 1000 (100%) would simply peg the system
in its deepest power saving P-state, unless the processor really has
a hardware P-state at 0Hz:-)
For example, If ondemand requests 2.0GHz based on utilization,
and powersave_bias=100, this code will knock 10% off the target
and seek a target of 1.8GHz instead of 2.0GHz until the
next sampling. If 1.8 is an exact match with an hardware frequency
we use it, otherwise we average our time between the frequency
next higher than 1.8 and next lower than 1.8.
Note that a user or administrative program can change powersave_bias
at run-time depending on how they expect the system to be used.
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi at intel.com>
Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy at intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
2006-07-31 22:28:12 +04:00
}
2013-08-06 21:23:05 +04:00
static void dbs_freq_increase ( struct cpufreq_policy * policy , unsigned int freq )
2012-10-26 02:47:42 +04:00
{
2016-02-07 18:24:26 +03:00
struct policy_dbs_info * policy_dbs = policy - > governor_data ;
struct dbs_data * dbs_data = policy_dbs - > dbs_data ;
2013-03-27 19:58:58 +04:00
struct od_dbs_tuners * od_tuners = dbs_data - > tuners ;
if ( od_tuners - > powersave_bias )
2013-08-06 21:23:05 +04:00
freq = od_ops . powersave_bias_target ( policy , freq ,
2021-09-08 17:05:29 +03:00
CPUFREQ_RELATION_HE ) ;
2013-08-06 21:23:05 +04:00
else if ( policy - > cur = = policy - > max )
2012-10-26 02:47:42 +04:00
return ;
2009-07-24 17:25:06 +04:00
2013-08-06 21:23:05 +04:00
__cpufreq_driver_target ( policy , freq , od_tuners - > powersave_bias ?
2021-09-08 17:05:29 +03:00
CPUFREQ_RELATION_LE : CPUFREQ_RELATION_HE ) ;
2012-10-26 02:47:42 +04:00
}
/*
 * Every sampling_rate, we check whether the current idle time is less than
 * 20% (default); if so, we try to increase the frequency. Otherwise, we
 * adjust the frequency proportional to load.
 */
2016-02-15 04:19:31 +03:00
static void od_update ( struct cpufreq_policy * policy )
2005-04-17 02:20:36 +04:00
{
2016-02-18 20:40:14 +03:00
struct policy_dbs_info * policy_dbs = policy - > governor_data ;
struct od_policy_dbs_info * dbs_info = to_dbs_info ( policy_dbs ) ;
2016-02-07 18:24:26 +03:00
struct dbs_data * dbs_data = policy_dbs - > dbs_data ;
2013-03-27 19:58:58 +04:00
struct od_dbs_tuners * od_tuners = dbs_data - > tuners ;
2016-02-15 04:19:31 +03:00
unsigned int load = dbs_update ( policy ) ;
2012-10-26 02:47:42 +04:00
dbs_info - > freq_lo = 0 ;
/* Check for frequency increase */
2016-02-09 06:31:32 +03:00
if ( load > dbs_data - > up_threshold ) {
2012-10-26 02:47:42 +04:00
/* If switching to max speed, apply sampling_down_factor */
if ( policy - > cur < policy - > max )
2016-02-15 04:20:51 +03:00
policy_dbs - > rate_mult = dbs_data - > sampling_down_factor ;
2012-10-26 02:47:42 +04:00
dbs_freq_increase ( policy , policy - > max ) ;
2013-06-05 20:01:25 +04:00
} else {
/* Calculate the next frequency proportional to load */
cpufreq: ondemand: Eliminate the deadband effect
Currently, ondemand calculates the target frequency proportional to load
using the formula:
Target frequency = C * load
where C = policy->cpuinfo.max_freq / 100
Though, in many cases, the minimum available frequency is pretty high and
the above calculation introduces a dead band from load 0 to
100 * policy->cpuinfo.min_freq / policy->cpuinfo.max_freq where the target
frequency is always calculated to less than policy->cpuinfo.min_freq and
the minimum frequency is selected.
For example: on Intel i7-3770 @ 3.4GHz the policy->cpuinfo.min_freq = 1600000
and the policy->cpuinfo.max_freq = 3400000 (without turbo). Thus, the CPU
starts to scale up at a load above 47.
On quad core 1500MHz Krait the policy->cpuinfo.min_freq = 384000
and the policy->cpuinfo.max_freq = 1512000. Thus, the CPU starts to scale
at load above 25.
Change the calculation of target frequency to eliminate the above effect using
the formula:
Target frequency = A + B * load
where A = policy->cpuinfo.min_freq and
B = (policy->cpuinfo.max_freq - policy->cpuinfo->min_freq) / 100
This will map load values 0 to 100 linearly to cpuinfo.min_freq to
cpuinfo.max_freq.
Also, use the CPUFREQ_RELATION_C in __cpufreq_driver_target to select the
closest frequency in frequency_table. This is necessary to avoid selection
of minimum frequency only when load equals to 0. It will also help for selection
of frequencies using a more 'fair' criterion.
Tables below show the difference in selected frequency for specific values
of load without and with this patch. On Intel i7-3770 @ 3.40GHz:
Without With
Load Target Selected Target Selected
0 0 1600000 1600000 1600000
5 170050 1600000 1690050 1700000
10 340100 1600000 1780100 1700000
15 510150 1600000 1870150 1900000
20 680200 1600000 1960200 2000000
25 850250 1600000 2050250 2100000
30 1020300 1600000 2140300 2100000
35 1190350 1600000 2230350 2200000
40 1360400 1600000 2320400 2400000
45 1530450 1600000 2410450 2400000
50 1700500 1900000 2500500 2500000
55 1870550 1900000 2590550 2600000
60 2040600 2100000 2680600 2600000
65 2210650 2400000 2770650 2800000
70 2380700 2400000 2860700 2800000
75 2550750 2600000 2950750 3000000
80 2720800 2800000 3040800 3000000
85 2890850 2900000 3130850 3100000
90 3060900 3100000 3220900 3300000
95 3230950 3300000 3310950 3300000
100 3401000 3401000 3401000 3401000
On ARM quad core 1500MHz Krait:
Without With
Load Target Selected Target Selected
0 0 384000 384000 384000
5 75600 384000 440400 486000
10 151200 384000 496800 486000
15 226800 384000 553200 594000
20 302400 384000 609600 594000
25 378000 384000 666000 702000
30 453600 486000 722400 702000
35 529200 594000 778800 810000
40 604800 702000 835200 810000
45 680400 702000 891600 918000
50 756000 810000 948000 918000
55 831600 918000 1004400 1026000
60 907200 918000 1060800 1026000
65 982800 1026000 1117200 1134000
70 1058400 1134000 1173600 1134000
75 1134000 1134000 1230000 1242000
80 1209600 1242000 1286400 1242000
85 1285200 1350000 1342800 1350000
90 1360800 1458000 1399200 1350000
95 1436400 1458000 1455600 1458000
100 1512000 1512000 1512000 1512000
Tested on Intel i7-3770 CPU @ 3.40GHz and on ARM quad core 1500MHz Krait
(Android smartphone).
Benchmarks on Intel i7 shows a performance improvement on low and medium
work loads with lower power consumption. Specifics:
Phoronix Linux Kernel Compilation 3.1:
Time: -0.40%, energy: -0.07%
Phoronix Apache:
Time: -4.98%, energy: -2.35%
Phoronix FFMPEG:
Time: -6.29%, energy: -4.02%
Also, running mp3 decoding (very low load) shows no differences with and
without this patch.
Signed-off-by: Stratos Karafotis <stratosk@semaphore.gr>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2014-06-30 20:59:34 +04:00
unsigned int freq_next , min_f , max_f ;
min_f = policy - > cpuinfo . min_freq ;
max_f = policy - > cpuinfo . max_freq ;
freq_next = min_f + load * ( max_f - min_f ) / 100 ;
2012-10-26 02:47:42 +04:00
/* No longer fully busy, reset rate_mult */
2016-02-15 04:20:51 +03:00
policy_dbs - > rate_mult = 1 ;
2012-10-26 02:47:42 +04:00
2016-02-16 23:02:24 +03:00
if ( od_tuners - > powersave_bias )
freq_next = od_ops . powersave_bias_target ( policy ,
freq_next ,
2021-09-08 17:05:29 +03:00
CPUFREQ_RELATION_LE ) ;
2016-02-16 23:02:24 +03:00
2021-09-08 17:05:29 +03:00
__cpufreq_driver_target ( policy , freq_next , CPUFREQ_RELATION_CE ) ;
2012-10-26 02:47:42 +04:00
}
2005-04-17 02:20:36 +04:00
}
2016-11-08 08:36:33 +03:00
static unsigned int od_dbs_update ( struct cpufreq_policy * policy )
2012-10-26 02:47:42 +04:00
{
2016-02-07 18:24:26 +03:00
struct policy_dbs_info * policy_dbs = policy - > governor_data ;
struct dbs_data * dbs_data = policy_dbs - > dbs_data ;
2016-02-18 20:40:14 +03:00
struct od_policy_dbs_info * dbs_info = to_dbs_info ( policy_dbs ) ;
2016-02-15 04:21:35 +03:00
int sample_type = dbs_info - > sample_type ;
2013-01-31 21:28:02 +04:00
2012-10-26 02:47:42 +04:00
/* Common NORMAL_SAMPLE setup */
2015-07-18 09:01:00 +03:00
dbs_info - > sample_type = OD_NORMAL_SAMPLE ;
2016-02-15 04:19:31 +03:00
/*
* OD_SUB_SAMPLE doesn ' t make sense if sample_delay_ns is 0 , so ignore
* it then .
*/
if ( sample_type = = OD_SUB_SAMPLE & & policy_dbs - > sample_delay_ns > 0 ) {
2015-07-18 09:01:00 +03:00
__cpufreq_driver_target ( policy , dbs_info - > freq_lo ,
2021-09-08 17:05:29 +03:00
CPUFREQ_RELATION_HE ) ;
2016-02-15 04:22:13 +03:00
return dbs_info - > freq_lo_delay_us ;
2016-02-15 04:21:35 +03:00
}
od_update ( policy ) ;
if ( dbs_info - > freq_lo ) {
2016-11-08 08:36:33 +03:00
/* Setup SUB_SAMPLE */
2016-02-15 04:21:35 +03:00
dbs_info - > sample_type = OD_SUB_SAMPLE ;
2016-02-15 04:22:13 +03:00
return dbs_info - > freq_hi_delay_us ;
2012-10-26 02:47:42 +04:00
}
2016-02-15 04:22:13 +03:00
return dbs_data - > sampling_rate * policy_dbs - > rate_mult ;
2012-12-27 18:55:40 +04:00
}
2012-10-26 02:47:42 +04:00
/************************** sysfs interface ************************/
2016-02-07 18:05:07 +03:00
/* Forward declaration; the governor object itself is defined further down in this file. */
static struct dbs_governor od_dbs_gov ;
2005-04-17 02:20:36 +04:00
2016-03-22 04:47:51 +03:00
/*
 * sysfs store handler for the io_is_busy tunable.
 *
 * Parses an unsigned integer from @buf and stores it as a 0/1 flag.
 * Returns @count on success or -EINVAL if no number could be parsed.
 */
static ssize_t store_io_is_busy(struct gov_attr_set *attr_set, const char *buf,
				size_t count)
{
	struct dbs_data *dbs_data = to_dbs_data(attr_set);
	unsigned int value;

	if (sscanf(buf, " %u ", &value) != 1)
		return -EINVAL;

	dbs_data->io_is_busy = value != 0;

	/* prev_cpu_idle has to be re-evaluated under the new setting. */
	gov_update_cpu_data(dbs_data);

	return count;
}
2016-03-22 04:47:51 +03:00
/*
 * sysfs store handler for the up_threshold tunable.
 *
 * Accepts values in [MIN_FREQUENCY_UP_THRESHOLD, MAX_FREQUENCY_UP_THRESHOLD].
 * Returns @count on success or -EINVAL for unparsable or out-of-range input.
 */
static ssize_t store_up_threshold(struct gov_attr_set *attr_set,
				  const char *buf, size_t count)
{
	struct dbs_data *dbs_data = to_dbs_data(attr_set);
	unsigned int value;

	if (sscanf(buf, " %u ", &value) != 1)
		return -EINVAL;

	if (value < MIN_FREQUENCY_UP_THRESHOLD ||
	    value > MAX_FREQUENCY_UP_THRESHOLD)
		return -EINVAL;

	dbs_data->up_threshold = value;
	return count;
}
2016-03-22 04:47:51 +03:00
/*
 * sysfs store handler for the sampling_down_factor tunable.
 *
 * Accepts values in [1, MAX_SAMPLING_DOWN_FACTOR] and resets rate_mult on
 * every policy attached to @attr_set.
 * Returns @count on success or -EINVAL for unparsable or out-of-range input.
 */
static ssize_t store_sampling_down_factor(struct gov_attr_set *attr_set,
					  const char *buf, size_t count)
{
	struct dbs_data *dbs_data = to_dbs_data(attr_set);
	struct policy_dbs_info *policy_dbs;
	unsigned int value;

	if (sscanf(buf, " %u ", &value) != 1)
		return -EINVAL;

	if (value < 1 || value > MAX_SAMPLING_DOWN_FACTOR)
		return -EINVAL;

	dbs_data->sampling_down_factor = value;

	/* The down-sampling multiplier may currently be active: reset it. */
	list_for_each_entry(policy_dbs, &attr_set->policy_list, list) {
		/*
		 * Take update_mutex: without it od_update() and
		 * od_dbs_update() might end up using different rate_mult
		 * values.
		 */
		mutex_lock(&policy_dbs->update_mutex);
		policy_dbs->rate_mult = 1;
		mutex_unlock(&policy_dbs->update_mutex);
	}

	return count;
}
2016-03-22 04:47:51 +03:00
/*
 * sysfs store handler for the ignore_nice_load tunable.
 *
 * Any non-zero input is treated as 1.  Nothing is recomputed when the value
 * does not change.
 * Returns @count on success or -EINVAL if no number could be parsed.
 */
static ssize_t store_ignore_nice_load(struct gov_attr_set *attr_set,
				      const char *buf, size_t count)
{
	struct dbs_data *dbs_data = to_dbs_data(attr_set);
	unsigned int value, flag;

	if (sscanf(buf, " %u ", &value) != 1)
		return -EINVAL;

	/* Clamp to a 0/1 flag. */
	flag = value ? 1 : 0;

	if (flag != dbs_data->ignore_nice_load) {
		dbs_data->ignore_nice_load = flag;

		/* prev_cpu_idle has to be re-evaluated under the new setting. */
		gov_update_cpu_data(dbs_data);
	}

	return count;
}
2016-03-22 04:47:51 +03:00
/*
 * sysfs store handler for the powersave_bias tunable.
 *
 * powersave_bias is expressed in units of 0.1%, so input is capped at 1000
 * (100%).  A value of 0 leaves the bias disabled.  After the update every
 * policy attached to @attr_set has its powersave-bias state re-initialised.
 * Returns @count on success or -EINVAL if no number could be parsed.
 */
static ssize_t store_powersave_bias(struct gov_attr_set *attr_set,
				    const char *buf, size_t count)
{
	struct dbs_data *dbs_data = to_dbs_data(attr_set);
	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
	struct policy_dbs_info *policy_dbs;
	unsigned int value;

	if (sscanf(buf, " %u ", &value) != 1)
		return -EINVAL;

	od_tuners->powersave_bias = value > 1000 ? 1000 : value;

	list_for_each_entry(policy_dbs, &attr_set->policy_list, list)
		ondemand_powersave_bias_init(policy_dbs->policy);

	return count;
}
cpufreq: governor: New sysfs show/store callbacks for governor tunables
The ondemand and conservative governors use the global-attr or freq-attr
structures to represent sysfs attributes corresponding to their tunables
(which of them is actually used depends on whether or not different
policy objects can use the same governor with different tunables at the
same time and, consequently, on where those attributes are located in
sysfs).
Unfortunately, in the freq-attr case, the standard cpufreq show/store
sysfs attribute callbacks are applied to the governor tunable attributes
and they always acquire the policy->rwsem lock before carrying out the
operation. That may lead to an ABBA deadlock if governor tunable
attributes are removed under policy->rwsem while one of them is being
accessed concurrently (if sysfs attributes removal wins the race, it
will wait for the access to complete with policy->rwsem held while the
attribute callback will block on policy->rwsem indefinitely).
We attempted to address this issue by dropping policy->rwsem around
governor tunable attributes removal (that is, around invocations of the
->governor callback with the event arg equal to CPUFREQ_GOV_POLICY_EXIT)
in cpufreq_set_policy(), but that opened up race conditions that had not
been possible with policy->rwsem held all the time. Therefore
policy->rwsem cannot be dropped in cpufreq_set_policy() at any point,
but the deadlock situation described above must be avoided too.
To that end, use the observation that in principle governor tunables may
be represented by the same data type regardless of whether the governor
is system-wide or per-policy and introduce a new structure, struct
governor_attr, for representing them and new corresponding macros for
creating show/store sysfs callbacks for them. Also make their parent
kobject use a new kobject type whose default show/store callbacks are
not related to the standard core cpufreq ones in any way (and they don't
acquire policy->rwsem in particular).
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Tested-by: Juri Lelli <juri.lelli@arm.com>
Tested-by: Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com>
[ rjw: Subject & changelog + rebase ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-02-09 06:31:33 +03:00
/*
 * sysfs plumbing for the tunables.  The "common" ones are fields of
 * struct dbs_data; powersave_bias lives in the ondemand-specific
 * struct od_dbs_tuners.
 * NOTE(review): the macros presumably generate the show_*() callbacks and
 * the governor_attr objects referenced by od_attrs[] - confirm against
 * their definitions, which are not in this file.
 */
gov_show_one_common(sampling_rate);
gov_show_one_common(up_threshold);
gov_show_one_common(sampling_down_factor);
gov_show_one_common(ignore_nice_load);
gov_show_one_common(io_is_busy);
gov_show_one(od, powersave_bias);

gov_attr_rw(sampling_rate);
gov_attr_rw(io_is_busy);
gov_attr_rw(up_threshold);
gov_attr_rw(sampling_down_factor);
gov_attr_rw(ignore_nice_load);
gov_attr_rw(powersave_bias);
2021-12-28 16:19:12 +03:00
static struct attribute * od_attrs [ ] = {
cpufreq: governor: New sysfs show/store callbacks for governor tunables
The ondemand and conservative governors use the global-attr or freq-attr
structures to represent sysfs attributes corresponding to their tunables
(which of them is actually used depends on whether or not different
policy objects can use the same governor with different tunables at the
same time and, consequently, on where those attributes are located in
sysfs).
Unfortunately, in the freq-attr case, the standard cpufreq show/store
sysfs attribute callbacks are applied to the governor tunable attributes
and they always acquire the policy->rwsem lock before carrying out the
operation. That may lead to an ABBA deadlock if governor tunable
attributes are removed under policy->rwsem while one of them is being
accessed concurrently (if sysfs attributes removal wins the race, it
will wait for the access to complete with policy->rwsem held while the
attribute callback will block on policy->rwsem indefinitely).
We attempted to address this issue by dropping policy->rwsem around
governor tunable attributes removal (that is, around invocations of the
->governor callback with the event arg equal to CPUFREQ_GOV_POLICY_EXIT)
in cpufreq_set_policy(), but that opened up race conditions that had not
been possible with policy->rwsem held all the time. Therefore
policy->rwsem cannot be dropped in cpufreq_set_policy() at any point,
but the deadlock situation described above must be avoided too.
To that end, use the observation that in principle governor tunables may
be represented by the same data type regardless of whether the governor
is system-wide or per-policy and introduce a new structure, struct
governor_attr, for representing them and new corresponding macros for
creating show/store sysfs callbacks for them. Also make their parent
kobject use a new kobject type whose default show/store callbacks are
not related to the standard core cpufreq ones in any way (and they don't
acquire policy->rwsem in particular).
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Tested-by: Juri Lelli <juri.lelli@arm.com>
Tested-by: Shilpasri G Bhat <shilpa.bhat@linux.vnet.ibm.com>
[ rjw: Subject & changelog + rebase ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-02-09 06:31:33 +03:00
& sampling_rate . attr ,
& up_threshold . attr ,
& sampling_down_factor . attr ,
& ignore_nice_load . attr ,
& powersave_bias . attr ,
& io_is_busy . attr ,
2005-04-17 02:20:36 +04:00
NULL
} ;
2021-12-28 16:19:12 +03:00
ATTRIBUTE_GROUPS ( od ) ;
2005-04-17 02:20:36 +04:00
/************************** sysfs end ************************/
2016-02-18 20:40:14 +03:00
static struct policy_dbs_info * od_alloc ( void )
{
struct od_policy_dbs_info * dbs_info ;
dbs_info = kzalloc ( sizeof ( * dbs_info ) , GFP_KERNEL ) ;
return dbs_info ? & dbs_info - > policy_dbs : NULL ;
}
/* Release the od_policy_dbs_info that contains @policy_dbs (see od_alloc()). */
static void od_free(struct policy_dbs_info *policy_dbs)
{
	struct od_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs);

	kfree(dbs_info);
}
2016-05-18 23:59:49 +03:00
static int od_init ( struct dbs_data * dbs_data )
2013-03-27 19:58:58 +04:00
{
struct od_dbs_tuners * tuners ;
u64 idle_time ;
int cpu ;
2013-08-06 21:23:06 +04:00
tuners = kzalloc ( sizeof ( * tuners ) , GFP_KERNEL ) ;
2016-05-18 15:25:26 +03:00
if ( ! tuners )
2013-03-27 19:58:58 +04:00
return - ENOMEM ;
cpu = get_cpu ( ) ;
idle_time = get_cpu_idle_time_us ( cpu , NULL ) ;
put_cpu ( ) ;
if ( idle_time ! = - 1ULL ) {
/* Idle micro accounting is supported. Use finer thresholds */
2016-02-09 06:31:32 +03:00
dbs_data - > up_threshold = MICRO_FREQUENCY_UP_THRESHOLD ;
2013-03-27 19:58:58 +04:00
} else {
2016-02-09 06:31:32 +03:00
dbs_data - > up_threshold = DEF_FREQUENCY_UP_THRESHOLD ;
2013-03-27 19:58:58 +04:00
}
2016-02-09 06:31:32 +03:00
dbs_data - > sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR ;
dbs_data - > ignore_nice_load = 0 ;
2013-06-26 00:42:37 +04:00
tuners - > powersave_bias = default_powersave_bias ;
2016-02-18 04:20:13 +03:00
dbs_data - > io_is_busy = should_io_be_busy ( ) ;
2013-03-27 19:58:58 +04:00
dbs_data - > tuners = tuners ;
return 0 ;
}
2016-05-18 23:59:49 +03:00
static void od_exit ( struct dbs_data * dbs_data )
2013-03-27 19:58:58 +04:00
{
kfree ( dbs_data - > tuners ) ;
}
2016-02-18 04:21:21 +03:00
static void od_start ( struct cpufreq_policy * policy )
{
2016-02-18 20:40:14 +03:00
struct od_policy_dbs_info * dbs_info = to_dbs_info ( policy - > governor_data ) ;
2016-02-18 04:21:21 +03:00
dbs_info - > sample_type = OD_NORMAL_SAMPLE ;
2016-02-18 04:28:24 +03:00
ondemand_powersave_bias_init ( policy ) ;
2016-02-18 04:21:21 +03:00
}
2012-10-26 02:47:42 +04:00
static struct od_ops od_ops = {
2013-04-02 18:56:56 +04:00
. powersave_bias_target = generic_powersave_bias_target ,
2012-10-26 02:47:42 +04:00
} ;
2006-06-29 00:51:19 +04:00
2016-02-07 18:05:07 +03:00
static struct dbs_governor od_dbs_gov = {
2016-06-03 00:24:15 +03:00
. gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER ( " ondemand " ) ,
2021-12-28 16:19:12 +03:00
. kobj_type = { . default_groups = od_groups } ,
2016-11-08 08:36:33 +03:00
. gov_dbs_update = od_dbs_update ,
2016-02-18 20:40:14 +03:00
. alloc = od_alloc ,
. free = od_free ,
2013-03-27 19:58:58 +04:00
. init = od_init ,
. exit = od_exit ,
2016-02-18 04:21:21 +03:00
. start = od_start ,
2012-10-26 02:47:42 +04:00
} ;
2005-04-17 02:20:36 +04:00
2020-06-29 11:24:59 +03:00
/* Shorthand for the cpufreq_governor embedded in od_dbs_gov. */
# define CPU_FREQ_GOV_ONDEMAND (od_dbs_gov.gov)
2016-02-05 05:16:08 +03:00
2013-04-02 18:56:56 +04:00
static void od_set_powersave_bias ( unsigned int powersave_bias )
{
unsigned int cpu ;
cpumask_t done ;
2013-06-26 00:42:37 +04:00
default_powersave_bias = powersave_bias ;
2013-04-02 18:56:56 +04:00
cpumask_clear ( & done ) ;
2021-08-03 17:16:11 +03:00
cpus_read_lock ( ) ;
2013-04-02 18:56:56 +04:00
for_each_online_cpu ( cpu ) {
2016-02-21 02:51:27 +03:00
struct cpufreq_policy * policy ;
2016-02-10 19:07:44 +03:00
struct policy_dbs_info * policy_dbs ;
2016-02-21 02:51:27 +03:00
struct dbs_data * dbs_data ;
struct od_dbs_tuners * od_tuners ;
2015-07-18 09:00:59 +03:00
2013-04-02 18:56:56 +04:00
if ( cpumask_test_cpu ( cpu , & done ) )
continue ;
2016-02-21 02:51:27 +03:00
policy = cpufreq_cpu_get_raw ( cpu ) ;
2020-06-29 11:24:59 +03:00
if ( ! policy | | policy - > governor ! = & CPU_FREQ_GOV_ONDEMAND )
2016-02-21 02:51:27 +03:00
continue ;
policy_dbs = policy - > governor_data ;
2016-02-10 19:07:44 +03:00
if ( ! policy_dbs )
2013-06-26 00:42:37 +04:00
continue ;
2013-04-02 18:56:56 +04:00
cpumask_or ( & done , & done , policy - > cpus ) ;
2013-06-26 00:42:37 +04:00
2016-02-07 18:24:26 +03:00
dbs_data = policy_dbs - > dbs_data ;
2013-06-26 00:42:37 +04:00
od_tuners = dbs_data - > tuners ;
od_tuners - > powersave_bias = default_powersave_bias ;
2013-04-02 18:56:56 +04:00
}
2021-08-03 17:16:11 +03:00
cpus_read_unlock ( ) ;
2013-04-02 18:56:56 +04:00
}
/*
 * Install @f as the powersave-bias target handler and apply
 * @powersave_bias as the new bias for ondemand-governed policies.
 */
void od_register_powersave_bias_handler(
		unsigned int (*f)(struct cpufreq_policy *, unsigned int,
				  unsigned int),
		unsigned int powersave_bias)
{
	od_ops.powersave_bias_target = f;
	od_set_powersave_bias(powersave_bias);
}
EXPORT_SYMBOL_GPL(od_register_powersave_bias_handler);
/* Restore the generic powersave-bias handler and reset the bias to 0. */
void od_unregister_powersave_bias_handler(void)
{
	od_ops.powersave_bias_target = generic_powersave_bias_target;
	od_set_powersave_bias(0);
}
EXPORT_SYMBOL_GPL(od_unregister_powersave_bias_handler);
2006-06-29 00:52:18 +04:00
MODULE_AUTHOR ( " Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> " ) ;
MODULE_AUTHOR ( " Alexey Starikovskiy <alexey.y.starikovskiy@intel.com> " ) ;
MODULE_DESCRIPTION ( " 'cpufreq_ondemand' - A dynamic cpufreq governor for "
2009-01-18 09:43:44 +03:00
" Low Latency Frequency Transition capable processors " ) ;
2006-06-29 00:52:18 +04:00
MODULE_LICENSE ( " GPL " ) ;
2005-04-17 02:20:36 +04:00
2008-01-18 02:21:08 +03:00
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
/* Built only when ondemand is configured as the default governor. */
struct cpufreq_governor *cpufreq_default_governor(void)
{
	struct cpufreq_governor *gov = &CPU_FREQ_GOV_ONDEMAND;

	return gov;
}
#endif
2020-06-29 11:24:59 +03:00
/*
 * NOTE(review): presumably these expand to the init/exit hooks that register
 * and unregister the ondemand governor - confirm against the macro definitions.
 */
cpufreq_governor_init ( CPU_FREQ_GOV_ONDEMAND ) ;
cpufreq_governor_exit ( CPU_FREQ_GOV_ONDEMAND ) ;